Whamcloud - gitweb
LU-17010 lfsck: don't create trans in dryrun mode
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2017, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lu_object.h>
40 #include <dt_object.h>
41 #include <lustre_fid.h>
42 #include <lustre_lib.h>
43 #include <lustre_net.h>
44 #include <md_object.h>
45 #include <obd_class.h>
46
47 #include "lfsck_internal.h"
48
49 #define LFSCK_LAYOUT_MAGIC_V1           0xB173AE14
50 #define LFSCK_LAYOUT_MAGIC_V2           0xB1734D76
51 #define LFSCK_LAYOUT_MAGIC_V3           0xB17371B9
52 #define LFSCK_LAYOUT_MAGIC_V4           0xB1732FED
53
54 #define LFSCK_LAYOUT_MAGIC              LFSCK_LAYOUT_MAGIC_V4
55
/*
 * One record per sequence handled by the layout LFSCK slave.
 *
 * NOTE(review): the code that fills and consumes these fields is outside
 * this part of the file; the per-field notes below are inferred from the
 * field names and types and should be confirmed against the users.
 */
struct lfsck_layout_seq {
	/* link into lfsck_layout_slave_data::llsd_seq_list. */
	struct list_head	 lls_list;
	/* presumably the FID sequence this record describes. */
	__u64			 lls_seq;
	/* presumably the last-ID value currently recorded for the sequence. */
	__u64			 lls_lastid;
	/* presumably the largest ID seen so far for the sequence. */
	__u64			 lls_lastid_known;
	/* presumably the object that stores the last-ID value. */
	struct dt_object	*lls_lastid_obj;
	/* presumably set when lls_lastid needs to be written back. */
	unsigned int		 lls_dirty:1;
};
64
/*
 * Describes one master that takes part in the layout verification, as seen
 * from the slave.  Instances are reference counted: created with one
 * reference by lfsck_layout_llst_add() and freed by the final
 * lfsck_layout_llst_put().
 */
struct lfsck_layout_slave_target {
	/* link into lfsck_layout_slave_data::llsd_master_list. */
	struct list_head	llst_list;
	/* The position for next record in the rbtree for iteration. */
	struct lu_fid		llst_fid;
	/* Dummy hash for iteration against the rbtree. */
	__u64			llst_hash;
	/* Starts at 0 when the target is added; presumably compared with
	 * lfsck_layout_slave_data::llsd_touch_gen - confirm with the users. */
	__u64			llst_gen;
	/* Reference count; the structure is freed when it drops to zero. */
	atomic_t		llst_ref;
	/* Index of the master; the lookup key on llsd_master_list. */
	__u32			llst_index;
	/* How many times we have failed to get the master status. */
	int			llst_failures;
};
78
/* Private data of the layout LFSCK component when it runs as slave. */
struct lfsck_layout_slave_data {
	/* list for lfsck_layout_seq */
	struct list_head	 llsd_seq_list;

	/* list for the masters involve layout verification. */
	struct list_head	 llsd_master_list;
	/* Taken around every walk or change of llsd_master_list in the
	 * lfsck_layout_llst_*() helpers. */
	spinlock_t		 llsd_lock;
	/* NOTE(review): users are outside this part of the file; presumably
	 * a generation counter matched against llst_gen - confirm. */
	__u64			 llsd_touch_gen;
	/* In-RAM object standing for the rbtree, set by lfsck_rbtree_setup()
	 * and released by lfsck_rbtree_cleanup(). */
	struct dt_object	*llsd_rb_obj;
	/* Root of the tree of struct lfsck_rbtree_node. */
	struct rb_root		 llsd_rb_root;
	/* Held for read while the rbtree is looked up and for write while a
	 * node is inserted or llsd_rbtree_valid is cleared. */
	struct rw_semaphore	 llsd_rb_rwsem;
	/* Set by lfsck_rbtree_setup(), cleared by lfsck_rbtree_cleanup();
	 * the rbtree must not be used once it is zero. */
	unsigned int		 llsd_rbtree_valid:1;
};
92
/*
 * Argument bundle for an asynchronous request issued by the slave.
 *
 * NOTE(review): the code using this structure is outside this part of the
 * file; the per-field notes are inferred from the types and need confirming.
 */
struct lfsck_layout_slave_async_args {
	/* presumably the export the request is sent through. */
	struct obd_export		 *llsaa_exp;
	/* presumably the layout LFSCK component that issued the request. */
	struct lfsck_component		 *llsaa_com;
	/* presumably the master target the request is about. */
	struct lfsck_layout_slave_target *llsaa_llst;
};
98
/*
 * Tell whether @border has none of the bits of (@size - 1) set.
 *
 * This equals "border is a multiple of size" only when @size is a power of
 * two; for any other @size it is just the mask test spelled out below.
 */
static inline bool lfsck_comp_extent_aligned(__u64 border, __u32 size)
{
	__u32 low_bits = size - 1;

	return !(border & low_bits);
}
103
104 static inline void
105 lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst)
106 {
107         if (atomic_dec_and_test(&llst->llst_ref)) {
108                 LASSERT(list_empty(&llst->llst_list));
109
110                 OBD_FREE_PTR(llst);
111         }
112 }
113
114 static inline int
115 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
116 {
117         struct lfsck_layout_slave_target *llst;
118         struct lfsck_layout_slave_target *tmp;
119         int                               rc   = 0;
120
121         OBD_ALLOC_PTR(llst);
122         if (llst == NULL)
123                 return -ENOMEM;
124
125         INIT_LIST_HEAD(&llst->llst_list);
126         llst->llst_gen = 0;
127         llst->llst_index = index;
128         atomic_set(&llst->llst_ref, 1);
129
130         spin_lock(&llsd->llsd_lock);
131         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
132                 if (tmp->llst_index == index) {
133                         rc = -EALREADY;
134                         break;
135                 }
136         }
137         if (rc == 0)
138                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
139         spin_unlock(&llsd->llsd_lock);
140
141         if (rc != 0)
142                 OBD_FREE_PTR(llst);
143
144         return rc;
145 }
146
147 static inline void
148 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
149                       struct lfsck_layout_slave_target *llst)
150 {
151         bool del = false;
152
153         spin_lock(&llsd->llsd_lock);
154         if (!list_empty(&llst->llst_list)) {
155                 list_del_init(&llst->llst_list);
156                 del = true;
157         }
158         spin_unlock(&llsd->llsd_lock);
159
160         if (del)
161                 lfsck_layout_llst_put(llst);
162 }
163
164 static inline struct lfsck_layout_slave_target *
165 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
166                                __u32 index, bool unlink)
167 {
168         struct lfsck_layout_slave_target *llst;
169
170         spin_lock(&llsd->llsd_lock);
171         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
172                 if (llst->llst_index == index) {
173                         if (unlink)
174                                 list_del_init(&llst->llst_list);
175                         else
176                                 atomic_inc(&llst->llst_ref);
177                         spin_unlock(&llsd->llsd_lock);
178
179                         return llst;
180                 }
181         }
182         spin_unlock(&llsd->llsd_lock);
183
184         return NULL;
185 }
186
187 static struct lfsck_layout_req *
188 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
189                                 struct dt_object *child, __u32 comp_id,
190                                 __u32 ost_idx, __u32 lov_idx)
191 {
192         struct lfsck_layout_req *llr;
193
194         OBD_ALLOC_PTR(llr);
195         if (llr == NULL)
196                 return ERR_PTR(-ENOMEM);
197
198         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
199         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
200         llr->llr_child = child;
201         llr->llr_comp_id = comp_id;
202         llr->llr_ost_idx = ost_idx;
203         llr->llr_lov_idx = lov_idx;
204
205         return llr;
206 }
207
208 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
209                                             struct lfsck_assistant_req *lar)
210 {
211         struct lfsck_layout_req *llr =
212                 container_of(lar, struct lfsck_layout_req, llr_lar);
213
214         lfsck_object_put(env, llr->llr_child);
215         lfsck_assistant_object_put(env, lar->lar_parent);
216         OBD_FREE_PTR(llr);
217 }
218
219 static int
220 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
221                                                struct ptlrpc_request *req,
222                                                void *args, int rc)
223 {
224         if (rc == 0) {
225                 struct lfsck_async_interpret_args *laia = args;
226                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
227
228                 ltd->ltd_synced_failures = 1;
229                 atomic_dec(laia->laia_count);
230         }
231
232         return 0;
233 }
234
/**
 * Notify remote LFSCK instances about former failures.
 *
 * The local LFSCK instance has recorded which OSTs have ever failed to respond
 * to some LFSCK verification requests (maybe because of network issues or
 * trouble on the OST itself). During the gap, the OST may have missed some
 * OST-object verification, so it cannot know whether the related OST-objects
 * are referenced by MDT-objects or not. In the second-stage scanning these
 * OST-objects would then be regarded as orphans, and if such an OST-object
 * contains a bad parent FID for back reference, it would misguide the LFSCK
 * into making a wrong fix for the fake orphan.
 *
 * To avoid the above trouble, when layout LFSCK finishes the first-stage
 * scanning, it scans the bitmap of the ever-failed OSTs and notifies them that
 * they have missed some OST-object verification and should skip the handling
 * of orphan OST-objects on all MDTs that are in the layout LFSCK.
 *
 * Nothing is done unless LAD_INCOMPLETE is set in the assistant flags. If any
 * notification cannot be sent or is not confirmed, LF_INCOMPLETE is set in
 * the in-RAM layout flags. On every path past the LAD_INCOMPLETE check,
 * \a lr->lr_flags2 ends up equal to those flags.
 *
 * \param[in] env	pointer to the thread context
 * \param[in] com	pointer to the lfsck component
 * \param[in] lr	pointer to the lfsck request
 */
static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
						 struct lfsck_component *com,
						 struct lfsck_request *lr)
{
	struct lfsck_async_interpret_args *laia  =
				&lfsck_env_info(env)->lti_laia2;
	struct lfsck_assistant_data       *lad   = com->lc_data;
	struct lfsck_layout               *lo    = com->lc_file_ram;
	struct lfsck_instance             *lfsck = com->lc_lfsck;
	struct lfsck_tgt_descs            *ltds  = &lfsck->li_ost_descs;
	struct lfsck_tgt_desc             *ltd;
	struct ptlrpc_request_set         *set;
	atomic_t                           count;
	__u32                              idx;
	int                                rc    = 0;
	ENTRY;

	if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
		RETURN_EXIT;

	/* If the MDT has ever failed to verify some OST-objects,
	 * then sync failures with them firstly. */
	lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;

	/* 'count' is the number of notifications sent but not yet confirmed:
	 * incremented below for each request that is queued, decremented by
	 * the interpret callback for each successful reply. */
	atomic_set(&count, 0);
	memset(laia, 0, sizeof(*laia));
	laia->laia_count = &count;
	set = ptlrpc_prep_set();
	if (set == NULL)
		GOTO(out, rc = -ENOMEM);

	down_read(&ltds->ltd_rw_sem);
	for_each_set_bit(idx, lad->lad_bitmap, lad->lad_bitmap_count) {
		ltd = lfsck_ltd2tgt(ltds, idx);
		/* No target descriptor for this bit: nothing to notify. */
		if (unlikely(!ltd))
			continue;

		laia->laia_ltd = ltd;
		rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
				lfsck_layout_assistant_sync_failures_interpret,
				laia, LFSCK_NOTIFY);
		if (rc != 0) {
			CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to "
			       "notify target %x for %s phase1 done: "
			       "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
			       ltd->ltd_index, lad->lad_name, rc);

			break;
		}

		atomic_inc(&count);
	}
	up_read(&ltds->ltd_rw_sem);

	/* Only wait if every send succeeded and something was sent. */
	if (rc == 0 && atomic_read(&count) > 0)
		rc = ptlrpc_set_wait(env, set);

	ptlrpc_set_destroy(set);

	/* A non-zero count after the wait means at least one target did not
	 * confirm the notification. */
	if (rc == 0 && atomic_read(&count) > 0)
		rc = -EINVAL;

	GOTO(out, rc);

out:
	if (rc != 0)
		/* If failed to sync failures with the OSTs, then have to
		 * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
		 * subsequent orphan OST-object handling. */
		lo->ll_flags |= LF_INCOMPLETE;

	lr->lr_flags2 = lo->ll_flags;
}
329
330 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
331                                            struct lov_mds_md_v1 *lmm,
332                                            __u64 start, __u64 end,
333                                            __u32 comp_id,
334                                            bool ext, bool *dom)
335 {
336         __u32 magic;
337         __u32 pattern;
338         __u32 size;
339
340         magic = le32_to_cpu(lmm->lmm_magic);
341         /* If magic crashed, keep it there. Sometime later, during OST-object
342          * orphan handling, if some OST-object(s) back-point to it, it can be
343          * verified and repaired. */
344         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
345                 int rc;
346
347                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
348                         rc = -EOPNOTSUPP;
349                 else
350                         rc = -EINVAL;
351
352                 CDEBUG(D_LFSCK, "%s LOV EA magic 0x%X for the file "DFID"\n",
353                        rc == -EINVAL ? "Unknown" : "Unsupported",
354                        magic, PFID(lfsck_dto2fid(obj)));
355
356                 return rc;
357         }
358
359         pattern = le32_to_cpu(lmm->lmm_pattern);
360         *dom = !!(lov_pattern(pattern) == LOV_PATTERN_MDT);
361
362         /* XXX: DoM file verification will be supportted via LU-11081. */
363         if (lov_pattern(pattern) == LOV_PATTERN_MDT) {
364 #if 0
365                 if (start != 0) {
366                         CDEBUG(D_LFSCK, "The DoM entry for "DFID" is not "
367                                "the first component in the mirror %x/%llu\n",
368                                PFID(lfsck_dto2fid(obj)), comp_id, start);
369
370                         return -EINVAL;
371                 }
372 #endif
373         } else if (!lov_pattern_supported_normal_comp(lov_pattern(pattern))) {
374                 CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file "
375                        DFID" in the component %x\n",
376                        pattern, PFID(lfsck_dto2fid(obj)), comp_id);
377
378                 return -EOPNOTSUPP;
379         }
380
381         size = le32_to_cpu(lmm->lmm_stripe_size);
382         if (!ext && end != LUSTRE_EOF && start != end &&
383             !lfsck_comp_extent_aligned(end, size)){
384                 CDEBUG(D_LFSCK, "not aligned border in PFL extent range "
385                        "[%llu - %llu) stripesize %u for the file "DFID
386                        " at idx %d\n", start, end, size,
387                        PFID(lfsck_dto2fid(obj)), comp_id);
388
389                 return -EINVAL;
390         }
391
392         return 0;
393 }
394
395 static int lfsck_layout_verify_header_foreign(struct dt_object *obj,
396                                               struct lov_foreign_md *lfm,
397                                               size_t len)
398 {
399         /* magic has been verified already */
400         __u32 value_len = le32_to_cpu(lfm->lfm_length);
401         /* type and flags are not checked for instance */
402
403         CDEBUG(D_INFO, "foreign LOV EA, magic %x, len %u, type %x, flags %x, for file "DFID"\n",
404                le32_to_cpu(lfm->lfm_magic), value_len,
405                le32_to_cpu(lfm->lfm_type), le32_to_cpu(lfm->lfm_flags),
406                PFID(lfsck_dto2fid(obj)));
407
408         if (len != value_len + offsetof(typeof(*lfm), lfm_value))
409                 CDEBUG(D_LFSCK, "foreign LOV EA internal size %u does not match EA full size %zu for file "DFID"\n",
410                        value_len, len, PFID(lfsck_dto2fid(obj)));
411
412         /* nothing to repair */
413         return -ENODATA;
414 }
415
416 static int lfsck_layout_verify_header(struct dt_object *obj,
417                                       struct lov_mds_md_v1 *lmm, size_t len)
418 {
419         bool p_dom = false;
420         int rc = 0;
421
422         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1 ||
423             le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_SEL) {
424                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
425                 bool p_zero = false;
426                 int i;
427                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
428
429                 if (unlikely(count == 0)) {
430                         CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid "
431                                "components count 0\n",
432                                PFID(lfsck_dto2fid(obj)));
433
434                         return -EINVAL;
435                 }
436
437                 for (i = 0; i < count && !rc; i++) {
438                         struct lov_comp_md_entry_v1 *lcme =
439                                                 &lcm->lcm_entries[i];
440                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
441                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
442                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
443                         bool ext, inited, zero;
444                         __u32 flags;
445
446                         if (unlikely(comp_id == LCME_ID_INVAL ||
447                                      comp_id > LCME_ID_MAX)) {
448                                 CDEBUG(D_LFSCK, "found invalid PFL ID %u "
449                                        "for the file "DFID" at idx %d\n",
450                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
451
452                                 return -EINVAL;
453                         }
454
455                         flags = le32_to_cpu(lcme->lcme_flags);
456                         ext = flags & LCME_FL_EXTENSION;
457                         inited = flags & LCME_FL_INIT;
458                         zero = !!(start == end);
459
460                         if ((i == 0) && zero) {
461                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu "
462                                        "- %llu) for "DFID"\n", i, start, end,
463                                        PFID(lfsck_dto2fid(obj)));
464                                 return -EINVAL;
465                         }
466
467                         if ((zero && (inited || (i + 1 == count))) ||
468                             (start > end)) {
469                                 CDEBUG(D_LFSCK, "invalid PFL comp %d/%d: "
470                                        "[%llu, %llu) for "DFID", %sinited\n",
471                                        i, count, start, end,
472                                        PFID(lfsck_dto2fid(obj)),
473                                        inited ? "" : "NOT ");
474                                 return -EINVAL;
475                         }
476
477                         if (!ext && p_zero) {
478                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu, "
479                                        "%llu) for "DFID": NOT extension "
480                                        "after 0-length component\n", i,
481                                        start, end, PFID(lfsck_dto2fid(obj)));
482                                 return -EINVAL;
483                         }
484
485                         if (ext && (inited || p_dom || zero)) {
486                                 CDEBUG(D_LFSCK, "invalid PFL comp %d: [%llu, "
487                                        "%llu) for "DFID": %s\n", i,
488                                        start, end, PFID(lfsck_dto2fid(obj)),
489                                        inited ? "inited extension" :
490                                        p_dom ? "extension follows DOM" :
491                                        zero ? "zero length extension" : "");
492                                 return -EINVAL;
493                         }
494
495                         rc = lfsck_layout_verify_header_v1v3(obj,
496                                         (struct lov_mds_md_v1 *)((char *)lmm +
497                                         le32_to_cpu(lcme->lcme_offset)), start,
498                                         end, comp_id, ext, &p_dom);
499
500                         p_zero = zero;
501                 }
502         } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_FOREIGN) {
503                 rc = lfsck_layout_verify_header_foreign(obj,
504                                                 (struct lov_foreign_md *)lmm,
505                                                 len);
506         } else {
507                 rc = lfsck_layout_verify_header_v1v3(obj, lmm, 0, LUSTRE_EOF,
508                                                      0, false, &p_dom);
509         }
510
511         return rc;
512 }
513
514 static int lfsck_layout_get_lovea(const struct lu_env *env,
515                                   struct dt_object *obj, struct lu_buf *buf)
516 {
517         int rc;
518         int rc1;
519
520 again:
521         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
522         if (rc == -ERANGE) {
523                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
524                 if (rc <= 0)
525                         return !rc ? -ENODATA : rc;
526
527                 lu_buf_realloc(buf, rc);
528                 if (buf->lb_buf == NULL)
529                         return -ENOMEM;
530
531                 goto again;
532         }
533
534         if (rc <= 0)
535                 return !rc ? -ENODATA : rc;
536
537         if (unlikely(buf->lb_buf == NULL)) {
538                 lu_buf_alloc(buf, rc);
539                 if (buf->lb_buf == NULL)
540                         return -ENOMEM;
541
542                 goto again;
543         }
544
545         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf, rc);
546
547         return rc1 ? rc1 : rc;
548 }
549
/* Size in bytes of each bitmap of a struct lfsck_rbtree_node: one page. */
#define LFSCK_RBTREE_BITMAP_SIZE	PAGE_SIZE
/* Number of bits in such a bitmap, i.e. how many consecutive OIDs one
 * rbtree node covers. */
#define LFSCK_RBTREE_BITMAP_WIDTH	(LFSCK_RBTREE_BITMAP_SIZE << 3)
/* Mask extracting the bit index of an OID inside its node's bitmap. */
#define LFSCK_RBTREE_BITMAP_MASK	(LFSCK_RBTREE_BITMAP_WIDTH - 1)
553
/*
 * One node of the OST-object accessing rbtree.  A node covers the
 * LFSCK_RBTREE_BITMAP_WIDTH consecutive OIDs of sequence lrn_seq that start
 * at lrn_first_oid, with one bit per OID in each bitmap.
 */
struct lfsck_rbtree_node {
	/* link into lfsck_layout_slave_data::llsd_rb_root. */
	struct rb_node	 lrn_node;
	/* FID sequence covered by this node. */
	__u64		 lrn_seq;
	/* First OID covered; the OID of the FID the node was created for
	 * with the LFSCK_RBTREE_BITMAP_MASK bits cleared. */
	__u32		 lrn_first_oid;
	/* Number of bits set in lrn_known_bitmap. */
	atomic_t	 lrn_known_count;
	/* Number of bits set in lrn_accessed_bitmap. */
	atomic_t	 lrn_accessed_count;
	/* LFSCK_RBTREE_BITMAP_SIZE bytes; bit set for each known object. */
	void		*lrn_known_bitmap;
	/* LFSCK_RBTREE_BITMAP_SIZE bytes; bit set for each accessed object. */
	void		*lrn_accessed_bitmap;
};
563
564 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
565                                    __u64 seq, __u32 oid)
566 {
567         if (seq < lrn->lrn_seq)
568                 return -1;
569
570         if (seq > lrn->lrn_seq)
571                 return 1;
572
573         if (oid < lrn->lrn_first_oid)
574                 return -1;
575
576         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
577                 return 1;
578
579         return 0;
580 }
581
582 /* The caller should hold llsd->llsd_rb_lock. */
583 static struct lfsck_rbtree_node *
584 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
585                     const struct lu_fid *fid, bool *exact)
586 {
587         struct rb_node           *node  = llsd->llsd_rb_root.rb_node;
588         struct rb_node           *prev  = NULL;
589         struct lfsck_rbtree_node *lrn   = NULL;
590         int                       rc    = 0;
591
592         if (exact != NULL)
593                 *exact = true;
594
595         while (node != NULL) {
596                 prev = node;
597                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
598                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
599                 if (rc < 0)
600                         node = node->rb_left;
601                 else if (rc > 0)
602                         node = node->rb_right;
603                 else
604                         return lrn;
605         }
606
607         if (exact == NULL)
608                 return NULL;
609
610         /* If there is no exactly matched one, then to the next valid one. */
611         *exact = false;
612
613         /* The rbtree is empty. */
614         if (rc == 0)
615                 return NULL;
616
617         if (rc < 0)
618                 return lrn;
619
620         node = rb_next(prev);
621
622         /* The end of the rbtree. */
623         if (node == NULL)
624                 return NULL;
625
626         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
627
628         return lrn;
629 }
630
631 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
632                                                   const struct lu_fid *fid)
633 {
634         struct lfsck_rbtree_node *lrn;
635
636         OBD_ALLOC_PTR(lrn);
637         if (lrn == NULL)
638                 return ERR_PTR(-ENOMEM);
639
640         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
641         if (lrn->lrn_known_bitmap == NULL) {
642                 OBD_FREE_PTR(lrn);
643
644                 return ERR_PTR(-ENOMEM);
645         }
646
647         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
648         if (lrn->lrn_accessed_bitmap == NULL) {
649                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
650                 OBD_FREE_PTR(lrn);
651
652                 return ERR_PTR(-ENOMEM);
653         }
654
655         RB_CLEAR_NODE(&lrn->lrn_node);
656         lrn->lrn_seq = fid_seq(fid);
657         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
658         atomic_set(&lrn->lrn_known_count, 0);
659         atomic_set(&lrn->lrn_accessed_count, 0);
660
661         return lrn;
662 }
663
664 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
665 {
666         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
667         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
668         OBD_FREE_PTR(lrn);
669 }
670
671 /* The caller should hold lock. */
672 static struct lfsck_rbtree_node *
673 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
674                     struct lfsck_rbtree_node *lrn)
675 {
676         struct rb_node           **pos    = &llsd->llsd_rb_root.rb_node;
677         struct rb_node            *parent = NULL;
678         struct lfsck_rbtree_node  *tmp;
679         int                        rc;
680
681         while (*pos != NULL) {
682                 parent = *pos;
683                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
684                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
685                 if (rc < 0)
686                         pos = &(*pos)->rb_left;
687                 else if (rc > 0)
688                         pos = &(*pos)->rb_right;
689                 else
690                         return tmp;
691         }
692
693         rb_link_node(&lrn->lrn_node, parent, pos);
694         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
695
696         return lrn;
697 }
698
699 static const struct dt_index_operations lfsck_orphan_index_ops;
700
701 static int lfsck_rbtree_setup(const struct lu_env *env,
702                               struct lfsck_component *com)
703 {
704         struct lu_fid                   *fid    = &lfsck_env_info(env)->lti_fid;
705         struct lfsck_instance           *lfsck  = com->lc_lfsck;
706         struct dt_device                *dev    = lfsck->li_bottom;
707         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
708         struct dt_object                *obj;
709
710         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
711         fid->f_oid = lfsck_dev_idx(lfsck);
712         fid->f_ver = 0;
713         obj = dt_locate(env, dev, fid);
714         if (IS_ERR(obj))
715                 RETURN(PTR_ERR(obj));
716
717         /* Generate an in-RAM object to stand for the layout rbtree.
718          * Scanning the layout rbtree will be via the iteration over
719          * the object. In the future, the rbtree may be written onto
720          * disk with the object.
721          *
722          * Mark the object to be as exist. */
723         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
724         obj->do_index_ops = &lfsck_orphan_index_ops;
725         llsd->llsd_rb_obj = obj;
726         llsd->llsd_rbtree_valid = 1;
727         dev->dd_record_fid_accessed = 1;
728
729         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
730                lfsck_lfsck2name(lfsck));
731
732         return 0;
733 }
734
735 static void lfsck_rbtree_cleanup(const struct lu_env *env,
736                                  struct lfsck_component *com)
737 {
738         struct lfsck_instance           *lfsck = com->lc_lfsck;
739         struct lfsck_layout_slave_data  *llsd  = com->lc_data;
740         struct rb_node                  *node  = rb_first(&llsd->llsd_rb_root);
741         struct rb_node                  *next;
742         struct lfsck_rbtree_node        *lrn;
743
744         lfsck->li_bottom->dd_record_fid_accessed = 0;
745         /* Invalid the rbtree, then no others will use it. */
746         down_write(&llsd->llsd_rb_rwsem);
747         llsd->llsd_rbtree_valid = 0;
748         up_write(&llsd->llsd_rb_rwsem);
749
750         while (node != NULL) {
751                 next = rb_next(node);
752                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
753                 rb_erase(node, &llsd->llsd_rb_root);
754                 lfsck_rbtree_free(lrn);
755                 node = next;
756         }
757
758         if (llsd->llsd_rb_obj != NULL) {
759                 lfsck_object_put(env, llsd->llsd_rb_obj);
760                 llsd->llsd_rb_obj = NULL;
761         }
762
763         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
764                lfsck_lfsck2name(lfsck));
765 }
766
/**
 * Record the object \a fid in the OST-object accessing rbtree.
 *
 * The bit for \a fid is always set in the "known" bitmap of the node that
 * covers it, and also in the "accessed" bitmap when \a accessed is true.
 * The covering node is created on demand.  FIDs that are not sane, that are
 * last-ID FIDs, or that are neither IDIF nor normal FIDs are ignored, and
 * nothing is done once the rbtree has been invalidated.
 *
 * If the node cannot be allocated and \a accessed is true, the layout is
 * flagged LF_INCOMPLETE and the whole rbtree is cleaned up; with \a accessed
 * false the failure is dropped silently.
 *
 * \param[in] env	pointer to the thread context
 * \param[in] com	pointer to the lfsck component
 * \param[in] fid	the FID of the object to record
 * \param[in] accessed	also mark the object as accessed
 */
static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
				       struct lfsck_component *com,
				       const struct lu_fid *fid,
				       bool accessed)
{
	struct lfsck_layout_slave_data	*llsd	= com->lc_data;
	struct lfsck_rbtree_node	*lrn;
	/* Tells the unlock path which mode the rwsem is held in:
	 * false = read, true = write. */
	bool				 insert = false;
	int				 idx;
	int				 rc	= 0;
	ENTRY;

	if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
		RETURN_EXIT;

	if (!fid_is_idif(fid) && !fid_is_norm(fid))
		RETURN_EXIT;

	down_read(&llsd->llsd_rb_rwsem);
	if (!llsd->llsd_rbtree_valid)
		GOTO(unlock, rc = 0);

	lrn = lfsck_rbtree_search(llsd, fid, NULL);
	if (lrn == NULL) {
		struct lfsck_rbtree_node *tmp;

		LASSERT(!insert);

		/* No covering node yet: drop the read lock, allocate the
		 * node unlocked, then take the write lock to insert it. */
		up_read(&llsd->llsd_rb_rwsem);
		tmp = lfsck_rbtree_new(env, fid);
		/* No lock is held here, hence the jump past 'unlock'. */
		if (IS_ERR(tmp))
			GOTO(out, rc = PTR_ERR(tmp));

		insert = true;
		down_write(&llsd->llsd_rb_rwsem);
		/* The rbtree may have been invalidated while unlocked. */
		if (!llsd->llsd_rbtree_valid) {
			lfsck_rbtree_free(tmp);
			GOTO(unlock, rc = 0);
		}

		/* If a covering node was inserted by someone else in the
		 * meantime, use that one and discard ours. */
		lrn = lfsck_rbtree_insert(llsd, tmp);
		if (lrn != tmp)
			lfsck_rbtree_free(tmp);
	}

	idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
	/* Any accessed object must be a known object. */
	if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
		atomic_inc(&lrn->lrn_known_count);
	if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
		atomic_inc(&lrn->lrn_accessed_count);

	GOTO(unlock, rc = 0);

unlock:
	if (insert)
		up_write(&llsd->llsd_rb_rwsem);
	else
		up_read(&llsd->llsd_rb_rwsem);
out:
	if (rc != 0 && accessed) {
		struct lfsck_layout *lo = com->lc_file_ram;

		CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing "
		       "bitmap, and will cause incorrect LFSCK OST-object "
		       "handling, so disable it to cancel orphan handling "
		       "for related device. rc = %d\n",
		       lfsck_lfsck2name(com->lc_lfsck), rc);

		lo->ll_flags |= LF_INCOMPLETE;
		lfsck_rbtree_cleanup(env, com);
	}
}
840
841 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
842                                   const struct lfsck_layout_dangling_key *src)
843 {
844         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
845         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
846         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
847 }
848
849 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
850                                   const struct lfsck_layout_dangling_key *src)
851 {
852         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
853         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
854         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
855 }
856
857 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
858                                   const struct lfsck_layout_dangling_key *src)
859 {
860         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
861         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
862         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
863 }
864
865 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
866                                   const struct lfsck_layout_dangling_key *src)
867 {
868         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
869         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
870         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
871 }
872
873 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
874                                    const struct lfsck_layout *src)
875 {
876         int i;
877
878         des->ll_magic = le32_to_cpu(src->ll_magic);
879         des->ll_status = le32_to_cpu(src->ll_status);
880         des->ll_flags = le32_to_cpu(src->ll_flags);
881         des->ll_success_count = le32_to_cpu(src->ll_success_count);
882         des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1);
883         des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2);
884         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
885         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
886         des->ll_time_last_checkpoint =
887                                 le64_to_cpu(src->ll_time_last_checkpoint);
888         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
889         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
890         des->ll_pos_first_inconsistent =
891                         le64_to_cpu(src->ll_pos_first_inconsistent);
892         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
893         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
894         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
895         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
896         for (i = 0; i < LLIT_MAX; i++)
897                 des->ll_objs_repaired[i] =
898                                 le64_to_cpu(src->ll_objs_repaired[i]);
899         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
900         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
901         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
902                        &src->ll_lldk_latest_scanned_phase2);
903 }
904
905 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
906                                    const struct lfsck_layout *src)
907 {
908         int i;
909
910         des->ll_magic = cpu_to_le32(src->ll_magic);
911         des->ll_status = cpu_to_le32(src->ll_status);
912         des->ll_flags = cpu_to_le32(src->ll_flags);
913         des->ll_success_count = cpu_to_le32(src->ll_success_count);
914         des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1);
915         des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2);
916         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
917         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
918         des->ll_time_last_checkpoint =
919                                 cpu_to_le64(src->ll_time_last_checkpoint);
920         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
921         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
922         des->ll_pos_first_inconsistent =
923                         cpu_to_le64(src->ll_pos_first_inconsistent);
924         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
925         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
926         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
927         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
928         for (i = 0; i < LLIT_MAX; i++)
929                 des->ll_objs_repaired[i] =
930                                 cpu_to_le64(src->ll_objs_repaired[i]);
931         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
932         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
933         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
934                        &src->ll_lldk_latest_scanned_phase2);
935 }
936
937 /**
938  * Load the OST bitmap from the lfsck_layout trace file.
939  *
940  * \param[in] env       pointer to the thread context
941  * \param[in] com       pointer to the lfsck component
942  *
943  * \retval              0 for success
944  * \retval              negative error number on failure or data corruption
945  */
946 static int lfsck_layout_load_bitmap(const struct lu_env *env,
947                                     struct lfsck_component *com)
948 {
949         struct dt_object *obj = com->lc_obj;
950         struct lfsck_assistant_data *lad = com->lc_data;
951         struct lfsck_layout *lo = com->lc_file_ram;
952         unsigned long *bitmap = lad->lad_bitmap;
953         loff_t pos = com->lc_file_size;
954         ssize_t size;
955         __u32 nbits;
956         int rc;
957
958         ENTRY;
959         if (com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len > lo->ll_bitmap_size)
960                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len;
961         else
962                 nbits = lo->ll_bitmap_size;
963
964         if (unlikely(nbits < BITS_PER_LONG))
965                 nbits = BITS_PER_LONG;
966
967         if (nbits > lad->lad_bitmap_count) {
968                 u32 new_bits = lad->lad_bitmap_count;
969                 unsigned long *new_bitmap;
970
971                 while (new_bits < nbits)
972                         new_bits <<= 1;
973
974                 new_bitmap = bitmap_zalloc(new_bits, GFP_KERNEL);
975                 if (new_bitmap == NULL)
976                         RETURN(-ENOMEM);
977
978                 lad->lad_bitmap = new_bitmap;
979                 lad->lad_bitmap_count = new_bits;
980                 bitmap_free(bitmap);
981                 bitmap = new_bitmap;
982         }
983
984         if (lo->ll_bitmap_size == 0) {
985                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
986                 bitmap_zero(bitmap, lad->lad_bitmap_count);
987                 RETURN(0);
988         }
989
990         size = (lo->ll_bitmap_size + 7) >> 3;
991         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap, size), &pos);
992         if (rc != size)
993                 RETURN(rc >= 0 ? -EINVAL : rc);
994
995         if (bitmap_empty(bitmap, lad->lad_bitmap_count))
996                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
997         else
998                 set_bit(LAD_INCOMPLETE, &lad->lad_flags);
999
1000         RETURN(0);
1001 }
1002
1003 /**
1004  * Load the layout LFSCK trace file from disk.
1005  *
1006  * The layout LFSCK trace file records the layout LFSCK status information
1007  * and other statistics, such as how many objects have been scanned, and how
1008  * many objects have been repaired, and etc. It also contains the bitmap for
1009  * failed OSTs during the layout LFSCK. All these information will be loaded
1010  * from disk to RAM when the layout LFSCK component setup.
1011  *
1012  * \param[in] env       pointer to the thread context
1013  * \param[in] com       pointer to the lfsck component
1014  *
1015  * \retval              positive number for file data corruption, the caller
1016  *                      should reset the layout LFSCK trace file
1017  * \retval              0 for success
1018  * \retval              negative error number on failure
1019  */
1020 static int lfsck_layout_load(const struct lu_env *env,
1021                              struct lfsck_component *com)
1022 {
1023         struct lfsck_layout             *lo     = com->lc_file_ram;
1024         ssize_t                          size   = com->lc_file_size;
1025         loff_t                           pos    = 0;
1026         int                              rc;
1027
1028         rc = dt_read(env, com->lc_obj,
1029                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
1030         if (rc == 0) {
1031                 return -ENOENT;
1032         } else if (rc < 0) {
1033                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
1034                        lfsck_lfsck2name(com->lc_lfsck), rc);
1035                 return rc;
1036         } else if (rc != size) {
1037                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
1038                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
1039                 return 1;
1040         }
1041
1042         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
1043         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
1044                 CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, "
1045                        "to be reset\n", lfsck_lfsck2name(com->lc_lfsck),
1046                        lo->ll_magic, LFSCK_LAYOUT_MAGIC);
1047                 return 1;
1048         }
1049
1050         return 0;
1051 }
1052
1053 /**
1054  * Store the layout LFSCK trace file on disk.
1055  *
1056  * The layout LFSCK trace file records the layout LFSCK status information
1057  * and other statistics, such as how many objects have been scanned, and how
1058  * many objects have been repaired, and etc. It also contains the bitmap for
1059  * failed OSTs during the layout LFSCK. All these information will be synced
1060  * from RAM to disk periodically.
1061  *
1062  * \param[in] env       pointer to the thread context
1063  * \param[in] com       pointer to the lfsck component
1064  *
1065  * \retval              0 for success
1066  * \retval              negative error number on failure
1067  */
1068 static int lfsck_layout_store(const struct lu_env *env,
1069                               struct lfsck_component *com)
1070 {
1071         struct dt_object *obj = com->lc_obj;
1072         struct lfsck_instance *lfsck = com->lc_lfsck;
1073         struct lfsck_layout *lo_ram = com->lc_file_ram;
1074         struct lfsck_layout *lo = com->lc_file_disk;
1075         struct thandle *th;
1076         struct dt_device *dev = lfsck_obj2dev(obj);
1077         unsigned long *bitmap = NULL;
1078         loff_t pos;
1079         ssize_t size = com->lc_file_size;
1080         __u32 nbits = 0;
1081         int rc;
1082
1083         ENTRY;
1084         if (lfsck->li_master) {
1085                 struct lfsck_assistant_data *lad = com->lc_data;
1086
1087                 bitmap = lad->lad_bitmap;
1088                 nbits = lad->lad_bitmap_count;
1089
1090                 LASSERT(nbits > 0);
1091                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1092         }
1093
1094         lo_ram->ll_bitmap_size = nbits;
1095         lfsck_layout_cpu_to_le(lo, lo_ram);
1096         th = dt_trans_create(env, dev);
1097         if (IS_ERR(th))
1098                 GOTO(log, rc = PTR_ERR(th));
1099
1100         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1101                                      (loff_t)0, th);
1102         if (rc != 0)
1103                 GOTO(out, rc);
1104
1105         if (bitmap != NULL) {
1106                 rc = dt_declare_record_write(env, obj,
1107                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1108                                 (loff_t)size, th);
1109                 if (rc != 0)
1110                         GOTO(out, rc);
1111         }
1112
1113         rc = dt_trans_start_local(env, dev, th);
1114         if (rc != 0)
1115                 GOTO(out, rc);
1116
1117         pos = 0;
1118         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1119         if (rc != 0)
1120                 GOTO(out, rc);
1121
1122         if (bitmap != NULL) {
1123                 pos = size;
1124                 rc = dt_record_write(env, obj,
1125                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1126                                 &pos, th);
1127         }
1128
1129         GOTO(out, rc);
1130
1131 out:
1132         dt_trans_stop(env, dev, th);
1133
1134 log:
1135         if (rc != 0)
1136                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1137                        lfsck_lfsck2name(lfsck), rc);
1138
1139         return rc;
1140 }
1141
1142 static int lfsck_layout_init(const struct lu_env *env,
1143                              struct lfsck_component *com)
1144 {
1145         struct lfsck_layout *lo = com->lc_file_ram;
1146         int rc;
1147
1148         memset(lo, 0, com->lc_file_size);
1149         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1150         lo->ll_status = LS_INIT;
1151         down_write(&com->lc_sem);
1152         rc = lfsck_layout_store(env, com);
1153         if (rc == 0 && com->lc_lfsck->li_master)
1154                 rc = lfsck_load_sub_trace_files(env, com,
1155                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1156         up_write(&com->lc_sem);
1157
1158         return rc;
1159 }
1160
1161 static int fid_is_for_ostobj(const struct lu_env *env,
1162                              struct lfsck_instance *lfsck,
1163                              struct dt_object *obj, const struct lu_fid *fid)
1164 {
1165         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1166         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1167         struct lustre_ost_attrs *loa;
1168         int                      rc;
1169
1170         fld_range_set_any(range);
1171         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1172         if (rc == 0) {
1173                 if (fld_range_is_ost(range))
1174                         return 1;
1175
1176                 return 0;
1177         }
1178
1179         loa = &lfsck_env_info(env)->lti_loa;
1180         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1181                           XATTR_NAME_LMA);
1182         if (rc >= (int)sizeof(struct lustre_mdt_attrs)) {
1183                 lustre_lma_swab(&loa->loa_lma);
1184
1185                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1186         }
1187
1188         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1189
1190         return rc > 0;
1191 }
1192
1193 static struct lfsck_layout_seq *
1194 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1195 {
1196         struct lfsck_layout_seq *lls;
1197
1198         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1199                 if (lls->lls_seq == seq)
1200                         return lls;
1201
1202                 if (lls->lls_seq > seq)
1203                         return NULL;
1204         }
1205
1206         return NULL;
1207 }
1208
1209 static void
1210 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1211                         struct lfsck_layout_seq *lls)
1212 {
1213         struct lfsck_layout_seq *tmp;
1214         struct list_head        *pos = &llsd->llsd_seq_list;
1215
1216         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1217                 if (lls->lls_seq < tmp->lls_seq) {
1218                         pos = &tmp->lls_list;
1219                         break;
1220                 }
1221         }
1222         list_add_tail(&lls->lls_list, pos);
1223 }
1224
1225 static int
1226 lfsck_layout_lastid_create(const struct lu_env *env,
1227                            struct lfsck_instance *lfsck,
1228                            struct dt_object *obj)
1229 {
1230         struct lfsck_thread_info *info   = lfsck_env_info(env);
1231         struct lu_attr           *la     = &info->lti_la;
1232         struct dt_object_format  *dof    = &info->lti_dof;
1233         struct lfsck_bookmark    *bk     = &lfsck->li_bookmark_ram;
1234         struct dt_device         *dt     = lfsck_obj2dev(obj);
1235         struct thandle           *th;
1236         __u64                     lastid = 0;
1237         loff_t                    pos    = 0;
1238         int                       rc;
1239         ENTRY;
1240
1241         if (bk->lb_param & LPF_DRYRUN)
1242                 return 0;
1243
1244         memset(la, 0, sizeof(*la));
1245         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1246         la->la_valid = LA_MODE | LA_UID | LA_GID;
1247         memset(dof, 0, sizeof(*dof));
1248         dof->dof_type = dt_mode_to_dft(S_IFREG);
1249
1250         th = lfsck_trans_create(env, dt, lfsck);
1251         if (IS_ERR(th))
1252                 GOTO(log, rc = PTR_ERR(th));
1253
1254         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1255         if (rc != 0)
1256                 GOTO(stop, rc);
1257
1258         rc = dt_declare_record_write(env, obj,
1259                                      lfsck_buf_get(env, &lastid,
1260                                                    sizeof(lastid)),
1261                                      pos, th);
1262         if (rc != 0)
1263                 GOTO(stop, rc);
1264
1265         rc = dt_trans_start_local(env, dt, th);
1266         if (rc != 0)
1267                 GOTO(stop, rc);
1268
1269         dt_write_lock(env, obj, 0);
1270         if (likely(dt_object_exists(obj) == 0)) {
1271                 rc = dt_create(env, obj, la, NULL, dof, th);
1272                 if (rc == 0)
1273                         rc = dt_record_write(env, obj,
1274                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1275                                 &pos, th);
1276         }
1277         dt_write_unlock(env, obj);
1278
1279         GOTO(stop, rc);
1280
1281 stop:
1282         dt_trans_stop(env, dt, th);
1283
1284 log:
1285         CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for <seq> "
1286                "%#llx: rc = %d\n",
1287                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1288
1289         return rc;
1290 }
1291
1292 static int
1293 lfsck_layout_lastid_reload(const struct lu_env *env,
1294                            struct lfsck_component *com,
1295                            struct lfsck_layout_seq *lls)
1296 {
1297         __u64   lastid;
1298         loff_t  pos     = 0;
1299         int     rc;
1300
1301         dt_read_lock(env, lls->lls_lastid_obj, 0);
1302         rc = dt_record_read(env, lls->lls_lastid_obj,
1303                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1304         dt_read_unlock(env, lls->lls_lastid_obj);
1305         if (unlikely(rc != 0))
1306                 return rc;
1307
1308         lastid = le64_to_cpu(lastid);
1309         if (lastid < lls->lls_lastid_known) {
1310                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1311                 struct lfsck_layout     *lo     = com->lc_file_ram;
1312
1313                 lls->lls_lastid = lls->lls_lastid_known;
1314                 lls->lls_dirty = 1;
1315                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1316                         LASSERT(lfsck->li_out_notify != NULL);
1317
1318                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1319                                              LE_LASTID_REBUILDING);
1320                         lo->ll_flags |= LF_CRASHED_LASTID;
1321
1322                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
1323                                "LAST_ID file (1) for the sequence %#llx"
1324                                ", old value %llu, known value %llu\n",
1325                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1326                                lastid, lls->lls_lastid);
1327                 }
1328         } else if (lastid >= lls->lls_lastid) {
1329                 lls->lls_lastid = lastid;
1330                 lls->lls_dirty = 0;
1331         }
1332
1333         return 0;
1334 }
1335
1336 static int
1337 lfsck_layout_lastid_store(const struct lu_env *env,
1338                           struct lfsck_component *com)
1339 {
1340         struct lfsck_instance           *lfsck  = com->lc_lfsck;
1341         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
1342         struct dt_device                *dt     = lfsck->li_bottom;
1343         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
1344         struct lfsck_layout_seq         *lls;
1345         struct thandle                  *th;
1346         __u64                            lastid;
1347         int                              rc     = 0;
1348         int                              rc1    = 0;
1349
1350         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1351                 loff_t pos = 0;
1352
1353                 if (!lls->lls_dirty)
1354                         continue;
1355
1356                 CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for "
1357                        "<seq> %#llx as <oid> %llu\n",
1358                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1359
1360                 if (bk->lb_param & LPF_DRYRUN) {
1361                         lls->lls_dirty = 0;
1362                         continue;
1363                 }
1364
1365                 th = lfsck_trans_create(env, dt, lfsck);
1366                 if (IS_ERR(th)) {
1367                         rc1 = PTR_ERR(th);
1368                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1369                                "the LAST_ID for <seq> %#llx(1): rc = %d\n",
1370                                lfsck_lfsck2name(com->lc_lfsck),
1371                                lls->lls_seq, rc1);
1372                         continue;
1373                 }
1374
1375                 lastid = cpu_to_le64(lls->lls_lastid);
1376                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1377                                              lfsck_buf_get(env, &lastid,
1378                                                            sizeof(lastid)),
1379                                              pos, th);
1380                 if (rc != 0)
1381                         goto stop;
1382
1383                 rc = dt_trans_start_local(env, dt, th);
1384                 if (rc != 0)
1385                         goto stop;
1386
1387                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1388                 rc = dt_record_write(env, lls->lls_lastid_obj,
1389                                      lfsck_buf_get(env, &lastid,
1390                                      sizeof(lastid)), &pos, th);
1391                 dt_write_unlock(env, lls->lls_lastid_obj);
1392                 if (rc == 0)
1393                         lls->lls_dirty = 0;
1394
1395 stop:
1396                 dt_trans_stop(env, dt, th);
1397                 if (rc != 0) {
1398                         rc1 = rc;
1399                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1400                                "the LAST_ID for <seq> %#llx(2): rc = %d\n",
1401                                lfsck_lfsck2name(com->lc_lfsck),
1402                                lls->lls_seq, rc1);
1403                 }
1404         }
1405
1406         return rc1;
1407 }
1408
1409 static int
1410 lfsck_layout_lastid_load(const struct lu_env *env,
1411                          struct lfsck_component *com,
1412                          struct lfsck_layout_seq *lls)
1413 {
1414         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1415         struct lfsck_layout     *lo     = com->lc_file_ram;
1416         struct lu_fid           *fid    = &lfsck_env_info(env)->lti_fid;
1417         struct dt_object        *obj;
1418         loff_t                   pos    = 0;
1419         int                      rc;
1420         ENTRY;
1421
1422         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1423         obj = dt_locate(env, lfsck->li_bottom, fid);
1424         if (IS_ERR(obj))
1425                 RETURN(PTR_ERR(obj));
1426
1427         /* LAST_ID crashed, to be rebuilt */
1428         if (dt_object_exists(obj) == 0) {
1429                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1430                         LASSERT(lfsck->li_out_notify != NULL);
1431
1432                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1433                                              LE_LASTID_REBUILDING);
1434                         lo->ll_flags |= LF_CRASHED_LASTID;
1435
1436                         CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the "
1437                                "LAST_ID file for sequence %#llx\n",
1438                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1439
1440                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1441                             cfs_fail_val > 0) {
1442                                 struct ptlrpc_thread *thread =
1443                                         &lfsck->li_thread;
1444
1445                                 up_write(&com->lc_sem);
1446                                 wait_event_idle_timeout(
1447                                         thread->t_ctl_waitq,
1448                                         !thread_is_running(thread),
1449                                         cfs_time_seconds(cfs_fail_val));
1450                                 down_write(&com->lc_sem);
1451                         }
1452                 }
1453
1454                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1455         } else {
1456                 dt_read_lock(env, obj, 0);
1457                 rc = dt_read(env, obj,
1458                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1459                         &pos);
1460                 dt_read_unlock(env, obj);
1461                 if (rc != 0 && rc != sizeof(__u64))
1462                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1463
1464                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1465                         LASSERT(lfsck->li_out_notify != NULL);
1466
1467                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1468                                              LE_LASTID_REBUILDING);
1469                         lo->ll_flags |= LF_CRASHED_LASTID;
1470
1471                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid "
1472                                "LAST_ID file for the sequence %#llx"
1473                                ": rc = %d\n",
1474                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1475                 }
1476
1477                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1478                 rc = 0;
1479         }
1480
1481         GOTO(out, rc);
1482
1483 out:
1484         if (rc != 0)
1485                 lfsck_object_put(env, obj);
1486         else
1487                 lls->lls_lastid_obj = obj;
1488
1489         return rc;
1490 }
1491
1492 static void lfsck_layout_record_failure(const struct lu_env *env,
1493                                         struct lfsck_instance *lfsck,
1494                                         struct lfsck_layout *lo)
1495 {
1496         __u64 cookie;
1497
1498         lo->ll_objs_failed_phase1++;
1499         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1500                                                         lfsck->li_di_oit);
1501         if (lo->ll_pos_first_inconsistent == 0 ||
1502             lo->ll_pos_first_inconsistent < cookie) {
1503                 lo->ll_pos_first_inconsistent = cookie;
1504
1505                 CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired "
1506                        "inconsistency at the pos [%llu]\n",
1507                        lfsck_lfsck2name(lfsck),
1508                        lo->ll_pos_first_inconsistent);
1509         }
1510 }
1511
1512 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1513                                            struct lfsck_component *com,
1514                                            int rc)
1515 {
1516         struct lfsck_instance   *lfsck = com->lc_lfsck;
1517         struct lfsck_layout     *lo    = com->lc_file_ram;
1518
1519         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1520                lfsck_lfsck2name(lfsck), rc);
1521
1522         down_write(&com->lc_sem);
1523         lo->ll_run_time_phase2 += ktime_get_seconds() -
1524                                   com->lc_time_last_checkpoint;
1525         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
1526         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1527
1528         if (rc > 0) {
1529                 if (lo->ll_flags & LF_INCOMPLETE) {
1530                         lo->ll_status = LS_PARTIAL;
1531                 } else {
1532                         if (lfsck->li_master) {
1533                                 struct lfsck_assistant_data *lad = com->lc_data;
1534
1535                                 if (test_bit(LAD_INCOMPLETE, &lad->lad_flags))
1536                                         lo->ll_status = LS_PARTIAL;
1537                                 else
1538                                         lo->ll_status = LS_COMPLETED;
1539                         } else {
1540                                 lo->ll_status = LS_COMPLETED;
1541                         }
1542                 }
1543                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1544                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1545                         lo->ll_flags &= ~LF_INCONSISTENT;
1546                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1547                 lo->ll_success_count++;
1548         } else if (rc == 0) {
1549                 if (lfsck->li_status != 0)
1550                         lo->ll_status = lfsck->li_status;
1551                 else
1552                         lo->ll_status = LS_STOPPED;
1553         } else {
1554                 lo->ll_status = LS_FAILED;
1555         }
1556
1557         rc = lfsck_layout_store(env, com);
1558         up_write(&com->lc_sem);
1559
1560         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1561                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1562
1563         return rc;
1564 }
1565
1566 static int lfsck_layout_trans_stop(const struct lu_env *env,
1567                                    struct dt_device *dev,
1568                                    struct thandle *handle, int result)
1569 {
1570         int rc;
1571
1572         /* XXX: If there is something worng or it needs to repair nothing,
1573          *      then notify the lower to stop the modification. Currently,
1574          *      we use th_result for such purpose, that may be replaced by
1575          *      some rollback mechanism in the future. */
1576         handle->th_result = result;
1577         rc = dt_trans_stop(env, dev, handle);
1578         if (result != 0)
1579                 return result > 0 ? 0 : result;
1580
1581         return rc == 0 ? 1 : rc;
1582 }
1583
1584 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1585                                          struct lfsck_component *com,
1586                                          const struct lu_fid *pfid,
1587                                          const struct lu_fid *cfid,
1588                                          __u32 comp_id, __u32 ea_off,
1589                                          __u32 ost_idx)
1590 {
1591         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1592         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1593         struct dt_device *dev;
1594         struct dt_object *obj;
1595         struct thandle *th = NULL;
1596         int idx;
1597         int rc = 0;
1598         ENTRY;
1599
1600         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
1601                 GOTO(log, rc = 0);
1602
1603         idx = lfsck_sub_trace_file_fid2idx(pfid);
1604         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1605         dev = lfsck_obj2dev(obj);
1606
1607         fid_cpu_to_be(&key->lldk_fid, pfid);
1608         key->lldk_comp_id = cpu_to_be32(comp_id);
1609         key->lldk_ea_off = cpu_to_be32(ea_off);
1610
1611         fid_cpu_to_be(rec, cfid);
1612         rec->f_ver = cpu_to_be32(ost_idx);
1613
1614         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1615
1616         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1617         if (IS_ERR(th))
1618                 GOTO(unlock, rc = PTR_ERR(th));
1619
1620         rc = dt_declare_insert(env, obj,
1621                                (const struct dt_rec *)rec,
1622                                (const struct dt_key *)key, th);
1623         if (rc)
1624                 GOTO(unlock, rc);
1625
1626         rc = dt_trans_start_local(env, dev, th);
1627         if (rc)
1628                 GOTO(unlock, rc);
1629
1630         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1631                        (const struct dt_key *)key, th);
1632
1633         GOTO(unlock, rc);
1634
1635 unlock:
1636         if (th && !IS_ERR(th))
1637                 dt_trans_stop(env, dev, th);
1638
1639         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1640
1641 log:
1642         CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, "
1643                "ea_off = %u, ost_idx = %u, into the trace file for further "
1644                "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
1645                PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc);
1646
1647         return rc;
1648 }
1649
1650 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1651                                          struct lfsck_component *com,
1652                                          const struct lu_fid *fid,
1653                                          __u32 comp_id, __u32 ea_off)
1654 {
1655         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1656         struct dt_device *dev;
1657         struct dt_object *obj;
1658         struct thandle *th = NULL;
1659         int idx;
1660         int rc = 0;
1661         ENTRY;
1662
1663         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
1664                 GOTO(log, rc = 0);
1665
1666         idx = lfsck_sub_trace_file_fid2idx(fid);
1667         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1668         dev = lfsck_obj2dev(obj);
1669
1670         fid_cpu_to_be(&key->lldk_fid, fid);
1671         key->lldk_comp_id = cpu_to_be32(comp_id);
1672         key->lldk_ea_off = cpu_to_be32(ea_off);
1673
1674         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1675
1676         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1677         if (IS_ERR(th))
1678                 GOTO(unlock, rc = PTR_ERR(th));
1679
1680         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1681         if (rc)
1682                 GOTO(unlock, rc);
1683
1684         rc = dt_trans_start_local(env, dev, th);
1685         if (rc)
1686                 GOTO(unlock, rc);
1687
1688         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1689
1690         GOTO(unlock, rc);
1691
1692 unlock:
1693         if (th && !IS_ERR(th))
1694                 dt_trans_stop(env, dev, th);
1695
1696         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1697
1698 log:
1699         CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID
1700                ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1701                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1702
1703         return rc;
1704 }
1705
1706 /**
1707  * Get the system default stripe size.
1708  *
1709  * \param[in] env       pointer to the thread context
1710  * \param[in] lfsck     pointer to the lfsck instance
1711  * \param[out] size     pointer to the default stripe size
1712  *
1713  * \retval              0 for success
1714  * \retval              negative error number on failure
1715  */
1716 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1717                                            struct lfsck_instance *lfsck,
1718                                            __u32 *size)
1719 {
1720         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1721         struct dt_object        *root;
1722         int                      rc;
1723
1724         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1725         if (IS_ERR(root))
1726                 return PTR_ERR(root);
1727
1728         /* Get the default stripe size via xattr_get on the backend root. */
1729         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1730                           XATTR_NAME_LOV);
1731         if (rc > 0) {
1732                 /* The lum->lmm_stripe_size is LE mode. The *size also
1733                  * should be LE mode. So it is unnecessary to convert. */
1734                 *size = lum->lmm_stripe_size;
1735                 rc = 0;
1736         } else if (unlikely(rc == 0)) {
1737                 rc = -EINVAL;
1738         }
1739
1740         lfsck_object_put(env, root);
1741
1742         return rc;
1743 }
1744
1745 /**
1746  * \retval       +1: repaired
1747  * \retval        0: did nothing
1748  * \retval      -ve: on error
1749  */
1750 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1751                                      struct lfsck_instance *lfsck,
1752                                      struct thandle *handle,
1753                                      struct dt_object *parent,
1754                                      const struct lu_fid *cfid,
1755                                      struct lu_buf *buf,
1756                                      struct lov_mds_md_v1 *lmm,
1757                                      struct lov_ost_data_v1 *slot,
1758                                      int fl, __u32 ost_idx, int size)
1759 {
1760         struct ost_id           *oi     = &lfsck_env_info(env)->lti_oi;
1761         struct lu_buf            ea_buf;
1762         int                      rc;
1763         __u32                    magic;
1764         __u32                    pattern;
1765         __u16                    count;
1766         ENTRY;
1767
1768         magic = le32_to_cpu(lmm->lmm_magic);
1769         pattern = le32_to_cpu(lmm->lmm_pattern);
1770         count = le16_to_cpu(lmm->lmm_stripe_count);
1771
1772         fid_to_ostid(cfid, oi);
1773         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1774         slot->l_ost_gen = cpu_to_le32(0);
1775         slot->l_ost_idx = cpu_to_le32(ost_idx);
1776
1777         if (pattern & LOV_PATTERN_F_HOLE) {
1778                 struct lov_ost_data_v1 *objs;
1779                 int                     i;
1780
1781                 if (magic == LOV_MAGIC_V1)
1782                         objs = &lmm->lmm_objects[0];
1783                 else
1784                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1785                 for (i = 0; i < count; i++, objs++) {
1786                         if (lovea_slot_is_dummy(objs))
1787                                 break;
1788                 }
1789
1790                 /* If the @slot is the last dummy slot to be refilled,
1791                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */
1792                 if (i == count) {
1793                         lmm->lmm_pattern =
1794                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1795
1796                         CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID
1797                                ": parent "DFID"\n", lfsck_lfsck2name(lfsck),
1798                                PFID(cfid), PFID(lfsck_dto2fid(parent)));
1799                 }
1800         }
1801
1802         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1803         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1804         if (rc == 0)
1805                 rc = 1;
1806
1807         RETURN(rc);
1808 }
1809
1810 static struct lov_ost_data_v1 *
1811 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1812                             const struct lu_fid *pfid,
1813                             __u32 stripe_size, __u32 ea_off,
1814                             __u32 pattern, __u16 count)
1815 {
1816         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1817         lmm->lmm_pattern = cpu_to_le32(pattern);
1818         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1819         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1820         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1821         lmm->lmm_stripe_count = cpu_to_le16(count);
1822         lmm->lmm_layout_gen = cpu_to_le16(1);
1823         memset(&lmm->lmm_objects[0], 0,
1824                sizeof(struct lov_ost_data_v1) * count);
1825
1826         return &lmm->lmm_objects[ea_off];
1827 }
1828
1829 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1830                                      struct lfsck_instance *lfsck,
1831                                      struct ost_layout *ol,
1832                                      struct dt_object *parent,
1833                                      struct lu_buf *buf, __u32 ea_off,
1834                                      struct lov_mds_md_v1 **lmm,
1835                                      struct lov_ost_data_v1 **objs)
1836 {
1837         int size;
1838         __u32 stripe_size = ol->ol_stripe_size;
1839         __u32 pattern = LOV_PATTERN_RAID0;
1840         __u16 count;
1841
1842         if (ol->ol_stripe_count != 0)
1843                 count = ol->ol_stripe_count;
1844         else
1845                 count = ea_off + 1;
1846
1847         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1848         LASSERTF(buf->lb_len >= size,
1849                  "buffer len %d is less than real size %d\n",
1850                  (int)buf->lb_len, size);
1851
1852         if (stripe_size == 0) {
1853                 int rc;
1854
1855                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1856                 if (rc)
1857                         return rc;
1858         }
1859
1860         *lmm = buf->lb_buf;
1861         if (ol->ol_stripe_count > 1 ||
1862             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1863                 pattern |= LOV_PATTERN_F_HOLE;
1864                 memset(&(*lmm)->lmm_objects[0], 0,
1865                        count * sizeof(struct lov_ost_data_v1));
1866         }
1867
1868         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1869                                 stripe_size, ea_off, pattern, count);
1870
1871         return size;
1872 }
1873
1874 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1875                                        struct lu_orphan_rec_v3 *rec,
1876                                        struct dt_object *parent,
1877                                        struct lu_buf *buf, __u32 ea_off,
1878                                        struct lov_mds_md_v1 **lmm,
1879                                        struct lov_ost_data_v1 **objs)
1880 {
1881         struct ost_layout *ol = &rec->lor_layout;
1882         struct lov_comp_md_v1 *lcm;
1883         struct lov_comp_md_entry_v1 *lcme;
1884         __u32 pattern = LOV_PATTERN_RAID0;
1885         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1886         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1887         int size = offset + lcme_size;
1888
1889         LASSERTF(buf->lb_len >= size,
1890                  "buffer len %d is less than real size %d\n",
1891                  (int)buf->lb_len, size);
1892
1893         lcm = buf->lb_buf;
1894         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1895         lcm->lcm_size = cpu_to_le32(size);
1896         if (rec->lor_range) {
1897                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1898                                                   rec->lor_range);
1899                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1900         } else if (rec->lor_layout_version) {
1901                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1902                                                   rec->lor_range);
1903                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1904         } else {
1905                 /*
1906                  * if OST doesn't provide layout version, then try
1907                  * to inherit one from MDS's layout, but increment
1908                  * it so the client notices and applies modified
1909                  * layout
1910                  */
1911                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1912                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1913         }
1914         lcm->lcm_entry_count = cpu_to_le16(1);
1915         /* Currently, we do not know how many mirrors will be, set it as zero
1916          * at the beginning. It will be updated when more mirrors are found. */
1917         lcm->lcm_mirror_count = 0;
1918
1919         lcme = &lcm->lcm_entries[0];
1920         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1921         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1922         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1923         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1924         lcme->lcme_offset = cpu_to_le32(offset);
1925         lcme->lcme_size = cpu_to_le32(lcme_size);
1926         lcme->lcme_layout_gen = lcm->lcm_layout_gen;
1927         if (ol->ol_stripe_count > 1)
1928                 pattern |= LOV_PATTERN_F_HOLE;
1929
1930         *lmm = buf->lb_buf + offset;
1931         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1932                                             ol->ol_stripe_size, ea_off,
1933                                             pattern, ol->ol_stripe_count);
1934
1935         return size;
1936 }
1937
1938 static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm,
1939                                     struct lov_comp_md_entry_v1 *lcme,
1940                                     __u32 version, __u32 range)
1941 {
1942         struct lov_comp_md_entry_v1 *tmp;
1943         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
1944         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
1945         __u32 gen = version + range;
1946         __u32 tmp_gen;
1947         int i;
1948         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1949         __u16 flags = le16_to_cpu(lcm->lcm_flags);
1950
1951         if (!gen)
1952                 gen = 1;
1953         lcme->lcme_layout_gen = cpu_to_le32(gen);
1954         if (le32_to_cpu(lcm->lcm_layout_gen) < gen)
1955                 lcm->lcm_layout_gen = cpu_to_le32(gen);
1956
1957         if (range)
1958                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1959         else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0)
1960                 lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
1961
1962         for (i = 0; i < count; i++) {
1963                 tmp = &lcm->lcm_entries[i];
1964                 if (le64_to_cpu(tmp->lcme_extent.e_end) <= start)
1965                         continue;
1966
1967                 if (le64_to_cpu(tmp->lcme_extent.e_start) >= end)
1968                         continue;
1969
1970                 if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE)
1971                         continue;
1972
1973                 tmp_gen = le32_to_cpu(tmp->lcme_layout_gen);
1974                 /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag,
1975                  * then it should be the latest version of all mirrors. */
1976                 if (tmp_gen == 0 || tmp_gen > gen) {
1977                         lcme->lcme_flags = cpu_to_le32(
1978                                 le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE);
1979                         break;
1980                 }
1981
1982                 if (tmp_gen < gen)
1983                         tmp->lcme_flags = cpu_to_le32(
1984                                 le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE);
1985         }
1986 }
1987
1988 static int lfsck_layout_add_comp(const struct lu_env *env,
1989                                  struct lfsck_instance *lfsck,
1990                                  struct thandle *handle,
1991                                  struct lu_orphan_rec_v3 *rec,
1992                                  struct dt_object *parent,
1993                                  const struct lu_fid *cfid,
1994                                  struct lu_buf *buf, __u32 ost_idx,
1995                                  __u32 ea_off, int pos, bool new_mirror)
1996 {
1997         struct ost_layout *ol = &rec->lor_layout;
1998         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1999         struct lov_comp_md_entry_v1 *lcme;
2000         struct lov_mds_md_v1 *lmm;
2001         struct lov_ost_data_v1 *objs;
2002         int added = sizeof(*lcme) +
2003                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2004         int size = le32_to_cpu(lcm->lcm_size) + added;
2005         int rc;
2006         int i;
2007         __u32 offset;
2008         __u32 pattern = LOV_PATTERN_RAID0;
2009         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
2010         ENTRY;
2011
2012         lu_buf_check_and_grow(buf, size);
2013         /* set the lcm again because lu_buf_check_and_grow() may
2014          * have reallocated the buf. */
2015         lcm = buf->lb_buf;
2016         lcm->lcm_size = cpu_to_le32(size);
2017         lcm->lcm_entry_count = cpu_to_le16(count + 1);
2018         if (new_mirror)
2019                 le16_add_cpu(&lcm->lcm_mirror_count, 1);
2020
2021         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
2022          *    with distance of 'added'. */
2023         if (pos < count) {
2024                 size = 0;
2025                 for (i = pos; i < count; i++) {
2026                         lcme = &lcm->lcm_entries[i];
2027                         size += le32_to_cpu(lcme->lcme_size);
2028                 }
2029
2030                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
2031                 memmove(buf->lb_buf + offset + added,
2032                         buf->lb_buf + offset, size);
2033         }
2034
2035         size = 0;
2036         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
2037          *    of 'sizeof(struct lov_comp_md_entry_v1)' */
2038         if (pos > 0) {
2039                 for (i = 0; i < pos; i++) {
2040                         lcme = &lcm->lcm_entries[i];
2041                         size += le32_to_cpu(lcme->lcme_size);
2042                 }
2043
2044                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
2045                 memmove(buf->lb_buf + offset + sizeof(*lcme),
2046                         buf->lb_buf + offset, size);
2047         }
2048
2049         /* 3. Recalculate the enter offset for the component [pos, count-1] */
2050         for (i = count - 1; i >= pos; i--) {
2051                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
2052                 lcm->lcm_entries[i + 1].lcme_offset =
2053                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
2054                                                 lcme_offset) + added);
2055         }
2056
2057         /* 4. Recalculate the enter offset for the component [0, pos) */
2058         for (i = 0; i < pos; i++) {
2059                 lcm->lcm_entries[i].lcme_offset =
2060                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
2061                                                 lcme_offset) + sizeof(*lcme));
2062         }
2063
2064         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
2065         /* 4. Insert the new component header (entry) at the slot 'pos'. */
2066         lcme = &lcm->lcm_entries[pos];
2067         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
2068         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
2069         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
2070         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
2071         lcme->lcme_offset = cpu_to_le32(offset);
2072         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
2073                                                       LOV_MAGIC_V1));
2074
2075         if (ol->ol_stripe_count > 1)
2076                 pattern |= LOV_PATTERN_F_HOLE;
2077
2078         lmm = buf->lb_buf + offset;
2079         /* 5. Insert teh new component body at the 'offset'. */
2080         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
2081                                            ol->ol_stripe_size, ea_off,
2082                                            pattern, ol->ol_stripe_count);
2083
2084         /* 6. Update mirror related flags and version. */
2085         lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version,
2086                                 rec->lor_range);
2087
2088         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2089                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
2090                                        le32_to_cpu(lcm->lcm_size));
2091
2092         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for "
2093                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2094                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2095                "comp_end %llu, layout version %u, range %u, "
2096                "%s LOV EA hole: rc = %d\n",
2097                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2098                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2099                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2100                rec->lor_layout_version, rec->lor_range,
2101                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2102                "with" : "without", rc);
2103
2104         RETURN(rc);
2105 }
2106
2107 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
2108                                           struct lfsck_instance *lfsck,
2109                                           struct thandle *handle,
2110                                           struct ost_layout *ol,
2111                                           struct dt_object *parent,
2112                                           const struct lu_fid *cfid,
2113                                           struct lu_buf *buf, __u32 ost_idx,
2114                                           __u32 ea_off)
2115 {
2116         struct lov_mds_md_v1 *lmm = buf->lb_buf;
2117         struct lov_ost_data_v1 *objs;
2118         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
2119         __u32 magic = le32_to_cpu(lmm->lmm_magic);
2120         int size;
2121         int gap;
2122         int rc;
2123         ENTRY;
2124
2125         /* The original LOVEA maybe re-generated via old filter_fid, at
2126          * that time, we do not know the stripe count and stripe size. */
2127         if (ol->ol_stripe_count > count)
2128                 count = ol->ol_stripe_count;
2129         if (ol->ol_stripe_size != 0 &&
2130             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
2131                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
2132
2133         if (magic == LOV_MAGIC_V1)
2134                 objs = &lmm->lmm_objects[count];
2135         else
2136                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
2137
2138         gap = ea_off - count;
2139         if (gap >= 0)
2140                 count = ea_off + 1;
2141
2142         size = lov_mds_md_size(count, magic);
2143         LASSERTF(buf->lb_len >= size,
2144                  "buffer len %d is less than real size %d\n",
2145                  (int)buf->lb_len, size);
2146
2147         if (gap > 0) {
2148                 memset(objs, 0, gap * sizeof(*objs));
2149                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
2150         }
2151
2152         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2153         lmm->lmm_stripe_count = cpu_to_le16(count);
2154         objs += gap;
2155
2156         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2157                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
2158
2159         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for "
2160                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2161                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2162                "comp_end %llu, %s LOV EA hole: rc = %d\n",
2163                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2164                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2165                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2166                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2167                "with" : "without", rc);
2168
2169         RETURN(rc);
2170 }
2171
2172 /**
2173  * \retval       +1: repaired
2174  * \retval        0: did nothing
2175  * \retval      -ve: on error
2176  */
2177 static int lfsck_layout_update_lovea(const struct lu_env *env,
2178                                      struct lfsck_instance *lfsck,
2179                                      struct thandle *handle,
2180                                      struct lu_orphan_rec_v3 *rec,
2181                                      struct dt_object *parent,
2182                                      const struct lu_fid *cfid,
2183                                      struct lu_buf *buf, int fl,
2184                                      __u32 ost_idx, __u32 ea_off)
2185 {
2186         struct ost_layout *ol = &rec->lor_layout;
2187         struct lov_mds_md_v1 *lmm = NULL;
2188         struct lov_ost_data_v1 *objs = NULL;
2189         int rc = 0;
2190         ENTRY;
2191
2192         if (ol->ol_comp_id != 0)
2193                 rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off,
2194                                                  &lmm, &objs);
2195         else
2196                 rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout,
2197                                                parent, buf, ea_off, &lmm,
2198                                                &objs);
2199         if (rc > 0)
2200                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2201                                                buf, lmm, objs, fl, ost_idx, rc);
2202
2203         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for "
2204                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2205                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2206                "comp_end %llu, layout version %u, range %u, fl %d, "
2207                "%s LOV EA hole: rc = %d\n",
2208                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2209                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2210                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2211                rec->lor_layout_version, rec->lor_range, fl,
2212                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2213                "with" : "without", rc);
2214
2215         RETURN(rc);
2216 }
2217
2218 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2219                                       struct lfsck_component *com,
2220                                       struct dt_object *child,
2221                                       const struct lu_fid *pfid,
2222                                       const struct ost_layout *ol, __u32 offset,
2223                                       __u32 version, __u32 range)
2224 {
2225         struct dt_device        *dev    = lfsck_obj2dev(child);
2226         struct filter_fid       *ff     = &lfsck_env_info(env)->lti_ff;
2227         struct thandle          *handle;
2228         struct lu_buf            buf    = { NULL };
2229         int                      rc;
2230
2231         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
2232                 RETURN(0);
2233
2234         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2235         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2236         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2237          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2238          * parent MDT-object's layout EA. */
2239         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2240         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2241         ff->ff_layout_version = cpu_to_le32(version);
2242         ff->ff_range = cpu_to_le32(range);
2243         lfsck_buf_init(&buf, ff, sizeof(*ff));
2244
2245         if (!dt_object_exists(child) || lfsck_is_dead_obj(child))
2246                 return 0;
2247
2248         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
2249         if (IS_ERR(handle))
2250                 RETURN(PTR_ERR(handle));
2251
2252         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2253         if (rc != 0)
2254                 GOTO(stop, rc);
2255
2256         rc = dt_trans_start_local(env, dev, handle);
2257         if (rc != 0)
2258                 GOTO(stop, rc);
2259
2260         dt_write_lock(env, child, 0);
2261         if (dt_object_exists(child) && !lfsck_is_dead_obj(child))
2262                 rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2263         dt_write_unlock(env, child);
2264
2265         GOTO(stop, rc);
2266
2267 stop:
2268         dt_trans_stop(env, dev, handle);
2269
2270         return rc;
2271 }
2272
2273 /**
2274  * \retval       +1: repaired
2275  * \retval        0: did nothing
2276  * \retval      -ve: on error
2277  */
2278 static int lfsck_layout_update_pfid(const struct lu_env *env,
2279                                     struct lfsck_component *com,
2280                                     struct dt_object *parent,
2281                                     struct lu_fid *cfid,
2282                                     struct dt_device *cdev,
2283                                     struct lu_orphan_rec_v3 *rec, __u32 ea_off)
2284 {
2285         struct dt_object        *child;
2286         int                      rc     = 0;
2287         ENTRY;
2288
2289         child = lfsck_object_find_by_dev(env, cdev, cfid);
2290         if (IS_ERR(child))
2291                 RETURN(PTR_ERR(child));
2292
2293         rc = __lfsck_layout_update_pfid(env, com, child,
2294                                         lu_object_fid(&parent->do_lu),
2295                                         &rec->lor_layout, ea_off,
2296                                         rec->lor_layout_version,
2297                                         rec->lor_range);
2298         lfsck_object_put(env, child);
2299
2300         RETURN(rc == 0 ? 1 : rc);
2301 }
2302
2303 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2304 {
2305         if (ol->ol_comp_id != 0)
2306                 return sizeof(struct lov_comp_md_v1) +
2307                        sizeof(struct lov_comp_md_entry_v1) +
2308                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2309
2310         if (ol->ol_stripe_count != 0)
2311                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2312
2313         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2314 }
2315
2316 /**
2317  * This function will create the MDT-object with the given (partial) LOV EA.
2318  *
2319  * Under some data corruption cases, the MDT-object of the file may be lost,
2320  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2321  * re-create the MDT-object with the orphan OST-object(s) information.
2322  *
2323  * On the other hand, the LFSCK may has created some OST-object for repairing
2324  * dangling LOV EA reference, but as the LFSCK processing, it may find that
2325  * the old OST-object is there and should replace the former new created OST
2326  * object. Unfortunately, some others have modified such newly created object.
2327  * To keep the data (both new and old), the LFSCK will create MDT-object with
2328  * new FID to reference the original OST-object.
2329  *
2330  * \param[in] env       pointer to the thread context
2331  * \param[in] com       pointer to the lfsck component
2332  * \param[in] ltd       pointer to target device descriptor
2333  * \param[in] rec       pointer to the record for the orphan OST-object
2334  * \param[in] cfid      pointer to FID for the orphan OST-object
2335  * \param[in] infix     additional information, such as the FID for original
2336  *                      MDT-object and the stripe offset in the LOV EA
2337  * \param[in] type      the type for describing why the orphan MDT-object is
2338  *                      created. The rules are as following:
2339  *
2340  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2341  *                      same slot in the layout EA. Then the LFSCK will create
2342  *                      new MDT-object(s) to hold the conflict OST-object(s).
2343  *
2344  *  type "N":           The orphan OST-object does not know which one was the
2345  *                      real parent MDT-object, so the LFSCK uses new FID for
2346  *                      its parent MDT-object.
2347  *
2348  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2349  *                      but does not know the position (the file name) in the
2350  *                      layout.
2351  *
2352  *  type "D":           The MDT-object is a directory, it may knows its parent
2353  *                      but because there is no valid linkEA, the LFSCK cannot
2354  *                      know where to put it back to the namespace.
2355  *  type "O":           The MDT-object has no linkEA, and there is no name
2356  *                      entry that references the MDT-object.
2357  *
2358  *  type "P":           The orphan object to be created was a parent directory
2359  *                      of some MDT-object which linkEA shows that the @orphan
2360  *                      object is missing.
2361  *
2362  * The orphan name will be like:
2363  * ${FID}-${infix}-${type}-${conflict_version}
2364  *
2365  * \param[in] ea_off    the stripe offset in the LOV EA
2366  *
2367  * \retval              positive on repaired something
2368  * \retval              0 if needs to repair nothing
2369  * \retval              negative error number on failure
2370  */
2371 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2372                                         struct lfsck_component *com,
2373                                         struct lfsck_tgt_desc *ltd,
2374                                         struct lu_orphan_rec_v3 *rec,
2375                                         struct lu_fid *cfid,
2376                                         const char *infix,
2377                                         const char *type,
2378                                         __u32 ea_off)
2379 {
2380         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2381         struct dt_insert_rec            *dtrec  = &info->lti_dt_rec;
2382         char                            *name   = info->lti_key;
2383         struct lu_attr                  *la     = &info->lti_la2;
2384         struct dt_object_format         *dof    = &info->lti_dof;
2385         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2386         struct lu_fid                   *pfid   = &rec->lor_rec.lor_fid;
2387         struct lu_fid                   *tfid   = &info->lti_fid3;
2388         struct dt_device                *dev    = lfsck->li_bottom;
2389         struct dt_object                *lpf    = lfsck->li_lpf_obj;
2390         struct dt_object                *pobj   = NULL;
2391         struct dt_object                *cobj   = NULL;
2392         struct thandle                  *th     = NULL;
2393         struct lu_buf                   *ea_buf = &info->lti_big_buf;
2394         struct lu_buf                    lov_buf;
2395         struct lfsck_lock_handle        *llh    = &info->lti_llh;
2396         struct linkea_data               ldata  = { NULL };
2397         struct lu_buf                    linkea_buf;
2398         const struct lu_name            *pname;
2399         int                              size   = 0;
2400         int                              idx    = 0;
2401         int                              rc     = 0;
2402         ENTRY;
2403
2404         if (lfsck_is_dryrun(lfsck))
2405                 GOTO(log, rc = 0);
2406
2407         if (unlikely(lpf == NULL))
2408                 GOTO(log, rc = -ENXIO);
2409
2410         /* We use two separated transactions to repair the inconsistency.
2411          *
2412          * 1) create the MDT-object locally.
2413          * 2) update the OST-object's PFID EA if necessary.
2414          *
2415          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2416          * updated when the layout LFSCK run next time.
2417          *
2418          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2419          * when the layout LFSCK run next time. */
2420
2421         if (fid_is_zero(pfid)) {
2422                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2423                 if (rc != 0)
2424                         GOTO(log, rc);
2425
2426                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2427                 if (IS_ERR(cobj))
2428                         GOTO(log, rc = PTR_ERR(cobj));
2429         }
2430
2431         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2432         if (IS_ERR(pobj))
2433                 GOTO(log, rc = PTR_ERR(pobj));
2434
2435         LASSERT(infix != NULL);
2436         LASSERT(type != NULL);
2437
2438         memset(la, 0, sizeof(*la));
2439         la->la_uid = rec->lor_rec.lor_uid;
2440         la->la_gid = rec->lor_rec.lor_gid;
2441         la->la_mode = S_IFREG | S_IRUSR;
2442         la->la_valid = LA_MODE | LA_UID | LA_GID;
2443
2444         memset(dof, 0, sizeof(*dof));
2445         dof->dof_type = dt_mode_to_dft(S_IFREG);
2446         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2447          * the stripe(s). The LFSCK will specify the LOV EA via
2448          * lfsck_layout_update_lovea(). */
2449
2450         size = lfsck_lovea_size(&rec->lor_layout, ea_off);
2451         if (ea_buf->lb_len < size) {
2452                 lu_buf_realloc(ea_buf, size);
2453                 if (ea_buf->lb_buf == NULL)
2454                         GOTO(log, rc = -ENOMEM);
2455         }
2456
2457 again:
2458         do {
2459                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2460                          type, idx++);
2461                 rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2462                 if (rc != 0 && rc != -ENOENT)
2463                         GOTO(log, rc);
2464         } while (rc == 0);
2465
2466         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2467                         MDS_INODELOCK_UPDATE, LCK_PW);
2468         if (rc != 0)
2469                 GOTO(log, rc);
2470
2471         /* Re-check whether the name conflict with othrs after taken
2472          * the ldlm lock. */
2473         rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2474         if (unlikely(rc == 0)) {
2475                 lfsck_unlock(llh);
2476                 goto again;
2477         }
2478
2479         if (rc != -ENOENT)
2480                 GOTO(unlock, rc);
2481
2482         pname = lfsck_name_get_const(env, name, strlen(name));
2483         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2484                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2485         if (rc != 0)
2486                 GOTO(unlock, rc);
2487
2488         /* The 1st transaction. */
2489         th = lfsck_trans_create(env, dev, lfsck);
2490         if (IS_ERR(th))
2491                 GOTO(unlock, rc = PTR_ERR(th));
2492
2493         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2494         if (rc != 0)
2495                 GOTO(stop, rc);
2496
2497         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2498         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2499                                   LU_XATTR_CREATE, th);
2500         if (rc != 0)
2501                 GOTO(stop, rc);
2502
2503         dtrec->rec_fid = pfid;
2504         dtrec->rec_type = S_IFREG;
2505         rc = dt_declare_insert(env, lpf,
2506                                (const struct dt_rec *)dtrec,
2507                                (const struct dt_key *)name, th);
2508         if (rc != 0)
2509                 GOTO(stop, rc);
2510
2511         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2512                        ldata.ld_leh->leh_len);
2513         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2514                                   XATTR_NAME_LINK, 0, th);
2515         if (rc != 0)
2516                 GOTO(stop, rc);
2517
2518         rc = dt_trans_start_local(env, dev, th);
2519         if (rc != 0)
2520                 GOTO(stop, rc);
2521
2522         dt_write_lock(env, pobj, 0);
2523         rc = dt_create(env, pobj, la, NULL, dof, th);
2524         if (rc == 0)
2525                 rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid,
2526                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2527         dt_write_unlock(env, pobj);
2528         if (rc < 0)
2529                 GOTO(stop, rc);
2530
2531         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2532                        (const struct dt_key *)name, th);
2533         if (rc != 0)
2534                 GOTO(stop, rc);
2535
2536         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2537         if (rc == 0 && cobj != NULL) {
2538                 dt_trans_stop(env, dev, th);
2539                 th = NULL;
2540
2541                 /* The 2nd transaction. */
2542                 rc = __lfsck_layout_update_pfid(env, com, cobj, pfid,
2543                                                 &rec->lor_layout, ea_off,
2544                                                 rec->lor_layout_version,
2545                                                 rec->lor_range);
2546         }
2547
2548         GOTO(stop, rc);
2549
2550 stop:
2551         if (th != NULL)
2552                 dt_trans_stop(env, dev, th);
2553
2554 unlock:
2555         lfsck_unlock(llh);
2556
2557 log:
2558         if (cobj != NULL && !IS_ERR(cobj))
2559                 lfsck_object_put(env, cobj);
2560         if (pobj != NULL && !IS_ERR(pobj))
2561                 lfsck_object_put(env, pobj);
2562
2563         if (rc < 0)
2564                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to "
2565                        "recreate the lost MDT-object: parent "DFID
2566                        ", child "DFID", OST-index %u, stripe-index %u, "
2567                        "infix %s, type %s: rc = %d\n",
2568                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2569                        ltd->ltd_index, ea_off, infix, type, rc);
2570
2571         return rc >= 0 ? 1 : rc;
2572 }
2573
2574 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2575                                                    struct lfsck_component *com,
2576                                                    const struct lu_fid *fid,
2577                                                    __u32 index)
2578 {
2579         struct lfsck_thread_info *info  = lfsck_env_info(env);
2580         struct lfsck_request     *lr    = &info->lti_lr;
2581         struct lfsck_instance    *lfsck = com->lc_lfsck;
2582         struct lfsck_tgt_desc    *ltd;
2583         struct ptlrpc_request    *req;
2584         struct lfsck_request     *tmp;
2585         struct obd_export        *exp;
2586         int                       rc    = 0;
2587         ENTRY;
2588
2589         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2590         if (unlikely(ltd == NULL))
2591                 RETURN(-ENXIO);
2592
2593         exp = ltd->ltd_exp;
2594         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2595                 GOTO(put, rc = -EOPNOTSUPP);
2596
2597         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2598         if (req == NULL)
2599                 GOTO(put, rc = -ENOMEM);
2600
2601         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2602         if (rc != 0) {
2603                 ptlrpc_request_free(req);
2604
2605                 GOTO(put, rc);
2606         }
2607
2608         memset(lr, 0, sizeof(*lr));
2609         lr->lr_event = LE_CONDITIONAL_DESTROY;
2610         lr->lr_active = LFSCK_TYPE_LAYOUT;
2611         lr->lr_fid = *fid;
2612
2613         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2614         *tmp = *lr;
2615         ptlrpc_request_set_replen(req);
2616
2617         rc = ptlrpc_queue_wait(req);
2618         ptlrpc_req_finished(req);
2619
2620         GOTO(put, rc);
2621
2622 put:
2623         lfsck_tgt_put(ltd);
2624
2625         return rc;
2626 }
2627
2628 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2629                                                   struct lfsck_component *com,
2630                                                   struct lfsck_request *lr)
2631 {
2632         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2633         struct lu_attr                  *la     = &info->lti_la;
2634         union ldlm_policy_data          *policy = &info->lti_policy;
2635         struct ldlm_res_id              *resid  = &info->lti_resid;
2636         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2637         struct dt_device                *dev    = lfsck->li_bottom;
2638         struct lu_fid                   *fid    = &lr->lr_fid;
2639         struct dt_object                *obj;
2640         struct thandle                  *th     = NULL;
2641         struct lustre_handle             lh     = { 0 };
2642         __u64                            flags  = 0;
2643         int                              rc     = 0;
2644         ENTRY;
2645
2646         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
2647                 RETURN(0);
2648
2649         obj = lfsck_object_find_by_dev(env, dev, fid);
2650         if (IS_ERR(obj))
2651                 RETURN(PTR_ERR(obj));
2652
2653         dt_read_lock(env, obj, 0);
2654         if (dt_object_exists(obj) == 0 ||
2655             lfsck_is_dead_obj(obj)) {
2656                 dt_read_unlock(env, obj);
2657
2658                 GOTO(put, rc = -ENOENT);
2659         }
2660
2661         /* Get obj's attr without lock firstly. */
2662         rc = dt_attr_get(env, obj, la);
2663         dt_read_unlock(env, obj);
2664         if (rc != 0)
2665                 GOTO(put, rc);
2666
2667         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2668                 GOTO(put, rc = -ETXTBSY);
2669
2670         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2671         LASSERT(lfsck->li_namespace != NULL);
2672
2673         memset(policy, 0, sizeof(*policy));
2674         policy->l_extent.end = OBD_OBJECT_EOF;
2675         ost_fid_build_resid(fid, resid);
2676         rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid,
2677                                     LDLM_EXTENT, policy, LCK_EX, &flags,
2678                                     ldlm_blocking_ast, ldlm_completion_ast,
2679                                     NULL, NULL, 0, LVB_T_NONE, NULL, &lh);
2680         if (rc != ELDLM_OK)
2681                 GOTO(put, rc = -EIO);
2682
2683         dt_write_lock(env, obj, 0);
2684         /* Get obj's attr within lock again. */
2685         rc = dt_attr_get(env, obj, la);
2686         if (rc != 0)
2687                 GOTO(unlock, rc);
2688
2689         if (la->la_ctime != 0)
2690                 GOTO(unlock, rc = -ETXTBSY);
2691
2692         th = lfsck_trans_create(env, dev, lfsck);
2693         if (IS_ERR(th))
2694                 GOTO(unlock, rc = PTR_ERR(th));
2695
2696         rc = dt_declare_ref_del(env, obj, th);
2697         if (rc != 0)
2698                 GOTO(stop, rc);
2699
2700         rc = dt_declare_destroy(env, obj, th);
2701         if (rc != 0)
2702                 GOTO(stop, rc);
2703
2704         rc = dt_trans_start_local(env, dev, th);
2705         if (rc != 0)
2706                 GOTO(stop, rc);
2707
2708         rc = dt_ref_del(env, obj, th);
2709         if (rc != 0)
2710                 GOTO(stop, rc);
2711
2712         rc = dt_destroy(env, obj, th);
2713         if (rc == 0)
2714                 CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty "
2715                        "OST-object "DFID" that was created for reparing "
2716                        "dangling referenced case. But the original missing "
2717                        "OST-object is found now.\n",
2718                        lfsck_lfsck2name(lfsck), PFID(fid));
2719
2720         GOTO(stop, rc);
2721
2722 stop:
2723         dt_trans_stop(env, dev, th);
2724
2725 unlock:
2726         dt_write_unlock(env, obj);
2727         ldlm_lock_decref(&lh, LCK_EX);
2728
2729 put:
2730         lfsck_object_put(env, obj);
2731
2732         return rc;
2733 }
2734
2735 /**
2736  * Some OST-object has occupied the specified layout EA slot.
2737  * Such OST-object may be generated by the LFSCK when repair
2738  * dangling referenced MDT-object, which can be indicated by
2739  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2740  * is true and such OST-object has not been modified yet, we
2741  * will replace it with the orphan OST-object; otherwise the
2742  * LFSCK will create new MDT-object to reference the orphan.
2743  *
2744  * \retval       +1: repaired
2745  * \retval        0: did nothing
2746  * \retval      -ve: on error
2747  */
2748 static int lfsck_layout_conflict_create(const struct lu_env *env,
2749                                         struct lfsck_component *com,
2750                                         struct lfsck_tgt_desc *ltd,
2751                                         struct lu_orphan_rec_v3 *rec,
2752                                         struct dt_object *parent,
2753                                         struct lu_fid *cfid,
2754                                         struct lu_buf *ea_buf,
2755                                         struct lov_mds_md_v1 *lmm,
2756                                         struct lov_ost_data_v1 *slot,
2757                                         __u32 ea_off, int lovea_size)
2758 {
2759         struct lfsck_thread_info *info          = lfsck_env_info(env);
2760         struct lu_fid            *cfid2         = &info->lti_fid2;
2761         struct ost_id            *oi            = &info->lti_oi;
2762         struct dt_device         *dev           = lfsck_obj2dev(parent);
2763         struct thandle           *th            = NULL;
2764         struct lustre_handle      lh            = { 0 };
2765         __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
2766         int                       rc            = 0;
2767         ENTRY;
2768
2769         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2770                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2771                         RETURN(0);
2772         }
2773
2774         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2775         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2776         if (rc != 0)
2777                 GOTO(out, rc);
2778
2779         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2780                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2781                               LCK_EX);
2782         if (rc != 0)
2783                 GOTO(out, rc);
2784
2785         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2786
2787         /* If the conflict OST-obejct is not created for fixing dangling
2788          * referenced MDT-object in former LFSCK check/repair, or it has
2789          * been modified by others, then we cannot destroy it. Re-create
2790          * a new MDT-object for the orphan OST-object. */
2791         if (rc == -ETXTBSY) {
2792                 /* No need the layout lock on the original parent. */
2793                 lfsck_ibits_unlock(&lh, LCK_EX);
2794
2795                 fid_zero(&rec->lor_rec.lor_fid);
2796                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2797                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2798                          ea_off);
2799                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2800                                                 info->lti_tmpbuf, "C", ea_off);
2801
2802                 RETURN(rc);
2803         }
2804
2805         if (rc != 0 && rc != -ENOENT)
2806                 GOTO(unlock, rc);
2807
2808         if (lfsck_is_dryrun(com->lc_lfsck))
2809                 GOTO(unlock, rc = 0);
2810
2811         th = lfsck_trans_create(env, dev, com->lc_lfsck);
2812         if (IS_ERR(th))
2813                 GOTO(unlock, rc = PTR_ERR(th));
2814
2815         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2816                                   LU_XATTR_REPLACE, th);
2817         if (rc != 0)
2818                 GOTO(stop, rc);
2819
2820         rc = dt_trans_start_local(env, dev, th);
2821         if (rc != 0)
2822                 GOTO(stop, rc);
2823
2824         dt_write_lock(env, parent, 0);
2825         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2826         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2827                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2828                                        ltd->ltd_index, lovea_size);
2829         dt_write_unlock(env, parent);
2830
2831         GOTO(stop, rc);
2832
2833 stop:
2834         dt_trans_stop(env, dev, th);
2835
2836 unlock:
2837         lfsck_ibits_unlock(&lh, LCK_EX);
2838
2839 out:
2840         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict "
2841                "OST-object "DFID" on the OST %x with the orphan "DFID" on "
2842                "the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2843                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2844                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2845                ea_off, rc);
2846
2847         return rc >= 0 ? 1 : rc;
2848 }
2849
2850 /**
2851  * \retval       +1: repaired
2852  * \retval        0: did nothing
2853  * \retval      -ve: on error
2854  */
2855 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2856                                        struct lfsck_component *com,
2857                                        struct lfsck_tgt_desc *ltd,
2858                                        struct lu_orphan_rec_v3 *rec,
2859                                        struct dt_object *parent,
2860                                        struct lu_fid *cfid,
2861                                        __u32 ost_idx, __u32 ea_off)
2862 {
2863         struct lfsck_thread_info *info          = lfsck_env_info(env);
2864         struct lu_buf            *buf           = &info->lti_big_buf;
2865         struct lu_fid            *fid           = &info->lti_fid2;
2866         struct ost_id            *oi            = &info->lti_oi;
2867         struct lfsck_instance    *lfsck         = com->lc_lfsck;
2868         struct dt_device         *dt            = lfsck_obj2dev(parent);
2869         struct lfsck_bookmark    *bk            = &lfsck->li_bookmark_ram;
2870         struct ost_layout        *ol            = &rec->lor_layout;
2871         struct lov_comp_md_v1    *lcm           = NULL;
2872         struct lov_comp_md_entry_v1 *lcme       = NULL;
2873         struct thandle           *handle        = NULL;
2874         size_t                    lovea_size;
2875         struct lov_mds_md_v1     *lmm;
2876         struct lov_ost_data_v1   *objs;
2877         struct lustre_handle      lh            = { 0 };
2878         __u32                     magic;
2879         __u32 flags = 0;
2880         int                       fl            = 0;
2881         int                       rc            = 0;
2882         int                       rc1;
2883         int                       i;
2884         int pos = 0;
2885         __u16 count;
2886         bool locked = false;
2887         bool new_mirror = true;
2888         ENTRY;
2889
2890         if (lfsck_is_dryrun(lfsck))
2891                 RETURN(0);
2892
2893         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2894                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2895                               LCK_EX);
2896         if (rc != 0) {
2897                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate "
2898                        "LOV EA for "DFID": parent "DFID", OST-index %u, "
2899                        "stripe-index %u, comp_id %u, comp_start %llu, "
2900                        "comp_end %llu, layout version %u, range %u: rc = %d\n",
2901                        lfsck_lfsck2name(lfsck), PFID(cfid),
2902                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2903                        ol->ol_comp_id, ol->ol_comp_start,
2904                        ol->ol_comp_end, rec->lor_layout_version,
2905                        rec->lor_range, rc);
2906
2907                 RETURN(rc);
2908         }
2909
2910 again:
2911         if (locked) {
2912                 dt_write_unlock(env, parent);
2913                 locked = false;
2914         }
2915
2916         if (handle != NULL) {
2917                 dt_trans_stop(env, dt, handle);
2918                 handle = NULL;
2919         }
2920
2921         if (rc < 0)
2922                 GOTO(unlock_layout, rc);
2923
2924         lovea_size = rc;
2925         if (buf->lb_len < lovea_size) {
2926                 lu_buf_realloc(buf, lovea_size);
2927                 if (buf->lb_buf == NULL)
2928                         GOTO(unlock_layout, rc = -ENOMEM);
2929         }
2930
2931         if (!(bk->lb_param & LPF_DRYRUN)) {
2932                 handle = lfsck_trans_create(env, dt, lfsck);
2933                 if (IS_ERR(handle))
2934                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2935
2936                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2937                                           fl, handle);
2938                 if (rc != 0)
2939                         GOTO(stop, rc);
2940
2941                 rc = dt_trans_start_local(env, dt, handle);
2942                 if (rc != 0)
2943                         GOTO(stop, rc);
2944         }
2945
2946         dt_write_lock(env, parent, 0);
2947         locked = true;
2948         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2949         if (rc == -ERANGE) {
2950                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2951                 LASSERT(rc != 0);
2952                 goto again;
2953         } else if (rc == -ENODATA || rc == 0) {
2954                 lovea_size = lfsck_lovea_size(ol, ea_off);
2955                 /* If the declared is not big enough, re-try. */
2956                 if (buf->lb_len < lovea_size) {
2957                         rc = lovea_size;
2958                         goto again;
2959                 }
2960                 fl = LU_XATTR_CREATE;
2961         } else if (rc < 0) {
2962                 GOTO(unlock_parent, rc);
2963         } else if (unlikely(buf->lb_len == 0)) {
2964                 goto again;
2965         } else {
2966                 fl = LU_XATTR_REPLACE;
2967                 lovea_size = rc;
2968         }
2969
2970         if (fl == LU_XATTR_CREATE) {
2971                 if (bk->lb_param & LPF_DRYRUN)
2972                         GOTO(unlock_parent, rc = 1);
2973
2974                 LASSERT(buf->lb_len >= lovea_size);
2975
2976                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2977                                                cfid, buf, fl, ost_idx, ea_off);
2978
2979                 GOTO(unlock_parent, rc);
2980         }
2981
2982         lmm = buf->lb_buf;
2983         rc1 = lfsck_layout_verify_header(parent, lmm, lovea_size);
2984
2985         /* If the LOV EA crashed, the rebuild it. */
2986         if (rc1 == -EINVAL) {
2987                 if (bk->lb_param & LPF_DRYRUN)
2988                         GOTO(unlock_parent, rc = 1);
2989
2990                 LASSERT(buf->lb_len >= lovea_size);
2991
2992                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2993                                                cfid, buf, fl, ost_idx, ea_off);
2994
2995                 GOTO(unlock_parent, rc);
2996         }
2997
2998         /* For other unknown magic/pattern, keep the current LOV EA. */
2999         if (rc1 == -EOPNOTSUPP)
3000                 GOTO(unlock_parent, rc1 = 0);
3001
3002         if (rc1)
3003                 GOTO(unlock_parent, rc = rc1);
3004
3005         magic = le32_to_cpu(lmm->lmm_magic);
3006         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3007                 __u64 start;
3008                 __u64 end;
3009                 __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id);
3010                 __u16 mirror_id1;
3011
3012                 if (bk->lb_param & LPF_DRYRUN)
3013                         GOTO(unlock_parent, rc = 1);
3014
3015                 lcm = buf->lb_buf;
3016                 count = le16_to_cpu(lcm->lcm_entry_count);
3017                 for (i = 0; i < count; pos = ++i) {
3018                         lcme = &lcm->lcm_entries[i];
3019                         start = le64_to_cpu(lcme->lcme_extent.e_start);
3020                         end = le64_to_cpu(lcme->lcme_extent.e_end);
3021                         mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id));
3022
3023                         if (mirror_id0 > mirror_id1)
3024                                 continue;
3025
3026                         if (mirror_id0 < mirror_id1)
3027                                 break;
3028
3029                         new_mirror = false;
3030                         if (end <= ol->ol_comp_start)
3031                                 continue;
3032
3033                         if (start >= ol->ol_comp_end)
3034                                 break;
3035
3036                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
3037                         magic = le32_to_cpu(lmm->lmm_magic);
3038                         flags = le32_to_cpu(lcme->lcme_flags);
3039                         goto further;
3040                 }
3041
3042                 rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent,
3043                                 cfid, buf, ost_idx, ea_off, pos, new_mirror);
3044
3045                 GOTO(unlock_parent, rc);
3046         }
3047
3048 further:
3049         count = le16_to_cpu(lmm->lmm_stripe_count);
3050         if (count == 0)
3051                 GOTO(unlock_parent, rc = -EINVAL);
3052         LASSERT(count > 0);
3053
3054         /* Exceed the current end of MDT-object layout EA. Then extend it. */
3055         if (count <= ea_off) {
3056                 if (bk->lb_param & LPF_DRYRUN)
3057                         GOTO(unlock_parent, rc = 1);
3058
3059                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
3060                 /* If the declared is not big enough, re-try. */
3061                 if (buf->lb_len < lovea_size) {
3062                         rc = lovea_size;
3063                         goto again;
3064                 }
3065
3066                 if (lcm) {
3067                         LASSERT(lcme);
3068
3069                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
3070                         lfsck_layout_update_lcm(lcm, lcme,
3071                                                 rec->lor_layout_version,
3072                                                 rec->lor_range);
3073                 }
3074
3075                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
3076                                         parent, cfid, buf, ost_idx, ea_off);
3077
3078                 GOTO(unlock_parent, rc);
3079         }
3080
3081         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
3082
3083         if (magic == LOV_MAGIC_V1) {
3084                 objs = &lmm->lmm_objects[0];
3085         } else {
3086                 LASSERT(magic == LOV_MAGIC_V3);
3087                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
3088         }
3089
3090         for (i = 0; i < count; i++, objs++) {
3091                 /* The MDT-object was created via lfsck_layout_recover_create()
3092                  * by others before, and we fill the dummy layout EA. */
3093                 if ((lcme && !(flags & LCME_FL_INIT)) ||
3094                      lovea_slot_is_dummy(objs)) {
3095                         if (i != ea_off)
3096                                 continue;
3097
3098                         if (bk->lb_param & LPF_DRYRUN)
3099                                 GOTO(unlock_parent, rc = 1);
3100
3101                         lmm->lmm_layout_gen =
3102                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3103                         if (lcme) {
3104                                 LASSERT(lcm);
3105
3106                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
3107                                         ol->ol_stripe_size ||
3108                                     le16_to_cpu(lmm->lmm_stripe_count) !=
3109                                         ol->ol_stripe_count ||
3110                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
3111                                         ol->ol_comp_start ||
3112                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
3113                                         ol->ol_comp_end) {
3114                                         CDEBUG(D_LFSCK, "%s: found invalid "
3115                                         "component for "DFID ": parent "DFID
3116                                         ", stripe-index %u, stripe_size %u, "
3117                                         "stripe_count %u, comp_id %u, "
3118                                         "comp_start %llu, comp_end %llu, "
3119                                         "cur_stripe_size %u, "
3120                                         "cur_stripe_count %u, "
3121                                         "cur_comp_start %llu, "
3122                                         "cur_comp_end %llu\n",
3123                                         lfsck_lfsck2name(lfsck), PFID(cfid),
3124                                         PFID(lfsck_dto2fid(parent)), ea_off,
3125                                         ol->ol_stripe_size,
3126                                         ol->ol_stripe_count, ol->ol_comp_id,
3127                                         ol->ol_comp_start, ol->ol_comp_end,
3128                                         le32_to_cpu(lmm->lmm_stripe_size),
3129                                         le16_to_cpu(lmm->lmm_stripe_count),
3130                                         le64_to_cpu(lcme->lcme_extent.e_start),
3131                                         le64_to_cpu(lcme->lcme_extent.e_end));
3132
3133                                         GOTO(unlock_parent, rc = -EINVAL);
3134                                 }
3135
3136                                 lovea_size = le32_to_cpu(lcm->lcm_size);
3137                                 lcme->lcme_flags = cpu_to_le32(flags |
3138                                                                LCME_FL_INIT);
3139                                 lfsck_layout_update_lcm(lcm, lcme,
3140                                                         rec->lor_layout_version,
3141                                                         rec->lor_range);
3142                         }
3143
3144                         LASSERTF(buf->lb_len >= lovea_size,
3145                                  "buffer len %d is less than real size %d\n",
3146                                  (int)buf->lb_len, (int)lovea_size);
3147
3148                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
3149                                                 parent, cfid, buf, lmm, objs,
3150                                                 fl, ost_idx, lovea_size);
3151
3152                         CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill "
3153                                "dummy layout slot for "DFID": parent "DFID
3154                                ", OST-index %u, stripe-index %u: rc = %d\n",
3155                                lfsck_lfsck2name(lfsck), PFID(cfid),
3156                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
3157
3158                         GOTO(unlock_parent, rc);
3159                 }
3160
3161                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3162                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
3163                 if (rc != 0) {
3164                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
3165                                "invalid layout EA at the slot %d, index %u\n",
3166                                lfsck_lfsck2name(lfsck),
3167                                PFID(lfsck_dto2fid(parent)), i,
3168                                le32_to_cpu(objs->l_ost_idx));
3169
3170                         GOTO(unlock_parent, rc);
3171                 }
3172
3173                 /* It should be rare case, the slot is there, but the LFSCK
3174                  * does not handle it during the first-phase cycle scanning. */
3175                 if (unlikely(lu_fid_eq(fid, cfid))) {
3176                         if (i == ea_off) {
3177                                 GOTO(unlock_parent, rc = 0);
3178                         } else {
3179                                 /* Rare case that the OST-object index
3180                                  * does not match the parent MDT-object
3181                                  * layout EA. We trust the later one. */
3182                                 if (bk->lb_param & LPF_DRYRUN)
3183                                         GOTO(unlock_parent, rc = 1);
3184
3185                                 dt_write_unlock(env, parent);
3186                                 if (handle != NULL)
3187                                         dt_trans_stop(env, dt, handle);
3188                                 lfsck_ibits_unlock(&lh, LCK_EX);
3189                                 rc = lfsck_layout_update_pfid(env, com, parent,
3190                                                         cfid, ltd->ltd_tgt,
3191                                                         rec, i);
3192
3193                                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant "
3194                                        "updated OST-object's pfid for "DFID
3195                                        ": parent "DFID", OST-index %u, "
3196                                        "stripe-index %u: rc = %d\n",
3197                                        lfsck_lfsck2name(lfsck), PFID(cfid),
3198                                        PFID(lfsck_dto2fid(parent)),
3199                                        ltd->ltd_index, i, rc);
3200
3201                                 RETURN(rc);
3202                         }
3203                 }
3204         }
3205
3206         /* The MDT-object exists, but related layout EA slot is occupied
3207          * by others. */
3208         if (bk->lb_param & LPF_DRYRUN)
3209                 GOTO(unlock_parent, rc = 1);
3210
3211         dt_write_unlock(env, parent);
3212         if (handle != NULL)
3213                 dt_trans_stop(env, dt, handle);
3214         lfsck_ibits_unlock(&lh, LCK_EX);
3215         if (magic == LOV_MAGIC_V1)
3216                 objs = &lmm->lmm_objects[ea_off];
3217         else
3218                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3219         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3220                                           buf, lmm, objs, ea_off, lovea_size);
3221
3222         RETURN(rc);
3223
3224 unlock_parent:
3225         if (locked)
3226                 dt_write_unlock(env, parent);
3227
3228 stop:
3229         if (handle != NULL)
3230                 dt_trans_stop(env, dt, handle);
3231
3232 unlock_layout:
3233         lfsck_ibits_unlock(&lh, LCK_EX);
3234
3235         return rc;
3236 }
3237
3238 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3239                                         struct lfsck_component *com,
3240                                         struct lfsck_tgt_desc *ltd,
3241                                         struct lu_orphan_rec_v3 *rec,
3242                                         struct lu_fid *cfid)
3243 {
3244         struct lfsck_layout     *lo     = com->lc_file_ram;
3245         struct lu_fid           *pfid   = &rec->lor_rec.lor_fid;
3246         struct dt_object        *parent = NULL;
3247         __u32                    ea_off = pfid->f_stripe_idx;
3248         int                      rc     = 0;
3249         ENTRY;
3250
3251         if (!fid_is_sane(cfid))
3252                 GOTO(out, rc = -EINVAL);
3253
3254         pfid->f_ver = 0;
3255         if (fid_is_zero(pfid)) {
3256                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3257                                                   "", "N", ea_off);
3258                 GOTO(out, rc);
3259         }
3260
3261         if (!fid_is_sane(pfid))
3262                 GOTO(out, rc = -EINVAL);
3263
3264         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3265         if (IS_ERR(parent))
3266                 GOTO(out, rc = PTR_ERR(parent));
3267
3268         if (unlikely(dt_object_remote(parent) != 0))
3269                 GOTO(put, rc = -EXDEV);
3270
3271         if (dt_object_exists(parent) == 0) {
3272                 lfsck_object_put(env, parent);
3273                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3274                                                   "", "R", ea_off);
3275                 GOTO(out, rc);
3276         }
3277
3278         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3279                 GOTO(put, rc = -EISDIR);
3280
3281         /* The orphan OST-object claims to be the parent's stripe, then
3282          * related dangling record in the trace file is meaningless. */
3283         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3284                                            rec->lor_layout.ol_comp_id, ea_off);
3285         if (rc && rc != -ENOENT)
3286                 GOTO(put, rc);
3287
3288         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3289                                          ltd->ltd_index, ea_off);
3290
3291         GOTO(put, rc);
3292
3293 put:
3294         if (rc <= 0)
3295                 lfsck_object_put(env, parent);
3296         else
3297                 /* The layout EA is changed, need to be reloaded next time. */
3298                 dt_object_put_nocache(env, parent);
3299
3300 out:
3301         down_write(&com->lc_sem);
3302         com->lc_new_scanned++;
3303         com->lc_new_checked++;
3304         if (rc > 0) {
3305                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3306                 rc = 0;
3307         } else if (rc < 0) {
3308                 lo->ll_objs_failed_phase2++;
3309         }
3310         up_write(&com->lc_sem);
3311
3312         return rc;
3313 }
3314
3315 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3316                                     struct lfsck_component *com,
3317                                     struct lfsck_tgt_desc *ltd)
3318 {
3319         struct lfsck_assistant_data     *lad    = com->lc_data;
3320         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3321         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
3322         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3323         struct lu_fid                   *fid    = &info->lti_fid;
3324         struct dt_object                *obj;
3325         const struct dt_it_ops          *iops;
3326         struct dt_it                    *di;
3327         int                              rc     = 0;
3328         ENTRY;
3329
3330         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan "
3331                "scanning for OST%04x\n",
3332                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3333
3334         if (test_bit(ltd->ltd_index, lad->lad_bitmap)) {
3335                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan "
3336                        "scanning for OST%04x\n",
3337                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3338
3339                 RETURN(0);
3340         }
3341
3342         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3343         fid->f_oid = fid->f_ver = 0;
3344
3345         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3346         if (unlikely(IS_ERR(obj)))
3347                 GOTO(log, rc = PTR_ERR(obj));
3348
3349         rc = obj->do_ops->do_index_try(env, obj,
3350                                        &dt_lfsck_layout_orphan_features);
3351         if (rc != 0)
3352                 GOTO(put, rc);
3353
3354         iops = &obj->do_index_ops->dio_it;
3355         di = iops->init(env, obj, 0);
3356         if (IS_ERR(di))
3357                 GOTO(put, rc = PTR_ERR(di));
3358
3359         rc = iops->load(env, di, 0);
3360         if (rc == -ESRCH) {
3361                 /* -ESRCH means that the orphan OST-objects rbtree has been
3362                  * cleanup because of the OSS server restart or other errors. */
3363                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3364                 GOTO(fini, rc);
3365         }
3366
3367         if (rc == 0)
3368                 rc = iops->next(env, di);
3369         else if (rc > 0)
3370                 rc = 0;
3371
3372         if (rc < 0)
3373                 GOTO(fini, rc);
3374
3375         if (rc > 0)
3376                 GOTO(fini, rc = 0);
3377
3378         do {
3379                 struct dt_key           *key;
3380                 struct lu_orphan_rec_v3 *rec = &info->lti_rec;
3381
3382                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3383                     unlikely(!thread_is_running(&lfsck->li_thread)))
3384                         break;
3385
3386                 key = iops->key(env, di);
3387                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3388                 /* Remote target OST may be runnning old LFSCK */
3389                 memset(rec, 0, sizeof(*rec));
3390                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3391                 if (rc == 0)
3392                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3393                                         &com->lc_fid_latest_scanned_phase2);
3394                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3395                         GOTO(fini, rc);
3396
3397                 lfsck_control_speed_by_self(com);
3398                 do {
3399                         rc = iops->next(env, di);
3400                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3401         } while (rc == 0);
3402
3403         GOTO(fini, rc);
3404
3405 fini:
3406         iops->put(env, di);
3407         iops->fini(env, di);
3408 put:
3409         lfsck_object_put(env, obj);
3410
3411 log:
3412         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan "
3413                "scanning for OST%04x: rc = %d\n",
3414                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3415
3416         return rc > 0 ? 0 : rc;
3417 }
3418
3419 static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff,
3420                             __u32 comp_id)
3421 {
3422         struct ost_layout *ol = &ff->ff_layout;
3423         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3424         int rc = 0;
3425         ENTRY;
3426
3427         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3428                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3429                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3430                 ol->ol_comp_start = 0;
3431                 ol->ol_comp_end = 0;
3432                 ol->ol_comp_id = 0;
3433                 ff->ff_layout_version = 0;
3434                 ff->ff_range = 0;
3435         } else if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3436                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3437                 struct lov_comp_md_entry_v1 *lcme = NULL;
3438                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3439                 int i;
3440
3441                 for (i = 0; i < count; i++) {
3442                         lcme = &lcm->lcm_entries[i];
3443                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3444                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3445                                         LCME_FL_INIT);
3446
3447                                 break;
3448                         }
3449                 }
3450
3451                 /* The comp has been removed, do nothing. */
3452                 if (i == count)
3453                         GOTO(out, rc = 1);
3454
3455                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3456                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3457                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3458                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3459                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3460                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3461                 ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen);
3462                 ff->ff_range = 0;
3463         } else {
3464                 GOTO(out, rc = -EINVAL);
3465         }
3466
3467         EXIT;
3468
3469 out:
3470         return rc;
3471 }
3472
3473 /**
3474  * Repair the MDT-object with dangling LOV EA reference.
3475  *
3476  * we need to repair the inconsistency according to the users' requirement:
3477  *
3478  * 1) Keep the inconsistency there and report the inconsistency case,
3479  *    then give the chance to the application to find related issues,
3480  *    and the users can make the decision about how to handle it with
3481  *    more human knownledge. (by default)
3482  *
3483  * 2) Re-create the missing OST-object with the FID/owner information.
3484  *
3485  * \param[in] env       pointer to the thread context
3486  * \param[in] com       the layout LFSCK component
3487  * \param[in] parent    the MDT-object with dangling LOV EA reference
3488  * \param[in] child     the OST-object to be created
3489  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3490  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3491  * \param[in] ost_idx   the index of OST on which the OST-object resides
3492  *
3493  * \retval              +1 for repair successfully
3494  * \retval              0 for did nothing
3495  * \retval              negative error number on failure
3496  */
3497 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3498                                           struct lfsck_component *com,
3499                                           struct dt_object *parent,
3500                                           struct dt_object *child,
3501                                           __u32 comp_id, __u32 ea_off,
3502                                           __u32 ost_idx, bool log)
3503 {
3504         struct lfsck_thread_info *info = lfsck_env_info(env);
3505         struct filter_fid *ff = &info->lti_ff;
3506         struct dt_object_format *dof = &info->lti_dof;
3507         struct lu_attr *la = &info->lti_la;
3508         struct lfsck_instance *lfsck = com->lc_lfsck;
3509         struct dt_device *dev = lfsck_obj2dev(child);
3510         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3511         const struct lu_fid *cfid = lfsck_dto2fid(child);
3512         struct lu_buf *tbuf = &info->lti_big_buf;
3513         struct thandle *handle;
3514         struct lu_buf *buf;
3515         struct lustre_handle lh = { 0 };
3516         int rc;
3517         ENTRY;
3518
3519         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3520                 GOTO(log, rc = 1);
3521
3522         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3523                 GOTO(log, rc = 1);
3524
3525         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3526                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3527                               LCK_EX);
3528         if (rc != 0)
3529                 GOTO(log, rc);
3530
3531         rc = dt_attr_get(env, parent, la);
3532         if (rc != 0)
3533                 GOTO(unlock1, rc);
3534
3535         la->la_mode = S_IFREG | 0666;
3536         la->la_atime = la->la_mtime = la->la_ctime = 0;
3537         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3538                        LA_ATIME | LA_MTIME | LA_CTIME;
3539         memset(dof, 0, sizeof(*dof));
3540         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3541         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3542         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3543          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3544          * parent MDT-object's layout EA. */
3545         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3546
3547         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3548         if (unlikely(rc == -ENODATA))
3549                 rc = 0;
3550         if (rc <= 0)
3551                 GOTO(unlock1, rc);
3552
3553         rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id);
3554         if (rc)
3555                 GOTO(unlock1, rc);
3556
3557         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3558         handle = lfsck_trans_create(env, dev, lfsck);
3559         if (IS_ERR(handle))
3560                 GOTO(unlock1, rc = PTR_ERR(handle));
3561
3562         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3563         if (rc != 0)
3564                 GOTO(stop, rc);
3565
3566         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3567                                   LU_XATTR_CREATE, handle);
3568         if (rc != 0)
3569                 GOTO(stop, rc);
3570
3571         rc = dt_trans_start_local(env, dev, handle);
3572         if (rc != 0)
3573                 GOTO(stop, rc);
3574
3575         dt_read_lock(env, parent, 0);
3576         if (unlikely(lfsck_is_dead_obj(parent)))
3577                 GOTO(unlock2, rc = 0);
3578
3579         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3580                 struct ost_id *oi = &info->lti_oi;
3581                 struct lu_fid *tfid = &info->lti_fid2;
3582                 struct lu_buf *lovea = &info->lti_big_buf;
3583                 struct lov_mds_md_v1 *lmm;
3584                 struct lov_ost_data_v1 *objs;
3585                 __u32 magic;
3586                 int count;
3587                 int idx2;
3588
3589                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3590                 if (unlikely(rc == -ENODATA))
3591                         rc = 0;
3592                 if (rc <= 0)
3593                         GOTO(unlock2, rc);
3594
3595                 lmm = lovea->lb_buf;
3596                 magic = le32_to_cpu(lmm->lmm_magic);
3597                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3598                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3599                         struct lov_comp_md_entry_v1 *lcme;
3600                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3601                         int i;
3602
3603                         for (i = 0; i < count; i++) {
3604                                 lcme = &lcm->lcm_entries[i];
3605                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3606                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3607                                                 LCME_FL_INIT);
3608
3609                                         lmm = lovea->lb_buf +
3610                                                 le32_to_cpu(lcme->lcme_offset);
3611                                         magic = le32_to_cpu(lmm->lmm_magic);
3612                                         goto check;
3613                                 }
3614                         }
3615
3616                         /* Someone removed the component, do nothing. */
3617                         GOTO(unlock2, rc = 0);
3618                 }
3619
3620 check:
3621                 count = le16_to_cpu(lmm->lmm_stripe_count);
3622                 /* Someone changed the LOV EA, do nothing. */
3623                 if (count <= ea_off)
3624                         GOTO(unlock2, rc = 0);
3625
3626                 if (magic == LOV_MAGIC_V1) {
3627                         objs = &lmm->lmm_objects[ea_off];
3628                 } else {
3629                         LASSERT(magic == LOV_MAGIC_V3);
3630
3631                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3632                                                         lmm_objects[ea_off];
3633                 }
3634
3635                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3636                 idx2 = le32_to_cpu(objs->l_ost_idx);
3637                 rc = ostid_to_fid(tfid, oi, idx2);
3638                 /* Someone changed the LOV EA, do nothing. */
3639                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3640                         GOTO(unlock2, rc);
3641         }
3642
3643         rc = dt_create(env, child, la, NULL, dof, handle);
3644         if (rc != 0)
3645                 GOTO(unlock2, rc);
3646
3647         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3648                           handle);
3649
3650         GOTO(unlock2, rc);
3651
3652 unlock2:
3653         dt_read_unlock(env, parent);
3654
3655 stop:
3656         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3657
3658 unlock1:
3659         lfsck_ibits_unlock(&lh, LCK_EX);
3660
3661 log:
3662         if (rc && log)
3663                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3664                        "dangling reference for: parent "DFID", child "
3665                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: "
3666                        "rc = %d\n",
3667                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3668                        comp_id, ea_off, ost_idx,
3669                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3670                                 "Create the lost OST-object as required" :
3671                                 "Keep the MDT-object there by default", rc);
3672
3673         return rc;
3674 }
3675
3676 /**
3677  * Repair the MDT-object with dangling LOV EA reference.
3678  *
3679  * Prepare parameters and call __lfsck_layout_repair_dangling()
3680  * to repair the dangling LOV EA reference.
3681  *
3682  * \param[in] env       pointer to the thread context
3683  * \param[in] com       the layout LFSCK component
3684  * \param[in] pfid      the MDT-object's FID
3685  * \param[in] cfid      the FID for the OST-object to be created
3686  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3687  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3688  * \param[in] ost_idx   the index of OST on which the OST-object resides
3689  *
3690  * \retval              +1 for repair successfully
3691  * \retval              0 for did nothing
3692  * \retval              negative error number on failure
3693  */
3694 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3695                                         struct lfsck_component *com,
3696                                         const struct lu_fid *pfid,
3697                                         const struct lu_fid *cfid,
3698                                         __u32 comp_id, __u32 ea_off,
3699                                         __u32 ost_idx)
3700 {
3701         struct lfsck_instance *lfsck = com->lc_lfsck;
3702         struct dt_object *parent = NULL;
3703         struct dt_object *child = NULL;
3704         struct lfsck_tgt_desc *ltd;
3705         int rc;
3706         ENTRY;
3707
3708         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3709         if (IS_ERR(parent))
3710                 GOTO(log, rc = PTR_ERR(parent));
3711
3712         /* The MDT-object has been removed. */
3713         if (dt_object_exists(parent) == 0)
3714                 GOTO(log, rc = 0);
3715
3716         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3717         if (unlikely(ltd == NULL))
3718                 GOTO(log, rc = -ENODEV);
3719
3720         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3721         if (IS_ERR(child))
3722                 GOTO(log, rc = PTR_ERR(child));
3723
3724         /* The OST-object has been created. */
3725         if (unlikely(dt_object_exists(child) != 0))
3726                 GOTO(log, rc = 0);
3727
3728         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3729                                             comp_id, ea_off, ost_idx, false);
3730
3731         GOTO(log, rc);
3732
3733 log:
3734         if (child != NULL && !IS_ERR(child))
3735                 lfsck_object_put(env, child);
3736
3737         if (parent != NULL && !IS_ERR(parent))
3738                 lfsck_object_put(env, parent);
3739
3740         if (rc)
3741                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3742                        "dangling reference for: parent "DFID", child "
3743                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3744                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3745                        comp_id, ea_off, ost_idx,
3746                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3747                                 "Create the lost OST-object as required" :
3748                                 "Keep the MDT-object there by default", rc);
3749
3750         return rc;
3751 }
3752
3753 /* If the OST-object does not recognize the MDT-object as its parent, and
3754  * there is no other MDT-object claims as its parent, then just trust the
3755  * given MDT-object as its parent. So update the OST-object filter_fid. */
3756 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3757                                               struct lfsck_component *com,
3758                                               struct dt_object *parent,
3759                                               struct lfsck_layout_req *llr,
3760                                               struct lu_attr *la)
3761 {
3762         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3763         struct filter_fid               *ff     = &info->lti_ff;
3764         struct dt_object                *child  = llr->llr_child;
3765         struct dt_device                *dev    = lfsck_obj2dev(child);
3766         const struct lu_fid             *tfid   = lu_object_fid(&parent->do_lu);
3767         struct lu_buf                   *tbuf   = &info->lti_big_buf;
3768         struct thandle                  *handle;
3769         struct lu_buf                   *buf;
3770         struct lustre_handle             lh     = { 0 };
3771         int                              rc;
3772         ENTRY;
3773
3774         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3775                 GOTO(log, rc = 0);
3776
3777         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3778                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3779                               LCK_EX);
3780         if (rc != 0)
3781                 GOTO(log, rc);
3782
3783         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3784         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3785         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3786          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3787          * parent MDT-object's layout EA. */
3788         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3789
3790         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3791         if (unlikely(rc == -ENODATA))
3792                 rc = 0;
3793         if (rc <= 0)
3794                 GOTO(unlock1, rc);
3795
3796         rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id);
3797         if (rc)
3798                 GOTO(unlock1, rc);
3799
3800         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3801
3802         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
3803         if (IS_ERR(handle))
3804                 GOTO(unlock1, rc = PTR_ERR(handle));
3805
3806         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3807         if (rc != 0)
3808                 GOTO(stop, rc);
3809
3810         rc = dt_attr_get(env, parent, la);
3811         if (rc != 0)
3812                 GOTO(stop, rc);
3813
3814         la->la_valid = LA_UID | LA_GID;
3815         rc = dt_declare_attr_set(env, child, la, handle);
3816         if (rc != 0)
3817                 GOTO(stop, rc);
3818
3819         rc = dt_trans_start_local(env, dev, handle);
3820         if (rc != 0)
3821                 GOTO(stop, rc);
3822
3823         dt_write_lock(env, parent, 0);
3824         if (unlikely(lfsck_is_dead_obj(parent)))
3825                 GOTO(unlock2, rc = 1);
3826
3827         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3828         if (rc != 0)
3829                 GOTO(unlock2, rc);
3830
3831         /* Get the latest parent's owner. */
3832         rc = dt_attr_get(env, parent, la);
3833         if (rc != 0)
3834                 GOTO(unlock2, rc);
3835
3836         la->la_valid = LA_UID | LA_GID;
3837         rc = dt_attr_set(env, child, la, handle);
3838
3839         GOTO(unlock2, rc);
3840
3841 unlock2:
3842         dt_write_unlock(env, parent);
3843
3844 stop:
3845         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3846
3847 unlock1:
3848         lfsck_ibits_unlock(&lh, LCK_EX);
3849
3850 log:
3851         if (rc)
3852                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3853                        "unmatched MDT-OST pair for: parent "DFID
3854                        ", child "DFID", comp_id %u, OST-index %u, "
3855                        "stripe-index %u, owner %u/%u: rc = %d\n",
3856                        lfsck_lfsck2name(com->lc_lfsck),
3857                        PFID(lfsck_dto2fid(parent)),
3858                        PFID(lfsck_dto2fid(child)),
3859                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3860                        la->la_uid, la->la_gid, rc);
3861
3862         return rc;
3863 }
3864
3865 /* If there are more than one MDT-objects claim as the OST-object's parent,
3866  * and the OST-object only recognizes one of them, then we need to generate
3867  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). */
3868 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3869                                                    struct lfsck_component *com,
3870                                                    struct dt_object *parent,
3871                                                    struct lfsck_layout_req *llr,
3872                                                    struct lu_attr *la)
3873 {
3874         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3875         struct dt_allocation_hint       *hint   = &info->lti_hint;
3876         struct dt_object_format         *dof    = &info->lti_dof;
3877         struct ost_id                   *oi     = &info->lti_oi;
3878         struct lu_buf                   *buf    = &info->lti_big_buf;
3879         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3880         struct dt_device                *dev;
3881         struct lu_device                *d      =
3882                                 &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3883         struct lu_object                *o;
3884         struct lu_object                *n;
3885         struct dt_object                *child  = NULL;
3886         struct thandle                  *handle = NULL;
3887         struct lov_mds_md_v1            *lmm;
3888         struct lov_ost_data_v1          *objs;
3889         const struct lu_fid             *pfid   = lfsck_dto2fid(parent);
3890         struct lu_fid                    tfid;
3891         struct lustre_handle             lh     = { 0 };
3892         __u32                            magic;
3893         __u32                            index;
3894         int                              rc;
3895         ENTRY;
3896
3897         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3898                 RETURN(0);
3899
3900         /* We use two separated transactions to repair the inconsistency.
3901          *
3902          * 1) create the child (OST-object).
3903          * 2) update the parent LOV EA according to the child's FID.
3904          *
3905          * If 1) succeed, but 2) failed or aborted, then such OST-object will be
3906          * handled as orphan when the layout LFSCK run next time.
3907          *
3908          * If 1) failed, but 2) succeed, then such OST-object will be re-created
3909          * as dangling referened case when the layout LFSCK run next time. */
3910
3911         /* The 1st transaction. */
3912         o = lu_object_anon(env, d, NULL);
3913         if (IS_ERR(o))
3914                 GOTO(log, rc = PTR_ERR(o));
3915
3916         n = lu_object_locate(o->lo_header, d->ld_type);
3917         if (unlikely(n == NULL)) {
3918                 lu_object_put_nocache(env, o);
3919
3920                 GOTO(log, rc = -EINVAL);
3921         }
3922
3923         child = container_of(n, struct dt_object, do_lu);
3924         memset(hint, 0, sizeof(*hint));
3925         rc = dt_attr_get(env, parent, la);
3926         if (rc != 0)
3927                 GOTO(log, rc);
3928
3929         la->la_valid = LA_UID | LA_GID;
3930         memset(dof, 0, sizeof(*dof));
3931
3932         dev = lfsck_obj2dev(child);
3933         handle = lfsck_trans_create(env, dev, lfsck);
3934         if (IS_ERR(handle))
3935                 GOTO(log, rc = PTR_ERR(handle));
3936
3937         rc = dt_declare_create(env, child, la, hint, dof, handle);
3938         if (rc != 0)
3939                 GOTO(stop, rc);
3940
3941         rc = dt_trans_start_local(env, dev, handle);
3942         if (rc != 0)
3943                 GOTO(stop, rc);
3944
3945         rc = dt_create(env, child, la, hint, dof, handle);
3946         dt_trans_stop(env, dev, handle);
3947         handle = NULL;
3948         if (rc != 0)
3949                 GOTO(log, rc);
3950
3951         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3952                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3953                               LCK_EX);
3954         if (rc != 0)
3955                 GOTO(log, rc);
3956
3957         /* The 2nd transaction. */
3958
3959         /* XXX: Generally, we should use bottom device (OSD) to update parent
3960          *      LOV EA. But because the LOD-object still references the wrong
3961          *      OSP-object that should be detached after the parent's LOV EA
3962          *      refreshed. Unfortunately, there is no suitable API for that.
3963          *      So we have to make the LOD to re-load the OSP-object(s) via
3964          *      replacing the LOV EA against the LOD-object.
3965          *
3966          *      Once the DNE2 patches have been landed, we can replace the
3967          *      LOD device with the OSD device. LU-6230. */
3968
3969         dev = lfsck->li_next;
3970         parent = lfsck_object_locate(dev, parent);
3971         if (IS_ERR(parent))
3972                 GOTO(log, rc = PTR_ERR(parent));
3973
3974         handle = lfsck_trans_create(env, dev, lfsck);
3975         if (IS_ERR(handle))
3976                 GOTO(log, rc = PTR_ERR(handle));
3977
3978         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3979                                   LU_XATTR_REPLACE, handle);
3980         if (rc != 0)
3981                 GOTO(stop, rc);
3982
3983         rc = dt_trans_start_local(env, dev, handle);
3984         if (rc != 0)
3985                 GOTO(stop, rc);
3986
3987         dt_write_lock(env, parent, 0);
3988         if (unlikely(lfsck_is_dead_obj(parent)))
3989                 GOTO(unlock, rc = 0);
3990
3991         rc = lfsck_layout_get_lovea(env, parent, buf);
3992         if (unlikely(rc == -ENODATA))
3993                 rc = 0;
3994         if (rc <= 0)
3995                 GOTO(unlock, rc);
3996
3997         lmm = buf->lb_buf;
3998         magic = le32_to_cpu(lmm->lmm_magic);
3999         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
4000                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4001                 struct lov_comp_md_entry_v1 *lcme;
4002                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
4003                 int i;
4004
4005                 LASSERT(llr->llr_comp_id != 0);
4006
4007                 for (i = 0; i < count; i++) {
4008                         lcme = &lcm->lcm_entries[i];
4009                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
4010                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
4011                                         LCME_FL_INIT);
4012
4013                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
4014                                 lmm = buf->lb_buf +
4015                                         le32_to_cpu(lcme->lcme_offset);
4016                                 magic = le32_to_cpu(lmm->lmm_magic);
4017                                 goto set;
4018                         }
4019                 }
4020
4021                 GOTO(unlock, rc = 0);
4022         }
4023
4024 set:
4025         if (magic == LOV_MAGIC_V1) {
4026                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
4027         } else {
4028                 LASSERT(magic == LOV_MAGIC_V3);
4029                 objs =
4030                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
4031         }
4032
4033         ostid_le_to_cpu(&objs->l_ost_oi, oi);
4034         index = le32_to_cpu(objs->l_ost_idx);
4035         rc = ostid_to_fid(&tfid, oi, index);
4036         /* Someone changed layout during the LFSCK, no need to repair then. */
4037         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
4038                 GOTO(unlock, rc = 0);
4039
4040         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
4041         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
4042         ostid_cpu_to_le(oi, &objs->l_ost_oi);
4043         objs->l_ost_gen = cpu_to_le32(0);
4044         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
4045         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
4046                           LU_XATTR_REPLACE, handle);
4047
4048         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
4049
4050 unlock:
4051         dt_write_unlock(env, parent);
4052
4053 stop:
4054         if (handle != NULL)
4055                 dt_trans_stop(env, dev, handle);
4056
4057 log:
4058         lfsck_ibits_unlock(&lh, LCK_EX);
4059         if (child != NULL)
4060                 lfsck_object_put(env, child);
4061
4062         if (rc)
4063                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4064                        "multiple references for: parent "DFID", comp_id %u, "
4065                        "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
4066                        lfsck_lfsck2name(lfsck), PFID(pfid),
4067                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
4068                        la->la_uid, la->la_gid, rc);
4069
4070         return rc;
4071 }
4072
4073 /* If the MDT-object and the OST-object have different owner information,
4074  * then trust the MDT-object, because the normal chown/chgrp handle order
4075  * is from MDT to OST, and it is possible that some chown/chgrp operation
4076  * is partly done. */
4077 static int lfsck_layout_repair_owner(const struct lu_env *env,
4078                                      struct lfsck_component *com,
4079                                      struct dt_object *parent,
4080                                      struct lfsck_layout_req *llr,
4081                                      struct lu_attr *pla,
4082                                      const struct lu_attr *cla)
4083 {
4084         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4085         struct lu_attr                  *tla    = &info->lti_la2;
4086         struct dt_object                *child  = llr->llr_child;
4087         struct dt_device                *dev    = lfsck_obj2dev(child);
4088         struct thandle                  *handle;
4089         int                              rc;
4090         dt_obj_version_t                 version;
4091         ENTRY;
4092
4093         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
4094                 RETURN(0);
4095
4096         tla->la_uid = pla->la_uid;
4097         tla->la_gid = pla->la_gid;
4098         tla->la_valid = LA_UID | LA_GID;
4099         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4100         if (IS_ERR(handle))
4101                 GOTO(log, rc = PTR_ERR(handle));
4102
4103         rc = dt_declare_attr_set(env, child, tla, handle);
4104         if (rc != 0)
4105                 GOTO(stop, rc);
4106
4107         rc = dt_trans_start_local(env, dev, handle);
4108         if (rc != 0)
4109                 GOTO(stop, rc);
4110
4111         /* Use the dt_object lock to serialize with destroy and attr_set. */
4112         dt_read_lock(env, parent, 0);
4113         if (unlikely(lfsck_is_dead_obj(parent)))
4114                 GOTO(unlock, rc = 1);
4115
4116         version = dt_version_get(env, child);
4117         if (version == -EOPNOTSUPP)
4118                 version = 0;
4119
4120         /* Get the latest parent's owner. */
4121         rc = dt_attr_get(env, parent, pla);
4122         if (rc != 0)
4123                 GOTO(unlock, rc);
4124
4125         /* Some others chown/chgrp during the LFSCK, needs to do nothing. */
4126         if (unlikely((!version && tla->la_ctime == 0) ||
4127                      tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid))
4128                 rc = 1;
4129         else
4130                 rc = dt_attr_set(env, child, tla, handle);
4131
4132         GOTO(unlock, rc);
4133
4134 unlock:
4135         dt_read_unlock(env, parent);
4136
4137 stop:
4138         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4139
4140 log:
4141         if (rc != 0)
4142                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
4143                        "inconsistent file owner for: parent "DFID", child "DFID
4144                        ", OST-index %u, stripe-index %u, old owner %u/%u, "
4145                        "new owner %u/%u: rc = %d\n",
4146                        lfsck_lfsck2name(com->lc_lfsck),
4147                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4148                        llr->llr_ost_idx, llr->llr_lov_idx,
4149                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
4150
4151         return rc;
4152 }
4153
/*
 * Log why an OST-object and an MDT-object are not treated as a matched pair.
 * The message shows the FID of the checked MDT-object (lso->lso_fid), the
 * parent FID taken from the OST-object's XATTR_NAME_FID xattr (pfid), the
 * OST-object's own FID (cfid), and the reason (msg).
 *
 * The expansion deliberately carries no trailing semicolon, so the macro can
 * be used like a normal statement, including as the body of an unbraced
 * if/else; "lso" is parenthesized so that any pointer expression works.
 */
#define CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid, msg)		\
	CDEBUG(D_LFSCK, "%s:("DFID"|"DFID")/"DFID":XATTR %s: %s\n",	\
	       lfsck_lfsck2name(lfsck), PFID(&(lso)->lso_fid), PFID(pfid), \
	       PFID(cfid), XATTR_NAME_FID, msg)
4158
4159 /* Check whether the OST-object correctly back points to the
4160  * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). */
4161 static int lfsck_layout_check_parent(const struct lu_env *env,
4162                                      struct lfsck_component *com,
4163                                      struct lfsck_assistant_object *lso,
4164                                      struct filter_fid *ff,
4165                                      const struct lu_fid *cfid,
4166                                      const struct lu_attr *cla,
4167                                      struct lfsck_layout_req *llr)
4168 {
4169         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4170         struct lu_buf                   *buf    = &info->lti_big_buf;
4171         struct lu_fid                   *pfid   = &info->lti_fid;
4172         struct dt_object                *tobj;
4173         struct lov_mds_md_v1            *lmm;
4174         struct lov_ost_data_v1          *objs;
4175         struct lustre_handle             lh     = { 0 };
4176         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4177         int                              rc;
4178         int                              i;
4179         __u32                            magic;
4180         __u32                            idx;
4181         __u16                            count;
4182         ENTRY;
4183
4184         *pfid = ff->ff_parent;
4185         idx = pfid->f_stripe_idx;
4186         pfid->f_ver = 0;
4187
4188         if (unlikely(!fid_is_sane(pfid))) {
4189                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4190                                       "the parent FID is invalid");
4191
4192                 RETURN(LLIT_UNMATCHED_PAIR);
4193         }
4194
4195         if (lu_fid_eq(pfid, &lso->lso_fid)) {
4196                 if (likely(llr->llr_lov_idx == idx))
4197                         RETURN(0);
4198
4199                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4200                                       "the stripe index is unmatched");
4201
4202                 RETURN(LLIT_UNMATCHED_PAIR);
4203         }
4204
4205         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4206         if (IS_ERR(tobj))
4207                 RETURN(PTR_ERR(tobj));
4208
4209         if (dt_object_exists(tobj) == 0) {
4210                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4211                                       "the parent is nonexistent");
4212
4213                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4214         }
4215
4216         if (lfsck_is_dead_obj(tobj)) {
4217                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4218                                       "the parent is dead object");
4219
4220                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4221         }
4222
4223         if (!S_ISREG(lfsck_object_type(tobj))) {
4224                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4225                                       "the parent is not a regular file");
4226
4227                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4228         }
4229
4230         /* Load the tobj's layout EA, in spite of it is a local MDT-object or
4231          * remote one on another MDT. Then check whether the given OST-object
4232          * is in such layout. If yes, it is multiple referenced, otherwise it
4233          * is unmatched referenced case. */
4234         rc = lfsck_layout_get_lovea(env, tobj, buf);
4235         if (rc == 0 || rc == -ENODATA || rc == -ENOENT) {
4236                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4237                                       "the parent has no stripe data");
4238
4239                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4240         }
4241
4242         if (unlikely(rc == -EOPNOTSUPP))
4243                 GOTO(out, rc = LLIT_NONE);
4244
4245         if (rc < 0)
4246                 GOTO(out, rc);
4247
4248         lmm = buf->lb_buf;
4249         magic = le32_to_cpu(lmm->lmm_magic);
4250         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
4251                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4252                 struct lov_comp_md_entry_v1 *lcme;
4253
4254                 if (ff->ff_layout.ol_comp_id == 0) {
4255                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4256                                               "the parent has incorrect comp_id");
4257
4258                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4259                 }
4260
4261                 count = le16_to_cpu(lcm->lcm_entry_count);
4262                 for (i = 0; i < count; i++) {
4263                         lcme = &lcm->lcm_entries[i];
4264                         if (le32_to_cpu(lcme->lcme_id) ==
4265                             ff->ff_layout.ol_comp_id) {
4266                                 lmm = buf->lb_buf +
4267                                         le32_to_cpu(lcme->lcme_offset);
4268                                 magic = le32_to_cpu(lmm->lmm_magic);
4269                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4270                                       LCME_FL_INIT)) {
4271                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4272                                                               cfid,
4273                                                               "the parent has uninitialized component");
4274
4275                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4276                                 }
4277
4278                                 goto further;
4279                         }
4280                 }
4281
4282                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4283                                       "the parent has no matched comp_id");
4284
4285                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4286         }
4287
4288 further:
4289         if (magic == LOV_MAGIC_V1) {
4290                 objs = &lmm->lmm_objects[0];
4291         } else {
4292                 LASSERT(magic == LOV_MAGIC_V3);
4293                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4294         }
4295
4296         count = le16_to_cpu(lmm->lmm_stripe_count);
4297         for (i = 0; i < count; i++, objs++) {
4298                 struct lu_fid           *tfid   = &info->lti_fid2;
4299                 struct ost_id           *oi     = &info->lti_oi;
4300                 __u32                    idx2;
4301
4302                 if (lovea_slot_is_dummy(objs))
4303                         continue;
4304
4305                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4306                 idx2 = le32_to_cpu(objs->l_ost_idx);
4307                 rc = ostid_to_fid(tfid, oi, idx2);
4308                 if (rc != 0) {
4309                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
4310                                "invalid layout EA at the slot %d, index %u\n",
4311                                lfsck_lfsck2name(com->lc_lfsck),
4312                                PFID(pfid), i, idx2);
4313
4314                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4315                 }
4316
4317                 if (lu_fid_eq(cfid, tfid)) {
4318                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4319                                               MDS_INODELOCK_UPDATE |
4320                                               MDS_INODELOCK_LAYOUT |
4321                                               MDS_INODELOCK_XATTR,
4322                                               LCK_EX);
4323                         if (rc != 0)
4324                                 GOTO(out, rc);
4325
4326                         dt_read_lock(env, tobj, 0);
4327
4328                         /* For local MDT-object, re-check existence
4329                          * after taken the lock. */
4330                         if (!dt_object_remote(tobj)) {
4331                                 if (dt_object_exists(tobj) == 0 ||
4332                                     lfsck_is_dead_obj(tobj)) {
4333                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4334                                                               cfid,
4335                                                               "the parent doesn't exist anymore after lock");
4336
4337                                         rc = LLIT_UNMATCHED_PAIR;
4338                                 } else {
4339                                         rc = LLIT_MULTIPLE_REFERENCED;
4340                                 }
4341
4342                                 GOTO(unlock, rc);
4343                         }
4344
4345                         /* For migration case, the new MDT-object and old
4346                          * MDT-object may reference the same OST-object at
4347                          * some migration internal time.
4348                          *
4349                          * For remote MDT-object, the local MDT may not know
4350                          * whether it has been removed or not.  Try checking
4351                          * for a non-existent xattr to check if this object
4352                          * has been been removed or not. */
4353                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4354                                           XATTR_NAME_DUMMY);
4355                         if (unlikely(rc == -ENOENT || rc >= 0)) {
4356                                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4357                                                       "the parent is remote object and nonexistent after lock");
4358
4359                                 rc = LLIT_UNMATCHED_PAIR;
4360                         } else if (rc == -ENODATA) {
4361                                 rc = LLIT_MULTIPLE_REFERENCED;
4362                         }
4363
4364                         GOTO(unlock, rc);
4365                 }
4366         }
4367
4368         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4369                               "the parent has no matched stripe");
4370
4371         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4372
4373 unlock:
4374         if (lustre_handle_is_used(&lh)) {
4375                 dt_read_unlock(env, tobj);
4376                 lfsck_ibits_unlock(&lh, LCK_EX);
4377         }
4378
4379 out:
4380         lfsck_object_put(env, tobj);
4381
4382         return rc;
4383 }
4384
4385 /*
4386  * If the MDT-object has the LUSTRE_ENCRYPT_FL flag, it needs to be set
4387  * on the OST-object as well.
4388  */
4389 static int lfsck_layout_repair_encflag(const struct lu_env *env,
4390                                        struct lfsck_component *com,
4391                                        struct dt_object *parent,
4392                                        struct lfsck_layout_req *llr)
4393 {
4394         struct lfsck_thread_info *info = lfsck_env_info(env);
4395         struct lu_attr *tla = &info->lti_la2;
4396         struct dt_object *child = llr->llr_child;
4397         struct dt_device *dev = lfsck_obj2dev(child);
4398         struct thandle *handle;
4399         int rc;
4400
4401         ENTRY;
4402
4403         tla->la_valid = LA_FLAGS;
4404         tla->la_flags = LUSTRE_ENCRYPT_FL;
4405         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4406         if (IS_ERR(handle))
4407                 GOTO(log, rc = PTR_ERR(handle));
4408
4409         rc = dt_declare_attr_set(env, child, tla, handle);
4410         if (rc != 0)
4411                 GOTO(stop, rc);
4412
4413         rc = dt_trans_start_local(env, dev, handle);
4414         if (rc != 0)
4415                 GOTO(stop, rc);
4416
4417         /* Use the dt_object lock to serialize with destroy and attr_set. */
4418         dt_read_lock(env, parent, 0);
4419         if (unlikely(lfsck_is_dead_obj(parent)))
4420                 GOTO(unlock, rc = 1);
4421
4422         rc = dt_attr_set(env, child, tla, handle);
4423         GOTO(unlock, rc);
4424
4425 unlock:
4426         dt_read_unlock(env, parent);
4427
4428 stop:
4429         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4430
4431 log:
4432         if (rc != 0)
4433                 CDEBUG(D_LFSCK,
4434                        "%s: layout LFSCK assistant repair of inconsistent file enc flag for: parent "
4435                        DFID", child "
4436                        DFID", OST-index %u, stripe-index %u: rc = %d\n",
4437                        lfsck_lfsck2name(com->lc_lfsck),
4438                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4439                        llr->llr_ost_idx, llr->llr_lov_idx, rc);
4440
4441         return rc;
4442 }
4443
4444 static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
4445                                              struct lfsck_component *com,
4446                                              struct lfsck_assistant_req *lar)
4447 {
4448         struct lfsck_layout_req              *llr    =
4449                 container_of(lar, struct lfsck_layout_req, llr_lar);
4450         struct lfsck_assistant_object        *lso    = lar->lar_parent;
4451         struct lfsck_layout                  *lo     = com->lc_file_ram;
4452         struct lfsck_thread_info             *info   = lfsck_env_info(env);
4453         struct filter_fid                    *ff     = &info->lti_ff;
4454         struct lu_buf buf = { .lb_buf = ff,
4455                               .lb_len = sizeof(*ff) };
4456         struct dt_object                     *parent = NULL;
4457         struct dt_object                     *child  = llr->llr_child;
4458         struct lu_attr                       *pla    = &lso->lso_attr;
4459         struct lu_attr                       *cla    = &info->lti_la;
4460         struct lfsck_instance                *lfsck  = com->lc_lfsck;
4461         struct lfsck_bookmark                *bk     = &lfsck->li_bookmark_ram;
4462         enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
4463         int                                   rc;
4464         ENTRY;
4465
4466         if (lso->lso_dead)
4467                 RETURN(0);
4468
4469         CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
4470
4471         rc = dt_attr_get(env, child, cla);
4472         if (rc == -ENOENT) {
4473                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4474                 if (IS_ERR(parent)) {
4475                         rc = PTR_ERR(parent);
4476
4477                         RETURN(rc == -ENOENT ? 0 : rc);
4478                 }
4479
4480                 type = LLIT_DANGLING;
4481                 goto repair;
4482         }
4483
4484         if (rc != 0)
4485                 GOTO(out, rc);
4486
4487         if (!(bk->lb_param & LPF_DRYRUN) &&
4488             pla->la_valid & LA_FLAGS && pla->la_flags & LUSTRE_ENCRYPT_FL) {
4489                 /* MDT-inode is encrypted */
4490                 struct lu_buf lb = { .lb_buf = NULL, .lb_len = 0 };
4491
4492                 /* if OST-inode is missing encryption.c xattr, fix it */
4493                 if (dt_xattr_get(env, child, &lb,
4494                                  LL_XATTR_NAME_ENCRYPTION_CONTEXT) >= 0)
4495                         goto check_fid;
4496
4497                 if (parent == NULL)
4498                         parent = lfsck_assistant_object_load(env, lfsck, lso);
4499                 if (!IS_ERR_OR_NULL(parent))
4500                         rc = lfsck_layout_repair_encflag(env, com, parent, llr);
4501                 down_write(&com->lc_sem);
4502                 if (rc < 0)
4503                         lfsck_layout_record_failure(env, lfsck, lo);
4504                 else if (rc > 0)
4505                         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
4506                 up_write(&com->lc_sem);
4507         }
4508
4509 check_fid:
4510         lfsck_buf_init(&buf, ff, sizeof(*ff));
4511         rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
4512         if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
4513                 CDEBUG(D_LFSCK, "%s:"DFID"/"DFID": "
4514                        "the child object's %s is corrupted\n",
4515                        lfsck_lfsck2name(lfsck), PFID(&lso->lso_fid),
4516                        PFID(lu_object_fid(&child->do_lu)),
4517                        XATTR_NAME_FID);
4518
4519                 type = LLIT_UNMATCHED_PAIR;
4520                 goto repair;
4521         }
4522
4523         if (rc < 0 && rc != -ENODATA)
4524                 GOTO(out, rc);
4525
4526         if (rc == 0 || rc == -ENODATA)
4527                 GOTO(check_owner, rc = 0);
4528
4529         filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
4530         rc = lfsck_layout_check_parent(env, com, lso, ff,
4531                                        lu_object_fid(&child->do_lu), cla, llr);
4532         if (rc > 0) {
4533                 type = rc;
4534                 goto repair;
4535         }
4536
4537         if (rc < 0)
4538                 GOTO(out, rc);
4539
4540 check_owner:
4541         /* Someone may has changed the owner after the parent attr pre-loaded.
4542          * It can be handled later inside the lfsck_layout_repair_owner(). */
4543         if (unlikely(cla->la_uid != pla->la_uid ||
4544                      cla->la_gid != pla->la_gid)) {
4545                 type = LLIT_INCONSISTENT_OWNER;
4546                 goto repair;
4547         }
4548
4549 repair:
4550         if (type == LLIT_NONE)
4551                 GOTO(out, rc = 0);
4552
4553         if (bk->lb_param & LPF_DRYRUN)
4554                 GOTO(out, rc = 1);
4555
4556         if (parent == NULL) {
4557                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4558                 if (IS_ERR(parent)) {
4559                         rc = PTR_ERR(parent);
4560
4561                         if (rc == -ENOENT)
4562                                 RETURN(0);
4563
4564                         GOTO(out, rc);
4565                 }
4566         }
4567
4568         switch (type) {
4569         case LLIT_DANGLING:
4570                 if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
4571                         rc = lfsck_layout_ins_dangling_rec(env, com,
4572                                 lfsck_dto2fid(parent), lfsck_dto2fid(child),
4573                                 llr->llr_comp_id, llr->llr_lov_idx,
4574                                 llr->llr_ost_idx);
4575                 else
4576                         rc = __lfsck_layout_repair_dangling(env, com, parent,
4577                                                             llr->llr_child,
4578                                                             llr->llr_comp_id,
4579                                                             llr->llr_lov_idx,
4580                                                             llr->llr_ost_idx,
4581                                                             true);
4582                 break;
4583         case LLIT_UNMATCHED_PAIR:
4584                 rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
4585                                                         llr, pla);
4586                 break;
4587         case LLIT_MULTIPLE_REFERENCED:
4588                 rc = lfsck_layout_repair_multiple_references(env, com, parent,
4589                                                              llr, pla);
4590                 break;
4591         case LLIT_INCONSISTENT_OWNER:
4592                 rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
4593                 break;
4594         default:
4595                 rc = 0;
4596                 break;
4597         }
4598
4599         GOTO(out, rc);
4600
4601 out:
4602         down_write(&com->lc_sem);
4603         if (rc < 0) {
4604                 struct lfsck_assistant_data *lad = com->lc_data;
4605
4606                 if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) {
4607                         rc = 0;
4608                 } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
4609                            rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
4610                            rc == -EHOSTUNREACH) {
4611                         /* If cannot touch the target server,
4612                          * mark the LFSCK as INCOMPLETE. */
4613                         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to "
4614                                "talk with OST %x: rc = %d\n",
4615                                lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
4616                         lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
4617                         lo->ll_objs_skipped++;
4618                         rc = 0;
4619                 } else {
4620                         lfsck_layout_record_failure(env, lfsck, lo);
4621                 }
4622         } else if (rc > 0 && (type != LLIT_DANGLING ||
4623                               !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
4624                 LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
4625                          "unknown type = %d\n", type);
4626
4627                 lo->ll_objs_repaired[type - 1]++;
4628                 if (bk->lb_param & LPF_DRYRUN &&
4629                     unlikely(lo->ll_pos_first_inconsistent == 0))
4630                         lo->ll_pos_first_inconsistent =
4631                         lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
4632                                                         lfsck->li_di_oit);
4633         }
4634         up_write(&com->lc_sem);
4635
4636         if (parent != NULL && !IS_ERR(parent))
4637                 lfsck_object_put(env, parent);
4638
4639         return rc;
4640 }
4641
4642 static int
4643 lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
4644                                         struct lfsck_component *com,
4645                                         struct dt_object *obj, bool first)
4646 {
4647         struct lfsck_instance *lfsck = com->lc_lfsck;
4648         struct ptlrpc_thread *thread = &lfsck->li_thread;
4649         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4650         struct lfsck_layout *lo = com->lc_file_ram;
4651         const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
4652         struct dt_it *di;
4653         struct dt_key *key;
4654         struct lfsck_layout_dangling_key *parent =
4655                                         &lfsck_env_info(env)->lti_lldk;
4656         struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
4657         __u32 ost_idx;
4658         int rc;
4659         ENTRY;
4660
4661         di = iops->init(env, obj, 0);
4662         if (IS_ERR(di))
4663                 RETURN(PTR_ERR(di));
4664
4665         if (first)
4666                 lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
4667         else
4668                 memset(parent, 0, sizeof(*parent));
4669         rc = iops->get(env, di, (const struct dt_key *)parent);
4670         if (rc < 0)
4671                 GOTO(fini, rc);
4672
4673         if (first) {
4674                 /* The start one either has been processed or does not exist,
4675                  * skip it. */
4676                 rc = iops->next(env, di);
4677                 if (rc != 0)
4678                         GOTO(put, rc);
4679         }
4680
4681         do {
4682                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
4683                     unlikely(!thread_is_running(thread)))
4684                         GOTO(put, rc = 0);
4685
4686                 key = iops->key(env, di);
4687                 if (IS_ERR(key)) {
4688                         rc = PTR_ERR(key);
4689                         if (rc == -ENOENT)
4690                                 GOTO(put, rc = 1);
4691
4692                         goto checkpoint;
4693                 }
4694
4695                 lldk_be_to_cpu(parent,
4696                                 (const struct lfsck_layout_dangling_key *)key);
4697                 if (!fid_is_sane(&parent->lldk_fid)) {
4698                         rc = 0;
4699                         goto checkpoint;
4700                 }
4701
4702                 rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
4703                 if (rc == 0) {
4704                         fid_be_to_cpu(cfid, cfid);
4705                         ost_idx = cfid->f_ver;
4706                         cfid->f_ver = 0;
4707                         if (!fid_is_sane(cfid)) {
4708                                 rc = 0;
4709                                 goto checkpoint;
4710                         }
4711
4712                         rc = lfsck_layout_repair_dangling(env, com,
4713                                         &parent->lldk_fid, cfid,
4714                                         parent->lldk_comp_id,
4715                                         parent->lldk_ea_off, ost_idx);
4716                 }
4717
4718 checkpoint:
4719                 down_write(&com->lc_sem);
4720                 com->lc_new_checked++;
4721                 com->lc_new_scanned++;
4722                 if (rc >= 0)
4723                         lo->ll_lldk_latest_scanned_phase2 = *parent;
4724
4725                 if (rc > 0)
4726                         lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
4727                 else if (rc < 0)
4728                         lo->ll_objs_failed_phase2++;
4729                 up_write(&com->lc_sem);
4730
4731                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
4732                         GOTO(put, rc);
4733
4734                 if (unlikely(com->lc_time_next_checkpoint <=
4735                              ktime_get_seconds()) &&
4736                     com->lc_new_checked != 0) {
4737                         down_write(&com->lc_sem);
4738                         lo->ll_run_time_phase2 += ktime_get_seconds() -
4739                                                   com->lc_time_last_checkpoint;
4740                         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
4741                         lo->ll_objs_checked_phase2 += com->lc_new_checked;
4742                         com->lc_new_checked = 0;
4743                         lfsck_layout_store(env, com);
4744                         up_write(&com->lc_sem);
4745
4746                         com->lc_time_last_checkpoint = ktime_get_seconds();
4747                         com->lc_time_next_checkpoint =
4748                                 com->lc_time_last_checkpoint +
4749                                 LFSCK_CHECKPOINT_INTERVAL;
4750                 }
4751
4752                 lfsck_control_speed_by_self(com);
4753                 if (unlikely(!thread_is_running(thread)))
4754                         GOTO(put, rc = 0);
4755
4756                 rc = iops->next(env, di);
4757         } while (rc == 0);
4758
4759         GOTO(put, rc);
4760
4761 put:
4762         iops->put(env, di);
4763
4764 fini:
4765         iops->fini(env, di);
4766
4767         return rc;
4768 }
4769
4770 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4771                                              struct lfsck_component *com)
4772 {
4773         struct lfsck_assistant_data     *lad    = com->lc_data;
4774         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4775         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
4776         struct lfsck_tgt_descs          *ltds   = &lfsck->li_ost_descs;
4777         struct lfsck_tgt_desc           *ltd;
4778         int                              rc     = 0;
4779         ENTRY;
4780
4781         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4782                lfsck_lfsck2name(lfsck));
4783
4784         spin_lock(&ltds->ltd_lock);
4785         while (!list_empty(&lad->lad_ost_phase2_list)) {
4786                 ltd = list_first_entry(&lad->lad_ost_phase2_list,
4787                                        struct lfsck_tgt_desc,
4788                                        ltd_layout_phase_list);
4789                 list_del_init(&ltd->ltd_layout_phase_list);
4790                 if (bk->lb_param & LPF_OST_ORPHAN) {
4791                         spin_unlock(&ltds->ltd_lock);
4792                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4793                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4794                                 RETURN(rc);
4795
4796                         if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
4797                                      !thread_is_running(&lfsck->li_thread)))
4798                                 RETURN(0);
4799                         spin_lock(&ltds->ltd_lock);
4800                 }
4801         }
4802
4803         if (list_empty(&lad->lad_ost_phase1_list))
4804                 rc = 1;
4805         else
4806                 rc = 0;
4807         spin_unlock(&ltds->ltd_lock);
4808
4809         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4810                 struct lfsck_layout *lo = com->lc_file_ram;
4811                 int i;
4812
4813                 com->lc_new_checked = 0;
4814                 com->lc_new_scanned = 0;
4815                 com->lc_time_last_checkpoint = ktime_get_seconds();
4816                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4817                                                LFSCK_CHECKPOINT_INTERVAL;
4818
4819                 i = lfsck_sub_trace_file_fid2idx(
4820                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4821                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4822                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4823                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4824                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4825                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4826
4827                 CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop "
4828                        "at the No. %d trace file: rc = %d\n",
4829                        lfsck_lfsck2name(lfsck), i, rc);
4830         }
4831
4832         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4833                lfsck_lfsck2name(lfsck), rc);
4834
4835         RETURN(rc);
4836 }
4837
4838 static int
4839 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4840                                    struct ptlrpc_request *req,
4841                                    void *args, int rc)
4842 {
4843         struct lfsck_layout_slave_async_args *llsaa = args;
4844         struct obd_export *exp = llsaa->llsaa_exp;
4845         struct lfsck_component *com = llsaa->llsaa_com;
4846         struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst;
4847         struct lfsck_layout_slave_data *llsd = com->lc_data;
4848         struct lfsck_reply *lr = NULL;
4849         bool done = false;
4850
4851         if (rc != 0) {
4852                 /* It is probably caused by network trouble, or target crash,
4853                  * it will try several times (depends on the obd_timeout, and
4854                  * will not less than 3 times). But to make the LFSCK can go
4855                  * ahead, we should not try for ever. After some try but still
4856                  * hit failure, it will assume that the target exit the LFSCK
4857                  * prcoessing and stop try. */
4858                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4859                         int max_try = max_t(int, obd_timeout / 30, 3);
4860
4861                         if (++(llst->llst_failures) > max_try)
4862                                 done = true;
4863                 } else {
4864                         done = true;
4865                 }
4866         } else {
4867                 llst->llst_failures = 0;
4868                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4869                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4870                     lr->lr_status != LS_SCANNING_PHASE2)
4871                         done = true;
4872         }
4873
4874         if (done) {
4875                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
4876                        "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
4877                        llst->llst_index, lr != NULL ? lr->lr_status : rc,
4878                        llst->llst_failures);
4879
4880                 lfsck_layout_llst_del(llsd, llst);
4881         }
4882
4883         lfsck_layout_llst_put(llst);
4884         lfsck_component_put(env, com);
4885         class_export_put(exp);
4886
4887         return 0;
4888 }
4889
4890 static int lfsck_layout_async_query(const struct lu_env *env,
4891                                     struct lfsck_component *com,
4892                                     struct obd_export *exp,
4893                                     struct lfsck_layout_slave_target *llst,
4894                                     struct lfsck_request *lr,
4895                                     struct ptlrpc_request_set *set)
4896 {
4897         struct lfsck_layout_slave_async_args *llsaa;
4898         struct ptlrpc_request                *req;
4899         struct lfsck_request                 *tmp;
4900         int                                   rc;
4901         ENTRY;
4902
4903         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4904         if (req == NULL)
4905                 RETURN(-ENOMEM);
4906
4907         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4908         if (rc != 0) {
4909                 ptlrpc_request_free(req);
4910                 RETURN(rc);
4911         }
4912
4913         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4914         *tmp = *lr;
4915         ptlrpc_request_set_replen(req);
4916
4917         llsaa = ptlrpc_req_async_args(llsaa, req);
4918         llsaa->llsaa_exp = exp;
4919         llsaa->llsaa_com = lfsck_component_get(com);
4920         llsaa->llsaa_llst = llst;
4921         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4922         req->rq_allow_intr = 1;
4923         req->rq_no_delay = 1;
4924         ptlrpc_set_add_req(set, req);
4925
4926         RETURN(0);
4927 }
4928
4929 static int lfsck_layout_async_notify(const struct lu_env *env,
4930                                      struct obd_export *exp,
4931                                      struct lfsck_request *lr,
4932                                      struct ptlrpc_request_set *set)
4933 {
4934         struct ptlrpc_request   *req;
4935         struct lfsck_request    *tmp;
4936         int                      rc;
4937         ENTRY;
4938
4939         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4940         if (req == NULL)
4941                 RETURN(-ENOMEM);
4942
4943         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4944         if (rc != 0) {
4945                 ptlrpc_request_free(req);
4946                 RETURN(rc);
4947         }
4948
4949         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4950         *tmp = *lr;
4951         ptlrpc_request_set_replen(req);
4952         req->rq_allow_intr = 1;
4953         req->rq_no_delay = 1;
4954         ptlrpc_set_add_req(set, req);
4955
4956         RETURN(0);
4957 }
4958
4959 static int
4960 lfsck_layout_slave_query_master(const struct lu_env *env,
4961                                 struct lfsck_component *com)
4962 {
4963         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4964         struct lfsck_instance            *lfsck = com->lc_lfsck;
4965         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4966         struct lfsck_layout_slave_target *llst;
4967         struct obd_export                *exp;
4968         struct ptlrpc_request_set        *set;
4969         int                               rc    = 0;
4970         int                               rc1   = 0;
4971         ENTRY;
4972
4973         set = ptlrpc_prep_set();
4974         if (set == NULL)
4975                 GOTO(log, rc = -ENOMEM);
4976
4977         memset(lr, 0, sizeof(*lr));
4978         lr->lr_event = LE_QUERY;
4979         lr->lr_active = LFSCK_TYPE_LAYOUT;
4980
4981         llsd->llsd_touch_gen++;
4982         spin_lock(&llsd->llsd_lock);
4983         while (!list_empty(&llsd->llsd_master_list)) {
4984                 llst = list_first_entry(&llsd->llsd_master_list,
4985                                         struct lfsck_layout_slave_target,
4986                                         llst_list);
4987                 if (llst->llst_gen == llsd->llsd_touch_gen)
4988                         break;
4989
4990                 llst->llst_gen = llsd->llsd_touch_gen;
4991                 list_move_tail(&llst->llst_list,
4992                                &llsd->llsd_master_list);
4993                 atomic_inc(&llst->llst_ref);
4994                 spin_unlock(&llsd->llsd_lock);
4995
4996                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4997                                                llst->llst_index);
4998                 if (exp == NULL) {
4999                         lfsck_layout_llst_del(llsd, llst);
5000                         lfsck_layout_llst_put(llst);
5001                         spin_lock(&llsd->llsd_lock);
5002                         continue;
5003                 }
5004
5005                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
5006                 if (rc != 0) {
5007                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
5008                                "query %s for layout: rc = %d\n",
5009                                lfsck_lfsck2name(lfsck),
5010                                exp->exp_obd->obd_name, rc);
5011
5012                         rc1 = rc;
5013                         lfsck_layout_llst_put(llst);
5014                         class_export_put(exp);
5015                 }
5016                 spin_lock(&llsd->llsd_lock);
5017         }
5018         spin_unlock(&llsd->llsd_lock);
5019
5020         rc = ptlrpc_set_wait(env, set);
5021         ptlrpc_set_destroy(set);
5022
5023         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
5024
5025 log:
5026         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
5027                lfsck_lfsck2name(com->lc_lfsck), rc);
5028
5029         return rc;
5030 }
5031
5032 static void
5033 lfsck_layout_slave_notify_master(const struct lu_env *env,
5034                                  struct lfsck_component *com,
5035                                  enum lfsck_events event, int result)
5036 {
5037         struct lfsck_layout              *lo    = com->lc_file_ram;
5038         struct lfsck_instance            *lfsck = com->lc_lfsck;
5039         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
5040         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
5041         struct lfsck_layout_slave_target *llst;
5042         struct obd_export                *exp;
5043         struct ptlrpc_request_set        *set;
5044         int                               rc;
5045         ENTRY;
5046
5047         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
5048                lfsck_lfsck2name(com->lc_lfsck));
5049
5050         set = ptlrpc_prep_set();
5051         if (set == NULL)
5052                 RETURN_EXIT;
5053
5054         memset(lr, 0, sizeof(*lr));
5055         lr->lr_event = event;
5056         lr->lr_flags = LEF_FROM_OST;
5057         lr->lr_status = result;
5058         lr->lr_index = lfsck_dev_idx(lfsck);
5059         lr->lr_active = LFSCK_TYPE_LAYOUT;
5060         lr->lr_flags2 = lo->ll_flags;
5061         llsd->llsd_touch_gen++;
5062         spin_lock(&llsd->llsd_lock);
5063         while (!list_empty(&llsd->llsd_master_list)) {
5064                 llst = list_first_entry(&llsd->llsd_master_list,
5065                                         struct lfsck_layout_slave_target,
5066                                         llst_list);
5067                 if (llst->llst_gen == llsd->llsd_touch_gen)
5068                         break;
5069
5070                 llst->llst_gen = llsd->llsd_touch_gen;
5071                 list_move_tail(&llst->llst_list,
5072                                &llsd->llsd_master_list);
5073                 atomic_inc(&llst->llst_ref);
5074                 spin_unlock(&llsd->llsd_lock);
5075
5076                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
5077                                                llst->llst_index);
5078                 if (exp == NULL) {
5079                         lfsck_layout_llst_del(llsd, llst);
5080                         lfsck_layout_llst_put(llst);
5081                         spin_lock(&llsd->llsd_lock);
5082                         continue;
5083                 }
5084
5085                 rc = lfsck_layout_async_notify(env, exp, lr, set);
5086                 if (rc != 0)
5087                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
5088                                "notify %s for layout: rc = %d\n",
5089                                lfsck_lfsck2name(lfsck),
5090                                exp->exp_obd->obd_name, rc);
5091
5092                 lfsck_layout_llst_put(llst);
5093                 class_export_put(exp);
5094                 spin_lock(&llsd->llsd_lock);
5095         }
5096         spin_unlock(&llsd->llsd_lock);
5097
5098         ptlrpc_set_wait(env, set);
5099         ptlrpc_set_destroy(set);
5100
5101         RETURN_EXIT;
5102 }
5103
5104 /*
5105  * \ret -ENODATA: unrecognized stripe
5106  * \ret = 0     : recognized stripe
5107  * \ret < 0     : other failures
5108  */
5109 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
5110                                            struct lfsck_component *com,
5111                                            struct lu_fid *cfid,
5112                                            struct lu_fid *pfid, __u32 comp_id)
5113 {
5114         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5115         struct lu_buf                   *buf    = &info->lti_big_buf;
5116         struct ost_id                   *oi     = &info->lti_oi;
5117         struct dt_object                *obj;
5118         struct lov_mds_md_v1            *lmm;
5119         struct lov_ost_data_v1          *objs;
5120         __u32                            idx    = pfid->f_stripe_idx;
5121         __u32                            magic;
5122         int                              rc     = 0;
5123         int                              i;
5124         __u16                            count;
5125         ENTRY;
5126
5127         pfid->f_ver = 0;
5128         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
5129         if (IS_ERR(obj))
5130                 RETURN(PTR_ERR(obj));
5131
5132         dt_read_lock(env, obj, 0);
5133         if (unlikely(dt_object_exists(obj) == 0 ||
5134                      lfsck_is_dead_obj(obj)))
5135                 GOTO(unlock, rc = -ENOENT);
5136
5137         if (!S_ISREG(lfsck_object_type(obj)))
5138                 GOTO(unlock, rc = -ENODATA);
5139
5140         rc = lfsck_layout_get_lovea(env, obj, buf);
5141         if (rc < 0)
5142                 GOTO(unlock, rc);
5143
5144         lmm = buf->lb_buf;
5145         magic = le32_to_cpu(lmm->lmm_magic);
5146         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5147                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
5148                 struct lov_comp_md_entry_v1 *lcme;
5149
5150                 if (comp_id == 0)
5151                         GOTO(unlock, rc = -ENODATA);
5152
5153                 count = le16_to_cpu(lcm->lcm_entry_count);
5154                 for (i = 0; i < count; i++) {
5155                         lcme = &lcm->lcm_entries[i];
5156                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
5157                                 lmm = buf->lb_buf +
5158                                         le32_to_cpu(lcme->lcme_offset);
5159                                 magic = le32_to_cpu(lmm->lmm_magic);
5160                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5161                                       LCME_FL_INIT))
5162                                         GOTO(unlock, rc = -ENODATA);
5163
5164                                 goto further;
5165                         }
5166                 }
5167
5168                 GOTO(unlock, rc = -ENODATA);
5169         }
5170
5171 further:
5172         if (magic == LOV_MAGIC_V1) {
5173                 objs = &lmm->lmm_objects[0];
5174         } else {
5175                 LASSERT(magic == LOV_MAGIC_V3);
5176                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5177         }
5178
5179         fid_to_ostid(cfid, oi);
5180         count = le16_to_cpu(lmm->lmm_stripe_count);
5181         for (i = 0; i < count; i++, objs++) {
5182                 struct ost_id oi2;
5183
5184                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
5185                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
5186                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
5187         }
5188
5189         GOTO(unlock, rc = -ENODATA);
5190
5191 unlock:
5192         dt_read_unlock(env, obj);
5193         lfsck_object_put(env, obj);
5194
5195         return rc;
5196 }
5197
5198 /*
5199  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
5200  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
5201  * layout EA from MDT to OST. On one hand, the OST no need to understand
5202  * the layout EA structure; on the other hand, it may cause trouble when
5203  * transfer large layout EA from MDT to OST via normal OUT RPC.
5204  *
5205  * \ret > 0: unrecognized stripe
5206  * \ret = 0: recognized stripe
5207  * \ret < 0: other failures
5208  */
5209 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
5210                                           struct lfsck_component *com,
5211                                           struct lu_fid *cfid,
5212                                           struct lu_fid *pfid, __u32 comp_id)
5213 {
5214         struct lfsck_instance    *lfsck  = com->lc_lfsck;
5215         struct obd_device        *obd    = lfsck->li_obd;
5216         struct seq_server_site   *ss     = lfsck_dev_site(lfsck);
5217         struct obd_export        *exp    = NULL;
5218         struct ptlrpc_request    *req    = NULL;
5219         struct lfsck_request     *lr;
5220         struct lu_seq_range      *range  = &lfsck_env_info(env)->lti_range;
5221         int                       rc     = 0;
5222         ENTRY;
5223
5224         if (unlikely(fid_is_idif(pfid)))
5225                 RETURN(1);
5226
5227         fld_range_set_any(range);
5228         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
5229         if (rc != 0)
5230                 RETURN(rc == -ENOENT ? 1 : rc);
5231
5232         if (unlikely(!fld_range_is_mdt(range)))
5233                 RETURN(1);
5234
5235         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
5236         if (unlikely(exp == NULL))
5237                 RETURN(1);
5238
5239         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
5240                 GOTO(out, rc = -EOPNOTSUPP);
5241
5242         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
5243         if (req == NULL)
5244                 GOTO(out, rc = -ENOMEM);
5245
5246         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
5247         if (rc != 0) {
5248                 ptlrpc_request_free(req);
5249
5250                 GOTO(out, rc);
5251         }
5252
5253         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
5254         memset(lr, 0, sizeof(*lr));
5255         lr->lr_event = LE_PAIRS_VERIFY;
5256         lr->lr_active = LFSCK_TYPE_LAYOUT;
5257         lr->lr_fid = *cfid; /* OST-object itself FID. */
5258         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
5259         lr->lr_comp_id = comp_id;
5260
5261         ptlrpc_request_set_replen(req);
5262         rc = ptlrpc_queue_wait(req);
5263         ptlrpc_req_finished(req);
5264
5265         if (rc == -ENOENT || rc == -ENODATA)
5266                 rc = 1;
5267
5268         GOTO(out, rc);
5269
5270 out:
5271         if (exp != NULL)
5272                 class_export_put(exp);
5273
5274         return rc;
5275 }
5276
5277 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
5278                                           struct lfsck_component *com,
5279                                           struct lfsck_req_local *lrl)
5280 {
5281         struct dt_object        *obj;
5282         int                      rc     = 0;
5283         ENTRY;
5284
5285         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
5286         if (IS_ERR(obj))
5287                 GOTO(log, rc = PTR_ERR(obj));
5288
5289         rc = __lfsck_layout_update_pfid(env, com, obj,
5290                                         &lrl->lrl_ff_client.ff_parent,
5291                                         &lrl->lrl_ff_client.ff_layout,
5292                                         lrl->lrl_ff_client.ff_layout_version,
5293                                         lrl->lrl_ff_client.ff_range,
5294                                         lrl->lrl_ff_client.ff_parent.f_ver);
5295
5296         lfsck_object_put(env, obj);
5297
5298 log:
5299         CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID
5300                ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
5301                PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc);
5302
5303         return rc;
5304 }
5305
5306 /* layout APIs */
5307
5308 static void lfsck_layout_slave_quit(const struct lu_env *env,
5309                                     struct lfsck_component *com);
5310
5311 static int lfsck_layout_reset(const struct lu_env *env,
5312                               struct lfsck_component *com, bool init)
5313 {
5314         struct lfsck_layout     *lo    = com->lc_file_ram;
5315         int                      rc;
5316
5317         down_write(&com->lc_sem);
5318         if (init) {
5319                 memset(lo, 0, com->lc_file_size);
5320         } else {
5321                 __u32 count = lo->ll_success_count;
5322                 time64_t last_time = lo->ll_time_last_complete;
5323
5324                 memset(lo, 0, com->lc_file_size);
5325                 lo->ll_success_count = count;
5326                 lo->ll_time_last_complete = last_time;
5327         }
5328
5329         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
5330         lo->ll_status = LS_INIT;
5331
5332         if (com->lc_lfsck->li_master) {
5333                 struct lfsck_assistant_data *lad = com->lc_data;
5334
5335                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
5336                 bitmap_zero(lad->lad_bitmap, lad->lad_bitmap_count);
5337         }
5338
5339         rc = lfsck_layout_store(env, com);
5340         if (rc == 0 && com->lc_lfsck->li_master)
5341                 rc = lfsck_load_sub_trace_files(env, com,
5342                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
5343         up_write(&com->lc_sem);
5344
5345         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
5346                lfsck_lfsck2name(com->lc_lfsck), rc);
5347
5348         return rc;
5349 }
5350
5351 static void lfsck_layout_fail(const struct lu_env *env,
5352                               struct lfsck_component *com, bool new_checked)
5353 {
5354         struct lfsck_layout *lo = com->lc_file_ram;
5355
5356         down_write(&com->lc_sem);
5357         if (new_checked)
5358                 com->lc_new_checked++;
5359         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
5360         up_write(&com->lc_sem);
5361 }
5362
5363 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
5364                                           struct lfsck_component *com, bool init)
5365 {
5366         struct lfsck_instance   *lfsck   = com->lc_lfsck;
5367         struct lfsck_layout     *lo      = com->lc_file_ram;
5368         int                      rc;
5369
5370         if (!init) {
5371                 rc = lfsck_checkpoint_generic(env, com);
5372                 if (rc != 0)
5373                         return rc > 0 ? 0 : rc;
5374         }
5375
5376         down_write(&com->lc_sem);
5377         if (init) {
5378                 lo->ll_pos_latest_start =
5379                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5380         } else {
5381                 lo->ll_pos_last_checkpoint =
5382                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5383                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5384                                           lfsck->li_time_last_checkpoint;
5385                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5386                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5387                 com->lc_new_checked = 0;
5388         }
5389
5390         rc = lfsck_layout_store(env, com);
5391         up_write(&com->lc_sem);
5392
5393         CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos ["
5394                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5395                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5396
5397         return rc;
5398 }
5399
5400 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5401                                          struct lfsck_component *com, bool init)
5402 {
5403         struct lfsck_instance   *lfsck = com->lc_lfsck;
5404         struct lfsck_layout     *lo    = com->lc_file_ram;
5405         int                      rc;
5406
5407         if (com->lc_new_checked == 0 && !init)
5408                 return 0;
5409
5410         down_write(&com->lc_sem);
5411         if (init) {
5412                 lo->ll_pos_latest_start =
5413                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5414         } else {
5415                 lo->ll_pos_last_checkpoint =
5416                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5417                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5418                                           lfsck->li_time_last_checkpoint;
5419                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5420                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5421                 com->lc_new_checked = 0;
5422         }
5423
5424         rc = lfsck_layout_store(env, com);
5425         up_write(&com->lc_sem);
5426
5427         CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos ["
5428                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5429                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5430
5431         return rc;
5432 }
5433
5434 static int lfsck_layout_prep(const struct lu_env *env,
5435                              struct lfsck_component *com,
5436                              struct lfsck_start *start)
5437 {
5438         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5439         struct lfsck_layout     *lo     = com->lc_file_ram;
5440         struct lfsck_position   *pos    = &com->lc_pos_start;
5441
5442         fid_zero(&pos->lp_dir_parent);
5443         pos->lp_dir_cookie = 0;
5444         if (lo->ll_status == LS_COMPLETED ||
5445             lo->ll_status == LS_PARTIAL ||
5446             /* To handle orphan, must scan from the beginning. */
5447             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5448                 int rc;
5449
5450                 rc = lfsck_layout_reset(env, com, false);
5451                 if (rc == 0)
5452                         rc = lfsck_set_param(env, lfsck, start, true);
5453
5454                 if (rc != 0) {
5455                         CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: "
5456                                "rc = %d\n", lfsck_lfsck2name(lfsck), rc);
5457
5458                         return rc;
5459                 }
5460         }
5461
5462         down_write(&com->lc_sem);
5463         lo->ll_time_latest_start = ktime_get_real_seconds();
5464         spin_lock(&lfsck->li_lock);
5465         if (lo->ll_flags & LF_SCANNED_ONCE) {
5466                 if (!lfsck->li_drop_dryrun ||
5467                     lo->ll_pos_first_inconsistent == 0) {
5468                         lo->ll_status = LS_SCANNING_PHASE2;
5469                         list_move_tail(&com->lc_link,
5470                                        &lfsck->li_list_double_scan);
5471                         pos->lp_oit_cookie = 0;
5472                 } else {
5473                         int i;
5474
5475                         lo->ll_status = LS_SCANNING_PHASE1;
5476                         lo->ll_run_time_phase1 = 0;
5477                         lo->ll_run_time_phase2 = 0;
5478                         lo->ll_objs_checked_phase1 = 0;
5479                         lo->ll_objs_checked_phase2 = 0;
5480                         lo->ll_objs_failed_phase1 = 0;
5481                         lo->ll_objs_failed_phase2 = 0;
5482                         for (i = 0; i < LLIT_MAX; i++)
5483                                 lo->ll_objs_repaired[i] = 0;
5484
5485                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5486                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5487                 }
5488         } else {
5489                 lo->ll_status = LS_SCANNING_PHASE1;
5490                 if (!lfsck->li_drop_dryrun ||
5491                     lo->ll_pos_first_inconsistent == 0)
5492                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5493                 else
5494                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5495         }
5496         spin_unlock(&lfsck->li_lock);
5497         up_write(&com->lc_sem);
5498
5499         return 0;
5500 }
5501
5502 static int lfsck_layout_slave_prep(const struct lu_env *env,
5503                                    struct lfsck_component *com,
5504                                    struct lfsck_start_param *lsp)
5505 {
5506         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5507         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5508         struct lfsck_layout             *lo     = com->lc_file_ram;
5509         struct lfsck_start              *start  = lsp->lsp_start;
5510         int                              rc;
5511
5512         rc = lfsck_layout_prep(env, com, start);
5513         if (rc != 0)
5514                 return rc;
5515
5516         if (lo->ll_flags & LF_CRASHED_LASTID &&
5517             list_empty(&llsd->llsd_master_list)) {
5518                 LASSERT(lfsck->li_out_notify != NULL);
5519
5520                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5521                                      LE_LASTID_REBUILDING);
5522         }
5523
5524         if (!lsp->lsp_index_valid)
5525                 return 0;
5526
5527         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5528         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5529                 LASSERT(!llsd->llsd_rbtree_valid);
5530
5531                 down_write(&llsd->llsd_rb_rwsem);
5532                 rc = lfsck_rbtree_setup(env, com);
5533                 up_write(&llsd->llsd_rb_rwsem);
5534         }
5535
5536         CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos ["
5537                "%llu]\n", lfsck_lfsck2name(lfsck),
5538                com->lc_pos_start.lp_oit_cookie);
5539
5540         return rc;
5541 }
5542
5543 static int lfsck_layout_master_prep(const struct lu_env *env,
5544                                     struct lfsck_component *com,
5545                                     struct lfsck_start_param *lsp)
5546 {
5547         int rc;
5548         ENTRY;
5549
5550         rc = lfsck_layout_load_bitmap(env, com);
5551         if (rc != 0) {
5552                 rc = lfsck_layout_reset(env, com, false);
5553                 if (rc == 0)
5554                         rc = lfsck_set_param(env, com->lc_lfsck,
5555                                              lsp->lsp_start, true);
5556
5557                 if (rc != 0)
5558                         GOTO(log, rc);
5559         }
5560
5561         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5562         if (rc != 0)
5563                 RETURN(rc);
5564
5565         rc = lfsck_start_assistant(env, com, lsp);
5566
5567         GOTO(log, rc);
5568
5569 log:
5570         CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos ["
5571                "%llu]\n", lfsck_lfsck2name(com->lc_lfsck),
5572                com->lc_pos_start.lp_oit_cookie);
5573
5574         return 0;
5575 }
5576
5577 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5578 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5579                                      struct lfsck_component *com,
5580                                      struct dt_object *parent,
5581                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5582 {
5583         struct lfsck_thread_info        *info    = lfsck_env_info(env);
5584         struct lfsck_instance           *lfsck   = com->lc_lfsck;
5585         struct lfsck_bookmark           *bk      = &lfsck->li_bookmark_ram;
5586         struct lfsck_layout             *lo      = com->lc_file_ram;
5587         struct lfsck_assistant_data     *lad     = com->lc_data;
5588         struct lfsck_assistant_object   *lso     = NULL;
5589         struct lov_ost_data_v1          *objs;
5590         struct lfsck_tgt_descs          *ltds    = &lfsck->li_ost_descs;
5591         struct ptlrpc_thread            *mthread = &lfsck->li_thread;
5592         struct ptlrpc_thread            *athread = &lad->lad_thread;
5593         struct lu_buf                    buf;
5594         int                              rc      = 0;
5595         int                              i;
5596         __u32                            magic;
5597         __u16                            count;
5598         ENTRY;
5599
5600         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5601         magic = le32_to_cpu(lmm->lmm_magic);
5602         if (magic == LOV_MAGIC_V1) {
5603                 objs = &lmm->lmm_objects[0];
5604         } else {
5605                 LASSERT(magic == LOV_MAGIC_V3);
5606                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5607         }
5608
5609         count = le16_to_cpu(lmm->lmm_stripe_count);
5610         for (i = 0; i < count; i++, objs++) {
5611                 struct lu_fid           *fid    = &info->lti_fid;
5612                 struct ost_id           *oi     = &info->lti_oi;
5613                 struct lfsck_layout_req *llr;
5614                 struct lfsck_tgt_desc   *tgt    = NULL;
5615                 struct dt_object        *cobj   = NULL;
5616                 __u32                    index;
5617                 bool                     wakeup = false;
5618
5619                 if (unlikely(lovea_slot_is_dummy(objs)))
5620                         continue;
5621
5622                 wait_event_idle(mthread->t_ctl_waitq,
5623                                 lad->lad_prefetched < bk->lb_async_windows ||
5624                                 !thread_is_running(mthread) ||
5625                                 thread_is_stopped(athread));
5626
5627                 if (unlikely(!thread_is_running(mthread)) ||
5628                              thread_is_stopped(athread))
5629                         GOTO(out, rc = 0);
5630
5631                 if (unlikely(lfsck_is_dead_obj(parent)))
5632                         GOTO(out, rc = 0);
5633
5634                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5635                 index = le32_to_cpu(objs->l_ost_idx);
5636                 rc = ostid_to_fid(fid, oi, index);
5637                 if (rc != 0) {
5638                         CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID
5639                                ": "DOSTID", idx %u, comp_id %u\n",
5640                                lfsck_lfsck2name(lfsck),
5641                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5642                                index, comp_id);
5643                         goto next;
5644                 }
5645
5646                 tgt = lfsck_tgt_get(ltds, index);
5647                 if (unlikely(tgt == NULL)) {
5648                         CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which "
5649                                "did not join the layout LFSCK, comp_id %u\n",
5650                                lfsck_lfsck2name(lfsck), index, comp_id);
5651                         lfsck_lad_set_bitmap(env, com, index);
5652                         goto next;
5653                 }
5654
5655                 /* There is potential deadlock race condition between object
5656                  * destroy and layout LFSCK. Consider the following scenario:
5657                  *
5658                  * 1) The LFSCK thread obtained the parent object firstly, at
5659                  *    that time, the parent object has not been destroyed yet.
5660                  *
5661                  * 2) One RPC service thread destroyed the parent and all its
5662                  *    children objects. Because the LFSCK is referencing the
5663                  *    parent object, then the parent object will be marked as
5664                  *    dying in RAM. On the other hand, the parent object is
5665                  *    referencing all its children objects, then all children
5666                  *    objects will be marked as dying in RAM also.
5667                  *
5668                  * 3) The LFSCK thread tries to find some child object with
5669                  *    the parent object referenced. Then it will find that the
5670                  *    child object is dying. According to the object visibility
5671                  *    rules: the object with dying flag cannot be returned to
5672                  *    others. So the LFSCK thread has to wait until the dying
5673                  *    object has been purged from RAM, then it can allocate a
5674                  *    new object (with the same FID) in RAM. Unfortunately, the
5675                  *    LFSCK thread itself is referencing the parent object, and
5676                  *    cause the parent object cannot be purged, then cause the
5677                  *    child object cannot be purged also. So the LFSCK thread
5678                  *    will fall into deadlock.
5679                  */
5680                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5681                 if (IS_ERR(cobj)) {
5682                         if (lfsck_is_dead_obj(parent)) {
5683                                 lfsck_tgt_put(tgt);
5684
5685                                 GOTO(out, rc = 0);
5686                         }
5687
5688                         rc = PTR_ERR(cobj);
5689                         goto next;
5690                 }
5691
5692                 rc = dt_declare_attr_get(env, cobj);
5693                 if (rc)
5694                         goto next;
5695
5696                 rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
5697                 if (rc)
5698                         goto next;
5699
5700                 if (lso == NULL) {
5701                         struct lu_attr *attr = &info->lti_la;
5702
5703                         rc = dt_attr_get(env, parent, attr);
5704                         if (rc != 0)
5705                                 goto next;
5706
5707                         lso = lfsck_assistant_object_init(env,
5708                                 lfsck_dto2fid(parent), attr,
5709                                 lfsck->li_pos_current.lp_oit_cookie, false);
5710                         if (IS_ERR(lso)) {
5711                                 rc = PTR_ERR(lso);
5712                                 lso = NULL;
5713
5714                                 goto next;
5715                         }
5716                 }
5717
5718                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5719                                                       index, i);
5720                 if (IS_ERR(llr)) {
5721                         rc = PTR_ERR(llr);
5722                         goto next;
5723                 }
5724
5725                 cobj = NULL;
5726                 spin_lock(&lad->lad_lock);
5727                 if (lad->lad_assistant_status < 0) {
5728                         spin_unlock(&lad->lad_lock);
5729                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5730                         lfsck_tgt_put(tgt);
5731                         RETURN(lad->lad_assistant_status);
5732                 }
5733
5734                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5735                 if (lad->lad_prefetched == 0)
5736                         wakeup = true;
5737
5738                 lad->lad_prefetched++;
5739                 spin_unlock(&lad->lad_lock);
5740                 if (wakeup)
5741                         wake_up(&athread->t_ctl_waitq);
5742
5743 next:
5744                 down_write(&com->lc_sem);
5745                 com->lc_new_checked++;
5746                 if (rc < 0)
5747                         lfsck_layout_record_failure(env, lfsck, lo);
5748                 up_write(&com->lc_sem);
5749
5750                 if (cobj != NULL && !IS_ERR(cobj))
5751                         lfsck_object_put(env, cobj);
5752
5753                 if (likely(tgt != NULL))
5754                         lfsck_tgt_put(tgt);
5755
5756                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5757                         GOTO(out, rc);
5758         }
5759
5760         GOTO(out, rc = 0);
5761
5762 out:
5763         if (lso != NULL)
5764                 lfsck_assistant_object_put(env, lso);
5765
5766         return rc;
5767 }
5768
5769 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5770  * the OST-object's attribute and generate a structure lfsck_layout_req on the
5771  * list ::lad_req_list.
5772  *
5773  * For each request on above list, the lfsck_layout_assistant thread compares
5774  * the OST side attribute with local attribute, if inconsistent, then repair it.
5775  *
5776  * All above processing is async mode with pipeline. */
5777 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5778                                         struct lfsck_component *com,
5779                                         struct dt_object *obj)
5780 {
5781         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5782         struct ost_id                   *oi     = &info->lti_oi;
5783         struct lfsck_layout             *lo     = com->lc_file_ram;
5784         struct lfsck_assistant_data     *lad    = com->lc_data;
5785         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5786         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
5787         struct thandle                  *handle = NULL;
5788         struct lu_buf                   *buf    = &info->lti_big_buf;
5789         struct lov_mds_md_v1            *lmm    = NULL;
5790         struct dt_device                *dev    = lfsck_obj2dev(obj);
5791         struct lustre_handle             lh     = { 0 };
5792         struct lu_buf                    ea_buf = { NULL };
5793         struct lov_comp_md_v1           *lcm    = NULL;
5794         struct lov_comp_md_entry_v1     *lcme   = NULL;
5795         int                              rc     = 0;
5796         int                              size   = 0;
5797         __u32                            magic  = 0;
5798         __u16                            count  = 0;
5799         bool                             locked = false;
5800         bool                             stripe = false;
5801         bool                             bad_oi = false;
5802         ENTRY;
5803
5804         if (!S_ISREG(lfsck_object_type(obj)))
5805                 GOTO(out, rc = 0);
5806
5807         if (lad->lad_assistant_status < 0)
5808                 GOTO(out, rc = -ESRCH);
5809
5810         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5811         lmm_oi_cpu_to_le(oi, oi);
5812         dt_read_lock(env, obj, 0);
5813         locked = true;
5814
5815 again:
5816         bad_oi = false;
5817         if (dt_object_exists(obj) == 0 ||
5818             lfsck_is_dead_obj(obj))
5819                 GOTO(out, rc = 0);
5820
5821         rc = lfsck_layout_get_lovea(env, obj, buf);
5822         if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP)
5823                 /* Skip bad lov EA during the 1st cycle scanning, and
5824                  * try to recover it via orphan in the 2nd scanning. */
5825                 rc = 0;
5826         if (rc <= 0)
5827                 GOTO(out, rc);
5828
5829         size = rc;
5830         lmm = buf->lb_buf;
5831         magic = le32_to_cpu(lmm->lmm_magic);
5832         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5833                 struct lov_mds_md_v1 *v1;
5834                 int i;
5835
5836                 lcm = buf->lb_buf;
5837                 count = le16_to_cpu(lcm->lcm_entry_count);
5838                 for (i = 0; i < count; i++) {
5839                         lcme = &lcm->lcm_entries[i];
5840                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5841                         if (memcmp(oi, &v1->lmm_oi, sizeof(*oi)) != 0)
5842                                 goto fix;
5843                 }
5844
5845                 GOTO(out, stripe = true);
5846         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5847                 GOTO(out, stripe = true);
5848         }
5849
5850 fix:
5851         /* Inconsistent lmm_oi, should be repaired. */
5852         bad_oi = true;
5853
5854         if (bk->lb_param & LPF_DRYRUN) {
5855                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5856
5857                 GOTO(out, stripe = true);
5858         }
5859
5860         if (!lustre_handle_is_used(&lh)) {
5861                 dt_read_unlock(env, obj);
5862                 locked = false;
5863                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5864                                       MDS_INODELOCK_LAYOUT |
5865                                       MDS_INODELOCK_XATTR, LCK_EX);
5866                 if (rc != 0)
5867                         GOTO(out, rc);
5868
5869                 handle = lfsck_trans_create(env, dev, lfsck);
5870                 if (IS_ERR(handle))
5871                         GOTO(out, rc = PTR_ERR(handle));
5872
5873                 lfsck_buf_init(&ea_buf, buf->lb_buf, size);
5874                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5875                                           LU_XATTR_REPLACE, handle);
5876                 if (rc != 0)
5877                         GOTO(out, rc);
5878
5879                 rc = dt_trans_start_local(env, dev, handle);
5880                 if (rc != 0)
5881                         GOTO(out, rc);
5882
5883                 dt_write_lock(env, obj, 0);
5884                 locked = true;
5885
5886                 goto again;
5887         }
5888
5889         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5890                 struct lov_mds_md_v1 *v1;
5891                 int i;
5892
5893                 for (i = 0; i < count; i++) {
5894                         lcme = &lcm->lcm_entries[i];
5895                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5896                         v1->lmm_oi = *oi;
5897                 }
5898         } else {
5899                 lmm->lmm_oi = *oi;
5900         }
5901
5902         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5903                           LU_XATTR_REPLACE, handle);
5904         if (rc != 0)
5905                 GOTO(out, rc);
5906
5907         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5908
5909         GOTO(out, stripe = true);
5910
5911 out:
5912         if (locked) {
5913                 if (lustre_handle_is_used(&lh))
5914                         dt_write_unlock(env, obj);
5915                 else
5916                         dt_read_unlock(env, obj);
5917         }
5918
5919         if (handle != NULL && !IS_ERR(handle))
5920                 dt_trans_stop(env, dev, handle);
5921
5922         lfsck_ibits_unlock(&lh, LCK_EX);
5923
5924         if (bad_oi)
5925                 CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for "
5926                        DFID": rc = %d\n", lfsck_lfsck2name(lfsck),
5927                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5928                        PFID(lfsck_dto2fid(obj)), rc);
5929
5930         if (stripe) {
5931                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5932                         int i;
5933
5934                         for (i = 0; i < count; i++) {
5935                                 lcme = &lcm->lcm_entries[i];
5936                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5937                                       LCME_FL_INIT))
5938                                         continue;
5939
5940                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5941                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5942                                         le32_to_cpu(lcme->lcme_offset)),
5943                                         le32_to_cpu(lcme->lcme_id));
5944                         }
5945                 } else {
5946                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5947                 }
5948         } else {
5949                 down_write(&com->lc_sem);
5950                 com->lc_new_checked++;
5951                 if (rc < 0)
5952                         lfsck_layout_record_failure(env, lfsck, lo);
5953                 up_write(&com->lc_sem);
5954         }
5955
5956         return rc;
5957 }
5958
5959 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5960                                        struct lfsck_component *com,
5961                                        struct dt_object *obj)
5962 {
5963         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5964         struct lfsck_layout             *lo     = com->lc_file_ram;
5965         const struct lu_fid             *fid    = lfsck_dto2fid(obj);
5966         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5967         struct lfsck_layout_seq         *lls;
5968         __u64                            seq;
5969         __u64                            oid;
5970         int                              rc;
5971         ENTRY;
5972
5973         LASSERT(llsd != NULL);
5974
5975         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5976             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5977                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5978
5979                 wait_event_idle_timeout(thread->t_ctl_waitq,
5980                                         !thread_is_running(thread),
5981                                         cfs_time_seconds(1));
5982         }
5983
5984         lfsck_rbtree_update_bitmap(env, com, fid, false);
5985
5986         down_write(&com->lc_sem);
5987         if (fid_is_idif(fid))
5988                 seq = 0;
5989         else if (!fid_is_norm(fid) ||
5990                  !fid_is_for_ostobj(env, lfsck, obj, fid))
5991                 GOTO(unlock, rc = 0);
5992         else
5993                 seq = fid_seq(fid);
5994         com->lc_new_checked++;
5995
5996         lls = lfsck_layout_seq_lookup(llsd, seq);
5997         if (lls == NULL) {
5998                 OBD_ALLOC_PTR(lls);
5999                 if (unlikely(lls == NULL))
6000                         GOTO(unlock, rc = -ENOMEM);
6001
6002                 INIT_LIST_HEAD(&lls->lls_list);
6003                 lls->lls_seq = seq;
6004                 rc = lfsck_layout_lastid_load(env, com, lls);
6005                 if (rc != 0) {
6006                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
6007                               "load LAST_ID for %#llx: rc = %d\n",
6008                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
6009                         lo->ll_objs_failed_phase1++;
6010                         OBD_FREE_PTR(lls);
6011                         GOTO(unlock, rc);
6012                 }
6013
6014                 lfsck_layout_seq_insert(llsd, lls);
6015         }
6016
6017         if (unlikely(fid_is_last_id(fid)))
6018                 GOTO(unlock, rc = 0);
6019
6020         if (fid_is_idif(fid))
6021                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
6022         else
6023                 oid = fid_oid(fid);
6024
6025         if (oid > lls->lls_lastid_known)
6026                 lls->lls_lastid_known = oid;
6027
6028         if (oid > lls->lls_lastid) {
6029                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
6030                         /* OFD may create new objects during LFSCK scanning. */
6031                         rc = lfsck_layout_lastid_reload(env, com, lls);
6032                         if (unlikely(rc != 0)) {
6033                                 CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
6034                                       "reload LAST_ID for %#llx: rc = %d\n",
6035                                       lfsck_lfsck2name(com->lc_lfsck),
6036                                       lls->lls_seq, rc);
6037
6038                                 GOTO(unlock, rc);
6039                         }
6040
6041                         if (oid <= lls->lls_lastid ||
6042                             lo->ll_flags & LF_CRASHED_LASTID)
6043                                 GOTO(unlock, rc = 0);
6044
6045                         LASSERT(lfsck->li_out_notify != NULL);
6046
6047                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6048                                              LE_LASTID_REBUILDING);
6049                         lo->ll_flags |= LF_CRASHED_LASTID;
6050
6051                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
6052                                "LAST_ID file (2) for the sequence %#llx"
6053                                ", old value %llu, known value %llu\n",
6054                                lfsck_lfsck2name(lfsck), lls->lls_seq,
6055                                lls->lls_lastid, oid);
6056                 }
6057
6058                 lls->lls_lastid = oid;
6059                 lls->lls_dirty = 1;
6060         }
6061
6062         GOTO(unlock, rc = 0);
6063
6064 unlock:
6065         up_write(&com->lc_sem);
6066
6067         return rc;
6068 }
6069
6070 static int lfsck_layout_exec_dir(const struct lu_env *env,
6071                                  struct lfsck_component *com,
6072                                  struct lfsck_assistant_object *lso,
6073                                  struct lu_dirent *ent, __u16 type)
6074 {
6075         return 0;
6076 }
6077
6078 static int lfsck_layout_master_post(const struct lu_env *env,
6079                                     struct lfsck_component *com,
6080                                     int result, bool init)
6081 {
6082         struct lfsck_instance   *lfsck  = com->lc_lfsck;
6083         struct lfsck_layout     *lo     = com->lc_file_ram;
6084         int                      rc;
6085         ENTRY;
6086
6087         lfsck_post_generic(env, com, &result);
6088
6089         down_write(&com->lc_sem);
6090         spin_lock(&lfsck->li_lock);
6091         if (!init)
6092                 lo->ll_pos_last_checkpoint =
6093                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6094
6095         if (result > 0) {
6096                 if (lo->ll_flags & LF_INCOMPLETE)
6097                         lo->ll_status = LS_PARTIAL;
6098                 else
6099                         lo->ll_status = LS_SCANNING_PHASE2;
6100                 lo->ll_flags |= LF_SCANNED_ONCE;
6101                 lo->ll_flags &= ~LF_UPGRADE;
6102                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6103         } else if (result == 0) {
6104                 if (lfsck->li_status != 0)
6105                         lo->ll_status = lfsck->li_status;
6106                 else
6107                         lo->ll_status = LS_STOPPED;
6108                 if (lo->ll_status != LS_PAUSED)
6109                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6110         } else {
6111                 lo->ll_status = LS_FAILED;
6112                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6113         }
6114         spin_unlock(&lfsck->li_lock);
6115
6116         if (!init) {
6117                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6118                                           lfsck->li_time_last_checkpoint;
6119                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6120                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6121                 com->lc_new_checked = 0;
6122         }
6123
6124         rc = lfsck_layout_store(env, com);
6125         up_write(&com->lc_sem);
6126
6127         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
6128                lfsck_lfsck2name(lfsck), rc);
6129
6130         RETURN(rc);
6131 }
6132
6133 static int lfsck_layout_slave_post(const struct lu_env *env,
6134                                    struct lfsck_component *com,
6135                                    int result, bool init)
6136 {
6137         struct lfsck_instance   *lfsck = com->lc_lfsck;
6138         struct lfsck_layout     *lo    = com->lc_file_ram;
6139         int                      rc;
6140         bool                     done  = false;
6141
6142         down_write(&com->lc_sem);
6143         rc = lfsck_layout_lastid_store(env, com);
6144         if (rc != 0)
6145                 result = rc;
6146
6147         LASSERT(lfsck->li_out_notify != NULL);
6148
6149         spin_lock(&lfsck->li_lock);
6150         if (!init)
6151                 lo->ll_pos_last_checkpoint =
6152                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6153
6154         if (result > 0) {
6155                 lo->ll_status = LS_SCANNING_PHASE2;
6156                 lo->ll_flags |= LF_SCANNED_ONCE;
6157                 if (lo->ll_flags & LF_CRASHED_LASTID) {
6158                         done = true;
6159                         lo->ll_flags &= ~LF_CRASHED_LASTID;
6160
6161                         CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt "
6162                                "crashed LAST_ID files successfully\n",
6163                                lfsck_lfsck2name(lfsck));
6164                 }
6165                 lo->ll_flags &= ~LF_UPGRADE;
6166                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6167         } else if (result == 0) {
6168                 if (lfsck->li_status != 0)
6169                         lo->ll_status = lfsck->li_status;
6170                 else
6171                         lo->ll_status = LS_STOPPED;
6172                 if (lo->ll_status != LS_PAUSED)
6173                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6174         } else {
6175                 lo->ll_status = LS_FAILED;
6176                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6177         }
6178         spin_unlock(&lfsck->li_lock);
6179
6180         if (done)
6181                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6182                                      LE_LASTID_REBUILT);
6183
6184         if (!init) {
6185                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6186                                           lfsck->li_time_last_checkpoint;
6187                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6188                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6189                 com->lc_new_checked = 0;
6190         }
6191
6192         rc = lfsck_layout_store(env, com);
6193         up_write(&com->lc_sem);
6194
6195         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
6196
6197         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
6198                lfsck_lfsck2name(lfsck), rc);
6199
6200         return rc;
6201 }
6202
6203 static void lfsck_layout_dump(const struct lu_env *env,
6204                               struct lfsck_component *com, struct seq_file *m)
6205 {
6206         struct lfsck_instance   *lfsck = com->lc_lfsck;
6207         struct lfsck_bookmark   *bk    = &lfsck->li_bookmark_ram;
6208         struct lfsck_layout     *lo    = com->lc_file_ram;
6209         const char *prefix;
6210
6211         down_read(&com->lc_sem);
6212         if (bk->lb_param & LPF_DRYRUN)
6213                 prefix = "inconsistent";
6214         else
6215                 prefix = "repaired";
6216
6217         seq_printf(m, "name: lfsck_layout\n"
6218                    "magic: %#x\n"
6219                    "version: %d\n"
6220                    "status: %s\n",
6221                    lo->ll_magic,
6222                    bk->lb_version,
6223                    lfsck_status2name(lo->ll_status));
6224
6225         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
6226
6227         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
6228
6229         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
6230
6231         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
6232
6233         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
6234
6235         seq_printf(m, "latest_start_position: %llu\n"
6236                    "last_checkpoint_position: %llu\n"
6237                    "first_failure_position: %llu\n",
6238                    lo->ll_pos_latest_start,
6239                    lo->ll_pos_last_checkpoint,
6240                    lo->ll_pos_first_inconsistent);
6241
6242         seq_printf(m, "success_count: %u\n"
6243                    "%s_dangling: %llu\n"
6244                    "%s_unmatched_pair: %llu\n"
6245                    "%s_multiple_referenced: %llu\n"
6246                    "%s_orphan: %llu\n"
6247                    "%s_inconsistent_owner: %llu\n"
6248                    "%s_others: %llu\n"
6249                    "skipped: %llu\n"
6250                    "failed_phase1: %llu\n"
6251                    "failed_phase2: %llu\n",
6252                    lo->ll_success_count,
6253                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
6254                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
6255                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
6256                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
6257                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
6258                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
6259                    lo->ll_objs_skipped,
6260                    lo->ll_objs_failed_phase1,
6261                    lo->ll_objs_failed_phase2);
6262
6263         if (lo->ll_status == LS_SCANNING_PHASE1) {
6264                 time64_t duration = ktime_get_seconds() -
6265                                     lfsck->li_time_last_checkpoint;
6266                 u64 checked = lo->ll_objs_checked_phase1 +
6267                               com->lc_new_checked;
6268                 u64 speed = checked;
6269                 u64 new_checked = com->lc_new_checked;
6270                 time64_t rtime = lo->ll_run_time_phase1 + duration;
6271                 u64 pos;
6272
6273                 if (duration != 0)
6274                         new_checked = div64_s64(new_checked, duration);
6275                 if (rtime != 0)
6276                         speed = div64_s64(speed, rtime);
6277                 seq_printf(m, "checked_phase1: %llu\n"
6278                            "checked_phase2: %llu\n"
6279                            "run_time_phase1: %lld seconds\n"
6280                            "run_time_phase2: %lld seconds\n"
6281                            "average_speed_phase1: %llu items/sec\n"
6282                            "average_speed_phase2: N/A\n"
6283                            "real_time_speed_phase1: %llu items/sec\n"
6284                            "real_time_speed_phase2: N/A\n",
6285                            checked,
6286                            lo->ll_objs_checked_phase2,
6287                            rtime,
6288                            lo->ll_run_time_phase2,
6289                            speed,
6290                            new_checked);
6291
6292                 if (likely(lfsck->li_di_oit)) {
6293                         const struct dt_it_ops *iops =
6294                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
6295
6296                         /* The low layer otable-based iteration position may NOT
6297                          * exactly match the layout-based directory traversal
6298                          * cookie. Generally, it is not a serious issue. But the
6299                          * caller should NOT make assumption on that. */
6300                         pos = iops->store(env, lfsck->li_di_oit);
6301                         if (!lfsck->li_current_oit_processed)
6302                                 pos--;
6303                 } else {
6304                         pos = lo->ll_pos_last_checkpoint;
6305                 }
6306
6307                 seq_printf(m, "current_position: %llu\n", pos);
6308         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
6309                 time64_t duration = ktime_get_seconds() -
6310                                     com->lc_time_last_checkpoint;
6311                 u64 checked = lo->ll_objs_checked_phase2 +
6312                               com->lc_new_checked;
6313                 u64 speed1 = lo->ll_objs_checked_phase1;
6314                 u64 speed2 = checked;
6315                 u64 new_checked = com->lc_new_checked;
6316                 time64_t rtime = lo->ll_run_time_phase2 + duration;
6317
6318                 if (duration != 0)
6319                         new_checked = div64_s64(new_checked, duration);
6320                 if (lo->ll_run_time_phase1 != 0)
6321                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6322                 if (rtime != 0)
6323                         speed2 = div64_s64(speed2, rtime);
6324                 seq_printf(m, "checked_phase1: %llu\n"
6325                            "checked_phase2: %llu\n"
6326                            "run_time_phase1: %lld seconds\n"
6327                            "run_time_phase2: %lld seconds\n"
6328                            "average_speed_phase1: %llu items/sec\n"
6329                            "average_speed_phase2: %llu items/sec\n"
6330                            "real_time_speed_phase1: N/A\n"
6331                            "real_time_speed_phase2: %llu items/sec\n"
6332                            "current_position: "DFID"\n",
6333                            lo->ll_objs_checked_phase1,
6334                            checked,
6335                            lo->ll_run_time_phase1,
6336                            rtime,
6337                            speed1,
6338                            speed2,
6339                            new_checked,
6340                            PFID(&com->lc_fid_latest_scanned_phase2));
6341         } else {
6342                 __u64 speed1 = lo->ll_objs_checked_phase1;
6343                 __u64 speed2 = lo->ll_objs_checked_phase2;
6344
6345                 if (lo->ll_run_time_phase1 != 0)
6346                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6347                 if (lo->ll_run_time_phase2 != 0)
6348                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
6349                 seq_printf(m, "checked_phase1: %llu\n"
6350                            "checked_phase2: %llu\n"
6351                            "run_time_phase1: %lld seconds\n"
6352                            "run_time_phase2: %lld seconds\n"
6353                            "average_speed_phase1: %llu items/sec\n"
6354                            "average_speed_phase2: %llu objs/sec\n"
6355                            "real_time_speed_phase1: N/A\n"
6356                            "real_time_speed_phase2: N/A\n"
6357                            "current_position: N/A\n",
6358                            lo->ll_objs_checked_phase1,
6359                            lo->ll_objs_checked_phase2,
6360                            lo->ll_run_time_phase1,
6361                            lo->ll_run_time_phase2,
6362                            speed1,
6363                            speed2);
6364         }
6365
6366         up_read(&com->lc_sem);
6367 }
6368
6369 static int lfsck_layout_master_double_scan(const struct lu_env *env,
6370                                            struct lfsck_component *com)
6371 {
6372         struct lfsck_layout             *lo     = com->lc_file_ram;
6373         struct lfsck_assistant_data     *lad    = com->lc_data;
6374         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6375         struct lfsck_tgt_descs          *ltds;
6376         struct lfsck_tgt_desc           *ltd;
6377         struct lfsck_tgt_desc           *next;
6378         int                              rc;
6379
6380         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6381
6382         if (thread_is_stopped(&lad->lad_thread)) {
6383                 LASSERT(list_empty(&lad->lad_req_list));
6384                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6385                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6386
6387                 ltds = &lfsck->li_ost_descs;
6388                 spin_lock(&ltds->ltd_lock);
6389                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6390                                          ltd_layout_phase_list) {
6391                         list_del_init(&ltd->ltd_layout_phase_list);
6392                 }
6393                 spin_unlock(&ltds->ltd_lock);
6394
6395                 ltds = &lfsck->li_mdt_descs;
6396                 spin_lock(&ltds->ltd_lock);
6397                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6398                                          ltd_layout_phase_list) {
6399                         list_del_init(&ltd->ltd_layout_phase_list);
6400                 }
6401                 spin_unlock(&ltds->ltd_lock);
6402         }
6403
6404         return rc;
6405 }
6406
6407 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6408                                           struct lfsck_component *com)
6409 {
6410         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6411         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
6412         struct lfsck_layout             *lo     = com->lc_file_ram;
6413         struct ptlrpc_thread            *thread = &lfsck->li_thread;
6414         int                              rc;
6415         ENTRY;
6416
6417         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6418                lfsck_lfsck2name(lfsck));
6419
6420         atomic_inc(&lfsck->li_double_scan_count);
6421
6422         if (lo->ll_flags & LF_INCOMPLETE)
6423                 GOTO(done, rc = 1);
6424
6425         com->lc_new_checked = 0;
6426         com->lc_new_scanned = 0;
6427         com->lc_time_last_checkpoint = ktime_get_seconds();
6428         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6429                                        LFSCK_CHECKPOINT_INTERVAL;
6430
6431         while (1) {
6432                 rc = lfsck_layout_slave_query_master(env, com);
6433                 if (list_empty(&llsd->llsd_master_list)) {
6434                         if (unlikely(!thread_is_running(thread)))
6435                                 rc = 0;
6436                         else
6437                                 rc = 1;
6438
6439                         GOTO(done, rc);
6440                 }
6441
6442                 if (rc < 0)
6443                         GOTO(done, rc);
6444
6445                 rc = wait_event_idle_timeout(
6446                         thread->t_ctl_waitq,
6447                         !thread_is_running(thread) ||
6448                         lo->ll_flags & LF_INCOMPLETE ||
6449                         list_empty(&llsd->llsd_master_list),
6450                         cfs_time_seconds(30));
6451                 if (unlikely(!thread_is_running(thread)))
6452                         GOTO(done, rc = 0);
6453
6454                 if (lo->ll_flags & LF_INCOMPLETE)
6455                         GOTO(done, rc = 1);
6456
6457                 if (rc == 0)
6458                         continue;
6459
6460                 GOTO(done, rc = 1);
6461         }
6462
6463 done:
6464         rc = lfsck_layout_double_scan_result(env, com, rc);
6465         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6466                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6467         lfsck_layout_slave_quit(env, com);
6468         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6469                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6470
6471         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, "
6472                "status %d: rc = %d\n",
6473                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6474
6475         return rc;
6476 }
6477
6478 static void lfsck_layout_master_data_release(const struct lu_env *env,
6479                                              struct lfsck_component *com)
6480 {
6481         struct lfsck_assistant_data     *lad    = com->lc_data;
6482         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6483         struct lfsck_tgt_descs          *ltds;
6484         struct lfsck_tgt_desc           *ltd;
6485         struct lfsck_tgt_desc           *next;
6486
6487         LASSERT(lad != NULL);
6488         LASSERT(thread_is_init(&lad->lad_thread) ||
6489                 thread_is_stopped(&lad->lad_thread));
6490         LASSERT(list_empty(&lad->lad_req_list));
6491
6492         com->lc_data = NULL;
6493
6494         ltds = &lfsck->li_ost_descs;
6495         spin_lock(&ltds->ltd_lock);
6496         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6497                                  ltd_layout_phase_list) {
6498                 list_del_init(&ltd->ltd_layout_phase_list);
6499         }
6500         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6501                                  ltd_layout_phase_list) {
6502                 list_del_init(&ltd->ltd_layout_phase_list);
6503         }
6504         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6505                                  ltd_layout_list) {
6506                 list_del_init(&ltd->ltd_layout_list);
6507         }
6508         spin_unlock(&ltds->ltd_lock);
6509
6510         ltds = &lfsck->li_mdt_descs;
6511         spin_lock(&ltds->ltd_lock);
6512         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6513                                  ltd_layout_phase_list) {
6514                 list_del_init(&ltd->ltd_layout_phase_list);
6515         }
6516         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6517                                  ltd_layout_phase_list) {
6518                 list_del_init(&ltd->ltd_layout_phase_list);
6519         }
6520         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6521                                  ltd_layout_list) {
6522                 list_del_init(&ltd->ltd_layout_list);
6523         }
6524         spin_unlock(&ltds->ltd_lock);
6525
6526         bitmap_free(lad->lad_bitmap);
6527
6528         OBD_FREE_PTR(lad);
6529 }
6530
6531 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6532                                             struct lfsck_component *com)
6533 {
6534         struct lfsck_layout_slave_data *llsd = com->lc_data;
6535
6536         lfsck_layout_slave_quit(env, com);
6537         com->lc_data = NULL;
6538         OBD_FREE_PTR(llsd);
6539 }
6540
6541 static void lfsck_layout_master_quit(const struct lu_env *env,
6542                                      struct lfsck_component *com)
6543 {
6544         struct lfsck_assistant_data     *lad    = com->lc_data;
6545         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6546         struct lfsck_tgt_descs          *ltds;
6547         struct lfsck_tgt_desc           *ltd;
6548         struct lfsck_tgt_desc           *next;
6549
6550         LASSERT(lad != NULL);
6551
6552         lfsck_quit_generic(env, com);
6553
6554         LASSERT(thread_is_init(&lad->lad_thread) ||
6555                 thread_is_stopped(&lad->lad_thread));
6556         LASSERT(list_empty(&lad->lad_req_list));
6557
6558         ltds = &lfsck->li_ost_descs;
6559         spin_lock(&ltds->ltd_lock);
6560         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6561                                  ltd_layout_phase_list) {
6562                 list_del_init(&ltd->ltd_layout_phase_list);
6563         }
6564         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6565                                  ltd_layout_phase_list) {
6566                 list_del_init(&ltd->ltd_layout_phase_list);
6567         }
6568         spin_unlock(&ltds->ltd_lock);
6569
6570         ltds = &lfsck->li_mdt_descs;
6571         spin_lock(&ltds->ltd_lock);
6572         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6573                                  ltd_layout_phase_list) {
6574                 list_del_init(&ltd->ltd_layout_phase_list);
6575         }
6576         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6577                                  ltd_layout_phase_list) {
6578                 list_del_init(&ltd->ltd_layout_phase_list);
6579         }
6580         spin_unlock(&ltds->ltd_lock);
6581 }
6582
6583 static void lfsck_layout_slave_quit(const struct lu_env *env,
6584                                     struct lfsck_component *com)
6585 {
6586         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6587         struct lfsck_layout_seq          *lls;
6588         struct lfsck_layout_seq          *next;
6589         struct lfsck_layout_slave_target *llst;
6590
6591         LASSERT(llsd != NULL);
6592
6593         down_write(&com->lc_sem);
6594         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6595                                  lls_list) {
6596                 list_del_init(&lls->lls_list);
6597                 lfsck_object_put(env, lls->lls_lastid_obj);
6598                 OBD_FREE_PTR(lls);
6599         }
6600         up_write(&com->lc_sem);
6601
6602         spin_lock(&llsd->llsd_lock);
6603         while (!list_empty(&llsd->llsd_master_list)) {
6604                 llst = list_first_entry(&llsd->llsd_master_list,
6605                                         struct lfsck_layout_slave_target,
6606                                         llst_list);
6607                 list_del_init(&llst->llst_list);
6608                 spin_unlock(&llsd->llsd_lock);
6609                 lfsck_layout_llst_put(llst);
6610                 spin_lock(&llsd->llsd_lock);
6611         }
6612         spin_unlock(&llsd->llsd_lock);
6613
6614         lfsck_rbtree_cleanup(env, com);
6615 }
6616
6617 static int lfsck_layout_master_in_notify(const struct lu_env *env,
6618                                          struct lfsck_component *com,
6619                                          struct lfsck_request *lr)
6620 {
6621         struct lfsck_instance           *lfsck = com->lc_lfsck;
6622         struct lfsck_layout             *lo    = com->lc_file_ram;
6623         struct lfsck_assistant_data     *lad   = com->lc_data;
6624         struct lfsck_tgt_descs          *ltds;
6625         struct lfsck_tgt_desc           *ltd;
6626         bool                             fail  = false;
6627         ENTRY;
6628
6629         if (lr->lr_event == LE_PAIRS_VERIFY) {
6630                 int rc;
6631
6632                 rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
6633                                                      &lr->lr_fid2,
6634                                                      lr->lr_comp_id);
6635
6636                 RETURN(rc);
6637         }
6638
6639         CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
6640                "from %s %x, status %d, flags %x, flags2 %x\n",
6641                lfsck_lfsck2name(lfsck), lr->lr_event,
6642                (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
6643                lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
6644
6645         if (lr->lr_event != LE_PHASE1_DONE &&
6646             lr->lr_event != LE_PHASE2_DONE &&
6647             lr->lr_event != LE_PEER_EXIT)
6648                 RETURN(-EINVAL);
6649
6650         if (lr->lr_flags & LEF_FROM_OST)
6651                 ltds = &lfsck->li_ost_descs;
6652         else
6653                 ltds = &lfsck->li_mdt_descs;
6654         spin_lock(&ltds->ltd_lock);
6655         ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
6656         if (ltd == NULL) {
6657                 spin_unlock(&ltds->ltd_lock);
6658
6659                 RETURN(-ENXIO);
6660         }
6661
6662         list_del_init(&ltd->ltd_layout_phase_list);
6663         switch (lr->lr_event) {
6664         case LE_PHASE1_DONE:
6665                 if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
6666                         if (lr->lr_flags2 & LF_INCOMPLETE) {
6667                                 if (lr->lr_flags & LEF_FROM_OST)
6668                                         lfsck_lad_set_bitmap(env, com,
6669                                                              ltd->ltd_index);
6670                                 else
6671                                         lo->ll_flags |= LF_INCOMPLETE;
6672                         }
6673                         ltd->ltd_layout_done = 1;
6674                         list_del_init(&ltd->ltd_layout_list);
6675                         fail = true;
6676                         break;
6677                 }
6678
6679                 if (lr->lr_flags & LEF_FROM_OST) {
6680                         if (list_empty(&ltd->ltd_layout_list))
6681                                 list_add_tail(&ltd->ltd_layout_list,
6682                                               &lad->lad_ost_list);
6683                         list_add_tail(&ltd->ltd_layout_phase_list,
6684                                       &lad->lad_ost_phase2_list);
6685                 } else {
6686                         if (list_empty(&ltd->ltd_layout_list))
6687                                 list_add_tail(&ltd->ltd_layout_list,
6688                                               &lad->lad_mdt_list);
6689                         list_add_tail(&ltd->ltd_layout_phase_list,
6690                                       &lad->lad_mdt_phase2_list);
6691                 }
6692                 break;
6693         case LE_PHASE2_DONE:
6694                 ltd->ltd_layout_done = 1;
6695                 if (!list_empty(&ltd->ltd_layout_list))
6696                         list_del_init(&ltd->ltd_layout_list);
6697
6698                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6699                         lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
6700                         fail = true;
6701                 }
6702
6703                 break;
6704         case LE_PEER_EXIT:
6705                 fail = true;
6706                 ltd->ltd_layout_done = 1;
6707                 list_del_init(&ltd->ltd_layout_list);
6708                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
6709                     !(lr->lr_flags & LEF_FROM_OST))
6710                                 lo->ll_flags |= LF_INCOMPLETE;
6711                 break;
6712         default:
6713                 break;
6714         }
6715         spin_unlock(&ltds->ltd_lock);
6716
6717         if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
6718                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6719
6720                 memset(stop, 0, sizeof(*stop));
6721                 stop->ls_status = lr->lr_status;
6722                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6723                 lfsck_stop(env, lfsck->li_bottom, stop);
6724         } else if (lfsck_phase2_next_ready(lad)) {
6725                 wake_up(&lad->lad_thread.t_ctl_waitq);
6726         }
6727
6728         RETURN(0);
6729 }
6730
6731 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6732                                               struct lfsck_component *com,
6733                                               struct lfsck_req_local *lrl,
6734                                               struct thandle *th)
6735 {
6736         ENTRY;
6737
6738         switch (lrl->lrl_event) {
6739         case LEL_FID_ACCESSED:
6740                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6741                 RETURN(0);
6742         case LEL_PAIRS_VERIFY_LOCAL: {
6743                 int rc;
6744
6745                 lrl->lrl_status = LPVS_INIT;
6746                 /* Firstly, if the MDT-object which is claimed via OST-object
6747                  * local stored PFID xattr recognizes the OST-object, then it
6748                  * must be that the client given PFID is wrong. */
6749                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6750                                 &lrl->lrl_ff_local.ff_parent,
6751                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6752                 if (rc <= 0)
6753                         RETURN(0);
6754
6755                 lrl->lrl_status = LPVS_INCONSISTENT;
6756                 /* The OST-object local stored PFID xattr is stale. We need to
6757                  * check whether the MDT-object that is claimed via the client
6758                  * given PFID information recognizes the OST-object or not. If
6759                  * matches, then need to update the OST-object's PFID xattr. */
6760                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6761                                 &lrl->lrl_ff_client.ff_parent,
6762                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6763                 /* For rc < 0 case:
6764                  * We are not sure whether the client given PFID information
6765                  * is correct or not, do nothing to avoid improper fixing.
6766                  *
6767                  * For rc > 0 case:
6768                  * The client given PFID information is also invalid, we can
6769                  * NOT fix the OST-object inconsistency.
6770                  */
6771                 if (!rc) {
6772                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6773                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6774                 }
6775
6776                 RETURN(rc);
6777         }
6778         default:
6779                 break;
6780         }
6781
6782         RETURN(-EOPNOTSUPP);
6783 }
6784
6785 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6786                                         struct lfsck_component *com,
6787                                         struct lfsck_request *lr)
6788 {
6789         struct lfsck_instance *lfsck = com->lc_lfsck;
6790         struct lfsck_layout_slave_data *llsd = com->lc_data;
6791         struct lfsck_layout_slave_target *llst;
6792         int rc;
6793         ENTRY;
6794
6795         switch (lr->lr_event) {
6796         case LE_CONDITIONAL_DESTROY:
6797                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6798                 RETURN(rc);
6799         case LE_PHASE1_DONE: {
6800                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6801                         struct lfsck_layout *lo = com->lc_file_ram;
6802
6803                         lo->ll_flags |= LF_INCOMPLETE;
6804                         llst = lfsck_layout_llst_find_and_del(llsd,
6805                                                               lr->lr_index,
6806                                                               true);
6807                         if (llst != NULL) {
6808                                 lfsck_layout_llst_put(llst);
6809                                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6810                         }
6811                 }
6812
6813                 RETURN(0);
6814         }
6815         case LE_PHASE2_DONE:
6816         case LE_PEER_EXIT:
6817                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u "
6818                        "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck),
6819                        lr->lr_event, lr->lr_index, lr->lr_status);
6820                 break;
6821         default:
6822                 RETURN(-EINVAL);
6823         }
6824
6825         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6826         if (llst == NULL)
6827                 RETURN(0);
6828
6829         lfsck_layout_llst_put(llst);
6830         if (list_empty(&llsd->llsd_master_list))
6831                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6832
6833         if (lr->lr_event == LE_PEER_EXIT &&
6834             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6835              (list_empty(&llsd->llsd_master_list) &&
6836               (lr->lr_status == LS_STOPPED ||
6837                lr->lr_status == LS_CO_STOPPED)))) {
6838                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6839
6840                 memset(stop, 0, sizeof(*stop));
6841                 stop->ls_status = lr->lr_status;
6842                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6843                 lfsck_stop(env, lfsck->li_bottom, stop);
6844         }
6845
6846         RETURN(0);
6847 }
6848
6849 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6850 {
6851         int i;
6852
6853         for (i = 0; i < LLIT_MAX; i++)
6854                 *count += lo->ll_objs_repaired[i];
6855 }
6856
6857 static int lfsck_layout_query_all(const struct lu_env *env,
6858                                   struct lfsck_component *com,
6859                                   __u32 *mdts_count, __u32 *osts_count,
6860                                   __u64 *repaired)
6861 {
6862         struct lfsck_layout *lo = com->lc_file_ram;
6863         struct lfsck_tgt_descs *ltds;
6864         struct lfsck_tgt_desc *ltd;
6865         int idx;
6866         int rc;
6867         ENTRY;
6868
6869         rc = lfsck_query_all(env, com);
6870         if (rc != 0)
6871                 RETURN(rc);
6872
6873         ltds = &com->lc_lfsck->li_mdt_descs;
6874         down_read(&ltds->ltd_rw_sem);
6875         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6876                 ltd = lfsck_ltd2tgt(ltds, idx);
6877                 LASSERT(ltd != NULL);
6878
6879                 mdts_count[ltd->ltd_layout_status]++;
6880                 *repaired += ltd->ltd_layout_repaired;
6881         }
6882         up_read(&ltds->ltd_rw_sem);
6883
6884         ltds = &com->lc_lfsck->li_ost_descs;
6885         down_read(&ltds->ltd_rw_sem);
6886         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6887                 ltd = lfsck_ltd2tgt(ltds, idx);
6888                 LASSERT(ltd != NULL);
6889
6890                 osts_count[ltd->ltd_layout_status]++;
6891                 *repaired += ltd->ltd_layout_repaired;
6892         }
6893         up_read(&ltds->ltd_rw_sem);
6894
6895         down_read(&com->lc_sem);
6896         mdts_count[lo->ll_status]++;
6897         lfsck_layout_repaired(lo, repaired);
6898         up_read(&com->lc_sem);
6899
6900         RETURN(0);
6901 }
6902
6903 static int lfsck_layout_query(const struct lu_env *env,
6904                               struct lfsck_component *com,
6905                               struct lfsck_request *req,
6906                               struct lfsck_reply *rep,
6907                               struct lfsck_query *que, int idx)
6908 {
6909         struct lfsck_layout *lo = com->lc_file_ram;
6910         int rc = 0;
6911
6912         if (que != NULL) {
6913                 LASSERT(com->lc_lfsck->li_master);
6914
6915                 rc = lfsck_layout_query_all(env, com,
6916                                             que->lu_mdts_count[idx],
6917                                             que->lu_osts_count[idx],
6918                                             &que->lu_repaired[idx]);
6919         } else {
6920                 down_read(&com->lc_sem);
6921                 rep->lr_status = lo->ll_status;
6922                 if (req->lr_flags & LEF_QUERY_ALL)
6923                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6924                 up_read(&com->lc_sem);
6925         }
6926
6927         return rc;
6928 }
6929
6930 /* with lfsck::li_lock held */
6931 static int lfsck_layout_slave_join(const struct lu_env *env,
6932                                    struct lfsck_component *com,
6933                                    struct lfsck_start_param *lsp)
6934 {
6935         struct lfsck_instance            *lfsck = com->lc_lfsck;
6936         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6937         struct lfsck_layout_slave_target *llst;
6938         struct lfsck_start               *start = lsp->lsp_start;
6939         int                               rc    = 0;
6940         ENTRY;
6941
6942         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6943                 RETURN(0);
6944
6945         if (!lsp->lsp_index_valid)
6946                 RETURN(-EINVAL);
6947
6948         /* If someone is running the LFSCK without orphan handling,
6949          * it will not maintain the object accessing rbtree. So we
6950          * cannot join it for orphan handling. */
6951         if (!llsd->llsd_rbtree_valid)
6952                 RETURN(-EBUSY);
6953
6954         spin_unlock(&lfsck->li_lock);
6955         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6956         spin_lock(&lfsck->li_lock);
6957         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6958                 spin_unlock(&lfsck->li_lock);
6959                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6960                                                       true);
6961                 if (llst != NULL)
6962                         lfsck_layout_llst_put(llst);
6963                 spin_lock(&lfsck->li_lock);
6964                 rc = -EAGAIN;
6965         }
6966
6967         RETURN(rc);
6968 }
6969
6970 static const struct lfsck_operations lfsck_layout_master_ops = {
6971         .lfsck_reset            = lfsck_layout_reset,
6972         .lfsck_fail             = lfsck_layout_fail,
6973         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
6974         .lfsck_prep             = lfsck_layout_master_prep,
6975         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
6976         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6977         .lfsck_post             = lfsck_layout_master_post,
6978         .lfsck_dump             = lfsck_layout_dump,
6979         .lfsck_double_scan      = lfsck_layout_master_double_scan,
6980         .lfsck_data_release     = lfsck_layout_master_data_release,
6981         .lfsck_quit             = lfsck_layout_master_quit,
6982         .lfsck_in_notify        = lfsck_layout_master_in_notify,
6983         .lfsck_query            = lfsck_layout_query,
6984 };
6985
6986 static const struct lfsck_operations lfsck_layout_slave_ops = {
6987         .lfsck_reset            = lfsck_layout_reset,
6988         .lfsck_fail             = lfsck_layout_fail,
6989         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
6990         .lfsck_prep             = lfsck_layout_slave_prep,
6991         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
6992         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6993         .lfsck_post             = lfsck_layout_slave_post,
6994         .lfsck_dump             = lfsck_layout_dump,
6995         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
6996         .lfsck_data_release     = lfsck_layout_slave_data_release,
6997         .lfsck_quit             = lfsck_layout_slave_quit,
6998         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
6999         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
7000         .lfsck_query            = lfsck_layout_query,
7001         .lfsck_join             = lfsck_layout_slave_join,
7002 };
7003
7004 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
7005                                             struct lfsck_component *com,
7006                                             struct lfsck_position *pos)
7007 {
7008         struct lfsck_assistant_data     *lad = com->lc_data;
7009         struct lfsck_layout_req         *llr;
7010
7011         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
7012             LS_SCANNING_PHASE1)
7013                 return;
7014
7015         if (list_empty(&lad->lad_req_list))
7016                 return;
7017
7018         llr = list_first_entry(&lad->lad_req_list,
7019                                struct lfsck_layout_req,
7020                                llr_lar.lar_list);
7021         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
7022 }
7023
7024 const struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
7025         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
7026         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
7027         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
7028         .la_double_scan_result  = lfsck_layout_double_scan_result,
7029         .la_req_fini            = lfsck_layout_assistant_req_fini,
7030         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
7031 };
7032
7033 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
7034 {
7035         struct lfsck_component  *com;
7036         struct lfsck_layout     *lo;
7037         struct dt_object        *root = NULL;
7038         struct dt_object        *obj;
7039         int                      i;
7040         int                      rc;
7041         ENTRY;
7042
7043         OBD_ALLOC_PTR(com);
7044         if (com == NULL)
7045                 RETURN(-ENOMEM);
7046
7047         INIT_LIST_HEAD(&com->lc_link);
7048         INIT_LIST_HEAD(&com->lc_link_dir);
7049         init_rwsem(&com->lc_sem);
7050         atomic_set(&com->lc_ref, 1);
7051         com->lc_lfsck = lfsck;
7052         com->lc_type = LFSCK_TYPE_LAYOUT;
7053         if (lfsck->li_master) {
7054                 com->lc_ops = &lfsck_layout_master_ops;
7055                 com->lc_data = lfsck_assistant_data_init(
7056                                 &lfsck_layout_assistant_ops,
7057                                 LFSCK_LAYOUT);
7058                 if (com->lc_data == NULL)
7059                         GOTO(out, rc = -ENOMEM);
7060
7061                 for (i = 0; i < LFSCK_STF_COUNT; i++)
7062                         mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
7063         } else {
7064                 struct lfsck_layout_slave_data *llsd;
7065
7066                 com->lc_ops = &lfsck_layout_slave_ops;
7067                 OBD_ALLOC_PTR(llsd);
7068                 if (llsd == NULL)
7069                         GOTO(out, rc = -ENOMEM);
7070
7071                 INIT_LIST_HEAD(&llsd->llsd_seq_list);
7072                 INIT_LIST_HEAD(&llsd->llsd_master_list);
7073                 spin_lock_init(&llsd->llsd_lock);
7074                 llsd->llsd_rb_root = RB_ROOT;
7075                 init_rwsem(&llsd->llsd_rb_rwsem);
7076                 com->lc_data = llsd;
7077         }
7078         com->lc_file_size = sizeof(*lo);
7079         OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
7080         if (com->lc_file_ram == NULL)
7081                 GOTO(out, rc = -ENOMEM);
7082
7083         OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
7084         if (com->lc_file_disk == NULL)
7085                 GOTO(out, rc = -ENOMEM);
7086
7087         root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
7088         if (IS_ERR(root))
7089                 GOTO(out, rc = PTR_ERR(root));
7090
7091         if (unlikely(!dt_try_as_dir(env, root, true)))
7092                 GOTO(out, rc = -ENOTDIR);
7093
7094         obj = local_file_find_or_create(env, lfsck->li_los, root,
7095                                         LFSCK_LAYOUT,
7096                                         S_IFREG | S_IRUGO | S_IWUSR);
7097         if (IS_ERR(obj))
7098                 GOTO(out, rc = PTR_ERR(obj));
7099
7100         com->lc_obj = obj;
7101         rc = lfsck_layout_load(env, com);
7102         if (rc > 0) {
7103                 rc = lfsck_layout_reset(env, com, true);
7104         } else if (rc == -ENOENT) {
7105                 rc = lfsck_layout_init(env, com);
7106         } else if (lfsck->li_master) {
7107                 rc = lfsck_load_sub_trace_files(env, com,
7108                                 &dt_lfsck_layout_dangling_features,
7109                                 LFSCK_LAYOUT, false);
7110                 if (rc)
7111                         rc = lfsck_layout_reset(env, com, true);
7112         }
7113
7114         if (rc != 0)
7115                 GOTO(out, rc);
7116
7117         lo = com->lc_file_ram;
7118         switch (lo->ll_status) {
7119         case LS_INIT:
7120         case LS_COMPLETED:
7121         case LS_FAILED:
7122         case LS_STOPPED:
7123         case LS_PARTIAL:
7124                 spin_lock(&lfsck->li_lock);
7125                 list_add_tail(&com->lc_link, &lfsck->li_list_idle);
7126                 spin_unlock(&lfsck->li_lock);
7127                 break;
7128         default:
7129                 CERROR("%s: unknown lfsck_layout status %d\n",
7130                        lfsck_lfsck2name(lfsck), lo->ll_status);
7131                 fallthrough;
7132         case LS_SCANNING_PHASE1:
7133         case LS_SCANNING_PHASE2:
7134                 /* No need to store the status to disk right now.
7135                  * If the system crashed before the status stored,
7136                  * it will be loaded back when next time. */
7137                 lo->ll_status = LS_CRASHED;
7138                 if (!lfsck->li_master)
7139                         lo->ll_flags |= LF_INCOMPLETE;
7140                 fallthrough;
7141         case LS_PAUSED:
7142         case LS_CRASHED:
7143         case LS_CO_FAILED:
7144         case LS_CO_STOPPED:
7145         case LS_CO_PAUSED:
7146                 spin_lock(&lfsck->li_lock);
7147                 list_add_tail(&com->lc_link, &lfsck->li_list_scan);
7148                 spin_unlock(&lfsck->li_lock);
7149                 break;
7150         }
7151
7152         if (lo->ll_flags & LF_CRASHED_LASTID) {
7153                 LASSERT(lfsck->li_out_notify != NULL);
7154
7155                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
7156                                      LE_LASTID_REBUILDING);
7157         }
7158
7159         GOTO(out, rc = 0);
7160
7161 out:
7162         if (root != NULL && !IS_ERR(root))
7163                 lfsck_object_put(env, root);
7164
7165         if (rc != 0) {
7166                 lfsck_component_cleanup(env, com);
7167                 CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
7168                        lfsck_lfsck2name(lfsck), rc);
7169         }
7170
7171         return rc;
7172 }
7173
7174 struct lfsck_orphan_it {
7175         struct lfsck_component           *loi_com;
7176         struct lfsck_rbtree_node         *loi_lrn;
7177         struct lfsck_layout_slave_target *loi_llst;
7178         struct lu_fid                     loi_key;
7179         struct lu_orphan_rec_v3           loi_rec;
7180         __u64                             loi_hash;
7181         unsigned int                      loi_over:1;
7182 };
7183
7184 static int lfsck_fid_match_idx(const struct lu_env *env,
7185                                struct lfsck_instance *lfsck,
7186                                const struct lu_fid *fid, int idx)
7187 {
7188         struct seq_server_site  *ss;
7189         struct lu_server_fld    *sf;
7190         struct lu_seq_range     *range = &lfsck_env_info(env)->lti_range;
7191         int                      rc;
7192
7193         /* All abnormal cases will be returned to MDT0. */
7194         if (!fid_is_norm(fid)) {
7195                 if (idx == 0)
7196                         return 1;
7197
7198                 return 0;
7199         }
7200
7201         ss = lfsck_dev_site(lfsck);
7202         if (unlikely(ss == NULL))
7203                 return -ENOTCONN;
7204
7205         sf = ss->ss_server_fld;
7206         LASSERT(sf != NULL);
7207
7208         fld_range_set_any(range);
7209         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
7210         if (rc != 0)
7211                 return rc;
7212
7213         if (!fld_range_is_mdt(range))
7214                 return -EINVAL;
7215
7216         if (range->lsr_index == idx)
7217                 return 1;
7218
7219         return 0;
7220 }
7221
7222 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
7223                                         struct lfsck_instance *lfsck,
7224                                         struct dt_object *obj)
7225 {
7226         struct dt_device        *dev    = lfsck_obj2dev(obj);
7227         struct thandle          *handle;
7228         int                      rc;
7229         ENTRY;
7230
7231         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
7232                 GOTO(log, rc = 0);
7233
7234         handle = lfsck_trans_create(env, dev, lfsck);
7235         if (IS_ERR(handle))
7236                 RETURN_EXIT;
7237
7238         rc = dt_declare_ref_del(env, obj, handle);
7239         if (rc != 0)
7240                 GOTO(stop, rc);
7241
7242         rc = dt_declare_destroy(env, obj, handle);
7243         if (rc != 0)
7244                 GOTO(stop, rc);
7245
7246         rc = dt_trans_start_local(env, dev, handle);
7247         if (rc != 0)
7248                 GOTO(stop, rc);
7249
7250         dt_write_lock(env, obj, 0);
7251         rc = dt_ref_del(env, obj, handle);
7252         if (rc == 0)
7253                 rc = dt_destroy(env, obj, handle);
7254         dt_write_unlock(env, obj);
7255
7256         GOTO(stop, rc);
7257
7258 stop:
7259         dt_trans_stop(env, dev, handle);
7260
7261 log:
7262         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
7263                PFID(lfsck_dto2fid(obj)), rc);
7264
7265         RETURN_EXIT;
7266 }
7267
/*
 * Lookup handler of the orphan index.  Not supported: always returns
 * -EOPNOTSUPP and ignores all of its arguments.
 */
static int lfsck_orphan_index_lookup(const struct lu_env *env,
				     struct dt_object *dt, struct dt_rec *rec,
				     const struct dt_key *key)
{
	return -EOPNOTSUPP;
}
7275
/*
 * Declare-insert handler of the orphan index.  Not supported: always
 * returns -EOPNOTSUPP and ignores all of its arguments.
 */
static int lfsck_orphan_index_declare_insert(const struct lu_env *env,
					     struct dt_object *dt,
					     const struct dt_rec *rec,
					     const struct dt_key *key,
					     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7284
/*
 * Insert handler of the orphan index.  Not supported: always returns
 * -EOPNOTSUPP and ignores all of its arguments.
 */
static int lfsck_orphan_index_insert(const struct lu_env *env,
				     struct dt_object *dt,
				     const struct dt_rec *rec,
				     const struct dt_key *key,
				     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7293
/*
 * Declare-delete handler of the orphan index.  Not supported: always
 * returns -EOPNOTSUPP and ignores all of its arguments.
 */
static int lfsck_orphan_index_declare_delete(const struct lu_env *env,
					     struct dt_object *dt,
					     const struct dt_key *key,
					     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7301
/*
 * Delete handler of the orphan index.  Not supported: always returns
 * -EOPNOTSUPP and ignores all of its arguments.
 */
static int lfsck_orphan_index_delete(const struct lu_env *env,
				     struct dt_object *dt,
				     const struct dt_key *key,
				     struct thandle *handle)
{
	return -EOPNOTSUPP;
}
7309
7310 static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
7311                                           struct dt_object *dt,
7312                                           __u32 attr)
7313 {
7314         struct dt_device                *dev    = lu2dt_dev(dt->do_lu.lo_dev);
7315         struct lfsck_instance           *lfsck;
7316         struct lfsck_component          *com    = NULL;
7317         struct lfsck_layout_slave_data  *llsd;
7318         struct lfsck_orphan_it          *it     = NULL;
7319         struct lfsck_layout             *lo;
7320         int                              rc     = 0;
7321         ENTRY;
7322
7323         lfsck = lfsck_instance_find(dev, true, false);
7324         if (unlikely(lfsck == NULL))
7325                 RETURN(ERR_PTR(-ENXIO));
7326
7327         com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
7328         if (unlikely(com == NULL))
7329                 GOTO(out, rc = -ENOENT);
7330
7331         lo = com->lc_file_ram;
7332         if (lo->ll_flags & LF_INCOMPLETE)
7333                 GOTO(out, rc = -ESRCH);
7334
7335         llsd = com->lc_data;
7336         if (!llsd->llsd_rbtree_valid)
7337                 GOTO(out, rc = -ESRCH);
7338
7339         OBD_ALLOC_PTR(it);
7340         if (it == NULL)
7341                 GOTO(out, rc = -ENOMEM);
7342
7343         it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
7344         if (it->loi_llst == NULL)
7345                 GOTO(out, rc = -ENXIO);
7346
7347         if (dev->dd_record_fid_accessed) {
7348                 /* The first iteration against the rbtree, scan the whole rbtree
7349                  * to remove the nodes which do NOT need to be handled. */
7350                 down_write(&llsd->llsd_rb_rwsem);
7351                 if (dev->dd_record_fid_accessed) {
7352                         struct rb_node                  *node;
7353                         struct rb_node                  *next;
7354                         struct lfsck_rbtree_node        *lrn;
7355
7356                         /* No need to record the fid accessing anymore. */
7357                         dev->dd_record_fid_accessed = 0;
7358
7359                         node = rb_first(&llsd->llsd_rb_root);
7360                         while (node != NULL) {
7361                                 next = rb_next(node);
7362                                 lrn = rb_entry(node, struct lfsck_rbtree_node,
7363                                                lrn_node);
7364                                 if (atomic_read(&lrn->lrn_known_count) <=
7365                                     atomic_read(&lrn->lrn_accessed_count)) {
7366                                         rb_erase(node, &llsd->llsd_rb_root);
7367                                         lfsck_rbtree_free(lrn);
7368                                 }
7369                                 node = next;
7370                         }
7371                 }
7372                 up_write(&llsd->llsd_rb_rwsem);
7373         }
7374
7375         /* read lock the rbtree when init, and unlock when fini */
7376         down_read(&llsd->llsd_rb_rwsem);
7377         it->loi_com = com;
7378         com = NULL;
7379
7380         GOTO(out, rc = 0);
7381
7382 out:
7383         if (com != NULL)
7384                 lfsck_component_put(env, com);
7385
7386         CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
7387                lfsck_lfsck2name(lfsck), rc);
7388
7389         lfsck_instance_put(env, lfsck);
7390         if (rc != 0) {
7391                 if (it != NULL)
7392                         OBD_FREE_PTR(it);
7393
7394                 it = (struct lfsck_orphan_it *)ERR_PTR(rc);
7395         }
7396
7397         return (struct dt_it *)it;
7398 }
7399
7400 static void lfsck_orphan_it_fini(const struct lu_env *env,
7401                                  struct dt_it *di)
7402 {
7403         struct lfsck_orphan_it           *it    = (struct lfsck_orphan_it *)di;
7404         struct lfsck_component           *com   = it->loi_com;
7405         struct lfsck_layout_slave_data   *llsd;
7406         struct lfsck_layout_slave_target *llst;
7407
7408         if (com != NULL) {
7409                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7410                        lfsck_lfsck2name(com->lc_lfsck));
7411
7412                 llsd = com->lc_data;
7413                 up_read(&llsd->llsd_rb_rwsem);
7414                 llst = it->loi_llst;
7415                 LASSERT(llst != NULL);
7416
7417                 /* Save the key and hash for iterate next. */
7418                 llst->llst_fid = it->loi_key;
7419                 llst->llst_hash = it->loi_hash;
7420                 lfsck_layout_llst_put(llst);
7421                 lfsck_component_put(env, com);
7422         }
7423         OBD_FREE_PTR(it);
7424 }
7425
7426 /**
7427  * \retval       +1: the iteration finished
7428  * \retval        0: on success, not finished
7429  * \retval      -ve: on error
7430  */
7431 static int lfsck_orphan_it_next(const struct lu_env *env,
7432                                 struct dt_it *di)
7433 {
7434         struct lfsck_thread_info        *info   = lfsck_env_info(env);
7435         struct filter_fid               *ff     = &info->lti_ff;
7436         struct lu_attr                  *la     = &info->lti_la;
7437         struct lfsck_orphan_it          *it     = (struct lfsck_orphan_it *)di;
7438         struct lu_fid                   *key    = &it->loi_key;
7439         struct lu_orphan_rec_v3         *rec    = &it->loi_rec;
7440         struct ost_layout               *ol     = &rec->lor_layout;
7441         struct lfsck_component          *com    = it->loi_com;
7442         struct lfsck_instance           *lfsck  = com->lc_lfsck;
7443         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
7444         struct dt_object                *obj;
7445         struct lfsck_rbtree_node        *lrn;
7446         int                              pos;
7447         int                              rc;
7448         __u32                            save;
7449         __u32                            idx    = it->loi_llst->llst_index;
7450         bool                             exact  = false;
7451         ENTRY;
7452
7453         if (it->loi_over)
7454                 RETURN(1);
7455
7456 again0:
7457         lrn = it->loi_lrn;
7458         if (lrn == NULL) {
7459                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7460                 if (lrn == NULL) {
7461                         it->loi_over = 1;
7462                         RETURN(1);
7463                 }
7464
7465                 it->loi_lrn = lrn;
7466                 if (!exact) {
7467                         key->f_seq = lrn->lrn_seq;
7468                         key->f_oid = lrn->lrn_first_oid;
7469                         key->f_ver = 0;
7470                 }
7471         } else {
7472                 key->f_oid++;
7473                 if (unlikely(key->f_oid == 0)) {
7474                         key->f_seq++;
7475                         it->loi_lrn = NULL;
7476                         goto again0;
7477                 }
7478
7479                 if (key->f_oid >=
7480                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7481                         it->loi_lrn = NULL;
7482                         goto again0;
7483                 }
7484         }
7485
7486         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7487                      atomic_read(&lrn->lrn_accessed_count))) {
7488                 struct rb_node *next = rb_next(&lrn->lrn_node);
7489
7490                 while (next != NULL) {
7491                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7492                                        lrn_node);
7493                         if (atomic_read(&lrn->lrn_known_count) >
7494                             atomic_read(&lrn->lrn_accessed_count))
7495                                 break;
7496                         next = rb_next(next);
7497                 }
7498
7499                 if (next == NULL) {
7500                         it->loi_over = 1;
7501                         RETURN(1);
7502                 }
7503
7504                 it->loi_lrn = lrn;
7505                 key->f_seq = lrn->lrn_seq;
7506                 key->f_oid = lrn->lrn_first_oid;
7507                 key->f_ver = 0;
7508         }
7509
7510         pos = key->f_oid - lrn->lrn_first_oid;
7511
7512 again1:
7513         pos = find_next_bit(lrn->lrn_known_bitmap,
7514                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7515         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7516                 key->f_oid = lrn->lrn_first_oid + pos;
7517                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7518                         key->f_seq++;
7519                         key->f_oid = 0;
7520                 }
7521                 it->loi_lrn = NULL;
7522                 goto again0;
7523         }
7524
7525         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7526                 pos++;
7527                 goto again1;
7528         }
7529
7530         key->f_oid = lrn->lrn_first_oid + pos;
7531         obj = lfsck_object_find_bottom(env, lfsck, key);
7532         if (IS_ERR(obj)) {
7533                 rc = PTR_ERR(obj);
7534                 if (rc == -ENOENT) {
7535                         pos++;
7536                         goto again1;
7537                 }
7538                 RETURN(rc);
7539         }
7540
7541         dt_read_lock(env, obj, 0);
7542         if (dt_object_exists(obj) == 0 ||
7543             lfsck_is_dead_obj(obj)) {
7544                 dt_read_unlock(env, obj);
7545                 lfsck_object_put(env, obj);
7546                 pos++;
7547                 goto again1;
7548         }
7549
7550         rc = dt_attr_get(env, obj, la);
7551         if (rc != 0)
7552                 GOTO(out, rc);
7553
7554         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7555                           XATTR_NAME_FID);
7556         if (rc == -ENODATA) {
7557                 /* For the pre-created OST-object, update the bitmap to avoid
7558                  * others LFSCK (second phase) iteration to touch it again. */
7559                 if (la->la_ctime == 0) {
7560                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7561                                 atomic_inc(&lrn->lrn_accessed_count);
7562
7563                         /* For the race between repairing dangling referenced
7564                          * MDT-object and unlink the file, it may left orphan
7565                          * OST-object there. Destroy it now! */
7566                         if (unlikely(!(la->la_mode & S_ISUID))) {
7567                                 dt_read_unlock(env, obj);
7568                                 lfsck_layout_destroy_orphan(env, lfsck, obj);
7569                                 lfsck_object_put(env, obj);
7570                                 pos++;
7571                                 goto again1;
7572                         }
7573                 } else if (idx == 0) {
7574                         /* If the orphan OST-object has no parent information,
7575                          * regard it as referenced by the MDT-object on MDT0. */
7576                         fid_zero(&rec->lor_rec.lor_fid);
7577                         rec->lor_rec.lor_uid = la->la_uid;
7578                         rec->lor_rec.lor_gid = la->la_gid;
7579                         memset(ol, 0, sizeof(*ol));
7580                         rec->lor_layout_version = 0;
7581                         rec->lor_range = 0;
7582
7583                         GOTO(out, rc = 0);
7584                 }
7585
7586                 dt_read_unlock(env, obj);
7587                 lfsck_object_put(env, obj);
7588                 pos++;
7589                 goto again1;
7590         }
7591
7592         if (rc < sizeof(struct lu_fid))
7593                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7594
7595         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7596         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7597          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7598          * parent MDT-object's layout EA. */
7599         save = rec->lor_rec.lor_fid.f_stripe_idx;
7600         rec->lor_rec.lor_fid.f_ver = 0;
7601         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7602         /* If the orphan OST-object does not claim the MDT, then next.
7603          *
7604          * If we do not know whether it matches or not, then return it
7605          * to the MDT for further check. */
7606         if (rc == 0) {
7607                 dt_read_unlock(env, obj);
7608                 lfsck_object_put(env, obj);
7609                 pos++;
7610                 goto again1;
7611         }
7612
7613         rec->lor_rec.lor_fid.f_stripe_idx = save;
7614         rec->lor_rec.lor_uid = la->la_uid;
7615         rec->lor_rec.lor_gid = la->la_gid;
7616         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7617         rec->lor_layout_version =
7618                 le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC);
7619         rec->lor_range = le32_to_cpu(ff->ff_range);
7620
7621         CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, "
7622                "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, "
7623                "COMP end %llu, layout version %u, range %u\n",
7624                lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7625                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7626                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7627                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
7628                rec->lor_layout_version, rec->lor_range);
7629
7630         GOTO(out, rc = 0);
7631
7632 out:
7633         dt_read_unlock(env, obj);
7634         lfsck_object_put(env, obj);
7635         if (rc == 0)
7636                 it->loi_hash++;
7637
7638         return rc;
7639 }
7640
7641 /**
7642  * \retval       +1: locate to the exactly position
7643  * \retval        0: cannot locate to the exactly position,
7644  *                   call next() to move to a valid position.
7645  * \retval      -ve: on error
7646  */
7647 static int lfsck_orphan_it_get(const struct lu_env *env,
7648                                struct dt_it *di,
7649                                const struct dt_key *key)
7650 {
7651         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7652         int                      rc;
7653
7654         it->loi_key = *(struct lu_fid *)key;
7655         rc = lfsck_orphan_it_next(env, di);
7656         if (rc == 1)
7657                 return 0;
7658
7659         if (rc == 0)
7660                 return 1;
7661
7662         return rc;
7663 }
7664
/**
 * Iterator "put" method for the orphan index: nothing to do.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the orphan iterator (unused)
 */
static void lfsck_orphan_it_put(const struct lu_env *env, struct dt_it *di)
{
}
7669
7670 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7671                                           const struct dt_it *di)
7672 {
7673         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7674
7675         return (struct dt_key *)&it->loi_key;
7676 }
7677
7678 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7679                                     const struct dt_it *di)
7680 {
7681         return sizeof(struct lu_fid);
7682 }
7683
7684 static int lfsck_orphan_it_rec(const struct lu_env *env,
7685                                const struct dt_it *di,
7686                                struct dt_rec *rec,
7687                                __u32 attr)
7688 {
7689         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7690
7691         *(struct lu_orphan_rec_v3 *)rec = it->loi_rec;
7692
7693         return 0;
7694 }
7695
7696 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7697                                    const struct dt_it *di)
7698 {
7699         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7700
7701         return it->loi_hash;
7702 }
7703
7704 /**
7705  * \retval       +1: locate to the exactly position
7706  * \retval        0: cannot locate to the exactly position,
7707  *                   call next() to move to a valid position.
7708  * \retval      -ve: on error
7709  */
7710 static int lfsck_orphan_it_load(const struct lu_env *env,
7711                                 const struct dt_it *di,
7712                                 __u64 hash)
7713 {
7714         struct lfsck_orphan_it           *it   = (struct lfsck_orphan_it *)di;
7715         struct lfsck_layout_slave_target *llst = it->loi_llst;
7716         int                               rc;
7717
7718         LASSERT(llst != NULL);
7719
7720         if (hash != llst->llst_hash) {
7721                 CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan "
7722                        "iteration does not match the one when fini "
7723                        "%llu, to be reset.\n",
7724                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7725                        llst->llst_hash);
7726                 fid_zero(&llst->llst_fid);
7727                 llst->llst_hash = 0;
7728         }
7729
7730         it->loi_key = llst->llst_fid;
7731         it->loi_hash = llst->llst_hash;
7732         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7733         if (rc == 1)
7734                 return 0;
7735
7736         if (rc == 0)
7737                 return 1;
7738
7739         return rc;
7740 }
7741
/**
 * Iterator "key_rec" method for the orphan index: nothing is written to
 * \a key_rec.
 *
 * \param[in] env       execution environment (unused)
 * \param[in] di        the orphan iterator (unused)
 * \param[in] key_rec   unused
 *
 * \retval      0: always
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
                                   const struct dt_it *di, void *key_rec)
{
        return 0;
}
7748
7749 static const struct dt_index_operations lfsck_orphan_index_ops = {
7750         .dio_lookup             = lfsck_orphan_index_lookup,
7751         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7752         .dio_insert             = lfsck_orphan_index_insert,
7753         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7754         .dio_delete             = lfsck_orphan_index_delete,
7755         .dio_it = {
7756                 .init           = lfsck_orphan_it_init,
7757                 .fini           = lfsck_orphan_it_fini,
7758                 .get            = lfsck_orphan_it_get,
7759                 .put            = lfsck_orphan_it_put,
7760                 .next           = lfsck_orphan_it_next,
7761                 .key            = lfsck_orphan_it_key,
7762                 .key_size       = lfsck_orphan_it_key_size,
7763                 .rec            = lfsck_orphan_it_rec,
7764                 .store          = lfsck_orphan_it_store,
7765                 .load           = lfsck_orphan_it_load,
7766                 .key_rec        = lfsck_orphan_it_key_rec,
7767         }
7768 };