Whamcloud - gitweb
LU-17744 ldiskfs: mballoc stats fixes
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2017, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lu_object.h>
40 #include <dt_object.h>
41 #include <lustre_fid.h>
42 #include <lustre_lib.h>
43 #include <lustre_net.h>
44 #include <md_object.h>
45 #include <obd_class.h>
46
47 #include "lfsck_internal.h"
48
/*
 * Magic numbers of the layout LFSCK data, one per format revision (V1..V4).
 * NOTE(review): presumably stored in the layout LFSCK trace file so that an
 * old format can be recognised - confirm against the load/store code.
 */
#define LFSCK_LAYOUT_MAGIC_V1           0xB173AE14
#define LFSCK_LAYOUT_MAGIC_V2           0xB1734D76
#define LFSCK_LAYOUT_MAGIC_V3           0xB17371B9
#define LFSCK_LAYOUT_MAGIC_V4           0xB1732FED

/* The magic in current use is the newest revision. */
#define LFSCK_LAYOUT_MAGIC              LFSCK_LAYOUT_MAGIC_V4
55
/*
 * Per-sequence record kept by the layout LFSCK slave; instances are chained
 * on lfsck_layout_slave_data::llsd_seq_list.
 */
struct lfsck_layout_seq {
        /* link into lfsck_layout_slave_data::llsd_seq_list. */
        struct list_head         lls_list;
        /* The sequence this record describes. */
        __u64                    lls_seq;
        /* NOTE(review): looks like the last ID recorded for the sequence
         * (presumably as read from / to be written to LAST_ID) - confirm.
         */
        __u64                    lls_lastid;
        /* NOTE(review): presumably the largest object ID found while
         * scanning - confirm against the users of this field.
         */
        __u64                    lls_lastid_known;
        /* NOTE(review): presumably the object backing the last ID of the
         * sequence - confirm.
         */
        struct dt_object        *lls_lastid_obj;
        /* NOTE(review): presumably set when lls_lastid has to be written
         * back - confirm.
         */
        unsigned int             lls_dirty:1;
};
64
/*
 * One master that is involved in the layout verification, as tracked by the
 * layout LFSCK slave. The structure is reference counted via llst_ref and is
 * freed by lfsck_layout_llst_put() on the final put.
 */
struct lfsck_layout_slave_target {
        /* link into lfsck_layout_slave_data::llsd_master_list. */
        struct list_head        llst_list;
        /* The position for next record in the rbtree for iteration. */
        struct lu_fid           llst_fid;
        /* Dummy hash for iteration against the rbtree. */
        __u64                   llst_hash;
        /* Generation, set to 0 by lfsck_layout_llst_add().
         * NOTE(review): presumably compared against llsd_touch_gen - confirm.
         */
        __u64                   llst_gen;
        /* Reference count; released through lfsck_layout_llst_put(). */
        struct kref             llst_ref;
        /* Index of the master; lfsck_layout_llst_add() keeps it unique
         * within llsd_master_list.
         */
        __u32                   llst_index;
        /* How many times we have failed to get the master status. */
        int                     llst_failures;
};
78
/*
 * Private data of the layout LFSCK component on the slave side
 * (reached through lfsck_component::lc_data in this file).
 */
struct lfsck_layout_slave_data {
        /* list for lfsck_layout_seq */
        struct list_head         llsd_seq_list;

        /* list for the masters involve layout verification. */
        struct list_head         llsd_master_list;
        /* Taken around every llsd_master_list walk/update in this file. */
        spinlock_t               llsd_lock;
        /* NOTE(review): presumably the generation used to mark which
         * masters have been visited (see llst_gen) - confirm.
         */
        __u64                    llsd_touch_gen;
        /* In-RAM object standing for the layout rbtree; set by
         * lfsck_rbtree_setup(), released by lfsck_rbtree_cleanup().
         */
        struct dt_object        *llsd_rb_obj;
        /* Root of the rbtree of struct lfsck_rbtree_node. */
        struct rb_root           llsd_rb_root;
        /* Taken for read to look up / update bitmaps, and for write to
         * insert nodes or to invalidate the rbtree.
         */
        struct rw_semaphore      llsd_rb_rwsem;
        /* Set by lfsck_rbtree_setup(), cleared by lfsck_rbtree_cleanup();
         * the rbtree is not touched by updaters once this is clear.
         */
        unsigned int             llsd_rbtree_valid:1;
};
92
/*
 * Arguments bundle for an asynchronous request issued by the layout LFSCK
 * slave. NOTE(review): presumably handed to the RPC interpret callback that
 * queries a master - confirm against the users, which are outside this view.
 */
struct lfsck_layout_slave_async_args {
        /* Export the request is sent through. */
        struct obd_export                *llsaa_exp;
        /* The layout LFSCK component the request belongs to. */
        struct lfsck_component           *llsaa_com;
        /* The master (slave target) the request is about. */
        struct lfsck_layout_slave_target *llsaa_llst;
};
98
/*
 * Tell whether the extent border @border sits on a @size boundary.
 *
 * The check is a plain bit-mask test against (@size - 1), so the answer is
 * only meaningful when @size is a power of two.
 */
static inline bool lfsck_comp_extent_aligned(__u64 border, __u32 size)
{
        /* Computed in 32 bits first, then zero-extended to 64 bits. */
        __u64 mask = size - 1;

        return !(border & mask);
}
103
104 static inline void
105 lfsck_layout_llst_put(struct kref *kref)
106 {
107         struct lfsck_layout_slave_target *llst;
108
109         llst = container_of(kref, struct lfsck_layout_slave_target, llst_ref);
110         LASSERT(list_empty(&llst->llst_list));
111         OBD_FREE_PTR(llst);
112 }
113
114 static inline int
115 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
116 {
117         struct lfsck_layout_slave_target *llst;
118         struct lfsck_layout_slave_target *tmp;
119         int                               rc   = 0;
120
121         OBD_ALLOC_PTR(llst);
122         if (llst == NULL)
123                 return -ENOMEM;
124
125         INIT_LIST_HEAD(&llst->llst_list);
126         llst->llst_gen = 0;
127         llst->llst_index = index;
128         kref_init(&llst->llst_ref);
129
130         spin_lock(&llsd->llsd_lock);
131         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
132                 if (tmp->llst_index == index) {
133                         rc = -EALREADY;
134                         break;
135                 }
136         }
137         if (rc == 0)
138                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
139         spin_unlock(&llsd->llsd_lock);
140
141         if (rc != 0)
142                 OBD_FREE_PTR(llst);
143
144         return rc;
145 }
146
147 static inline void
148 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
149                       struct lfsck_layout_slave_target *llst)
150 {
151         bool del = false;
152
153         spin_lock(&llsd->llsd_lock);
154         if (!list_empty(&llst->llst_list)) {
155                 list_del_init(&llst->llst_list);
156                 del = true;
157         }
158         spin_unlock(&llsd->llsd_lock);
159
160         if (del)
161                 kref_put(&llst->llst_ref, lfsck_layout_llst_put);
162 }
163
164 static inline struct lfsck_layout_slave_target *
165 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
166                                __u32 index, bool unlink)
167 {
168         struct lfsck_layout_slave_target *llst;
169
170         spin_lock(&llsd->llsd_lock);
171         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
172                 if (llst->llst_index == index) {
173                         if (unlink)
174                                 list_del_init(&llst->llst_list);
175                         else
176                                 kref_get(&llst->llst_ref);
177                         spin_unlock(&llsd->llsd_lock);
178
179                         return llst;
180                 }
181         }
182         spin_unlock(&llsd->llsd_lock);
183
184         return NULL;
185 }
186
187 static struct lfsck_layout_req *
188 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
189                                 struct dt_object *child, __u32 comp_id,
190                                 __u32 ost_idx, __u32 lov_idx)
191 {
192         struct lfsck_layout_req *llr;
193
194         OBD_ALLOC_PTR(llr);
195         if (llr == NULL)
196                 return ERR_PTR(-ENOMEM);
197
198         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
199         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
200         llr->llr_child = child;
201         llr->llr_comp_id = comp_id;
202         llr->llr_ost_idx = ost_idx;
203         llr->llr_lov_idx = lov_idx;
204
205         return llr;
206 }
207
208 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
209                                             struct lfsck_assistant_req *lar)
210 {
211         struct lfsck_layout_req *llr =
212                 container_of(lar, struct lfsck_layout_req, llr_lar);
213
214         lfsck_object_put(env, llr->llr_child);
215         kref_put(&lar->lar_parent->lso_ref, lfsck_assistant_object_put);
216         OBD_FREE_PTR(llr);
217 }
218
219 static int
220 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
221                                                struct ptlrpc_request *req,
222                                                void *args, int rc)
223 {
224         if (rc == 0) {
225                 struct lfsck_async_interpret_args *laia = args;
226                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
227
228                 ltd->ltd_synced_failures = 1;
229                 atomic_dec(laia->laia_count);
230         }
231
232         return 0;
233 }
234
/**
 * Notify remote LFSCK instances about former failures.
 *
 * The local LFSCK instance has recorded which OSTs have ever failed to respond
 * to some LFSCK verification requests (maybe because of network issues or
 * trouble on the OST itself). During such a gap the OST may have missed the
 * verification of some OST-objects, so it cannot know whether those
 * OST-objects are referenced by MDT-objects or not. In the second-stage
 * scanning such OST-objects would be regarded as orphans; if an OST-object
 * also contains a bad parent FID as back reference, the LFSCK would be
 * misguided into making a wrong fix for the fake orphan.
 *
 * To avoid the above trouble, when the layout LFSCK finishes the first-stage
 * scanning, it scans the bitmap of the ever failed OSTs and notifies them that
 * they have missed some OST-object verification and should skip the handling
 * of orphan OST-objects for all MDTs that are in the layout LFSCK.
 *
 * Nothing is done unless LAD_INCOMPLETE is set in lad->lad_flags. If the
 * notification cannot be delivered to (and acknowledged by) every such OST,
 * LF_INCOMPLETE is set in lo->ll_flags. On return lr->lr_flags2 is
 * lo->ll_flags.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the lfsck component
 * \param[in] lr        pointer to the lfsck request; its lr_flags2 is
 *                      overwritten
 */
static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
                                                 struct lfsck_component *com,
                                                 struct lfsck_request *lr)
{
        struct lfsck_async_interpret_args *laia  =
                                &lfsck_env_info(env)->lti_laia2;
        struct lfsck_assistant_data *lad = com->lc_data;
        struct lfsck_layout *lo = com->lc_file_ram;
        struct lfsck_instance *lfsck = com->lc_lfsck;
        struct lfsck_tgt_descs *ltds  = &lfsck->li_ost_descs;
        struct lfsck_tgt_desc *ltd;
        struct ptlrpc_request_set *set;
        /* Number of notifications queued and not yet acknowledged. */
        atomic_t count;
        __u32 idx;
        int rc = 0;

        ENTRY;
        if (!test_bit(LAD_INCOMPLETE, &lad->lad_flags))
                RETURN_EXIT;

        /* If the MDT has ever failed to verify some OST-objects,
         * then sync failures with them firstly.
         */
        lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;

        atomic_set(&count, 0);
        memset(laia, 0, sizeof(*laia));
        laia->laia_count = &count;
        set = ptlrpc_prep_set();
        if (set == NULL)
                GOTO(out, rc = -ENOMEM);

        /* Queue one LFSCK_NOTIFY request per OST marked in the bitmap. */
        down_read(&ltds->ltd_rw_sem);
        for_each_set_bit(idx, lad->lad_bitmap, lad->lad_bitmap_count) {
                ltd = lfsck_ltd2tgt(ltds, idx);
                if (unlikely(!ltd))
                        continue;

                laia->laia_ltd = ltd;
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                lfsck_layout_assistant_sync_failures_interpret,
                                laia, LFSCK_NOTIFY);
                if (rc != 0) {
                        CDEBUG(D_LFSCK,
                               "%s: LFSCK assistant fail to notify target %x for %s phase1 done: rc = %d\n",
                               lfsck_lfsck2name(com->lc_lfsck), ltd->ltd_index,
                               lad->lad_name, rc);

                        break;
                }

                atomic_inc(&count);
        }
        up_read(&ltds->ltd_rw_sem);

        if (rc == 0 && atomic_read(&count) > 0)
                rc = ptlrpc_set_wait(env, set);

        ptlrpc_set_destroy(set);

        /* The interpret callback decrements the counter only for requests
         * that succeeded, so a non-zero remainder means that at least one
         * OST has not acknowledged the notification.
         */
        if (rc == 0 && atomic_read(&count) > 0)
                rc = -EINVAL;

        GOTO(out, rc);

out:
        if (rc != 0)
                /* If failed to sync failures with the OSTs, then have to
                 * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
                 * subsequent orphan OST-object handling.
                 */
                lo->ll_flags |= LF_INCOMPLETE;

        lr->lr_flags2 = lo->ll_flags;
}
331
332 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
333                                            struct lov_mds_md_v1 *lmm,
334                                            __u64 start, __u64 end,
335                                            __u32 comp_id,
336                                            bool ext, bool *dom)
337 {
338         __u32 magic;
339         __u32 pattern;
340         __u32 size;
341
342         magic = le32_to_cpu(lmm->lmm_magic);
343         /* If magic crashed, keep it there. Sometime later, during OST-object
344          * orphan handling, if some OST-object(s) back-point to it, it can be
345          * verified and repaired.
346          */
347         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
348                 int rc;
349
350                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
351                         rc = -EOPNOTSUPP;
352                 else
353                         rc = -EINVAL;
354
355                 CDEBUG(D_LFSCK, "%s LOV EA magic 0x%X for the file "DFID"\n",
356                        rc == -EINVAL ? "Unknown" : "Unsupported",
357                        magic, PFID(lfsck_dto2fid(obj)));
358
359                 return rc;
360         }
361
362         pattern = le32_to_cpu(lmm->lmm_pattern);
363         *dom = !!(lov_pattern(pattern) & LOV_PATTERN_MDT);
364
365         /* XXX: DoM file verification will be supportted via LU-11081. */
366         if (lov_pattern(pattern) & LOV_PATTERN_MDT) {
367 #if 0
368                 if (start != 0) {
369                         CDEBUG(D_LFSCK,
370                                "The DoM entry for "DFID" is not the first component in the mirror %x/%llu\n",
371                                PFID(lfsck_dto2fid(obj)), comp_id, start);
372
373                         return -EINVAL;
374                 }
375 #endif
376         } else if (!lov_pattern_supported_normal_comp(lov_pattern(pattern))) {
377                 CDEBUG(D_LFSCK,
378                        "Unsupported LOV EA pattern %u for the file "DFID" in the component %x\n",
379                        pattern, PFID(lfsck_dto2fid(obj)), comp_id);
380
381                 return -EOPNOTSUPP;
382         }
383
384         size = le32_to_cpu(lmm->lmm_stripe_size);
385         if (!ext && end != LUSTRE_EOF && start != end &&
386             !lfsck_comp_extent_aligned(end, size)) {
387                 CDEBUG(D_LFSCK,
388                        "not aligned border in PFL extent range [%llu - %llu) stripesize %u for the file "DFID" at idx %d\n",
389                        start, end, size, PFID(lfsck_dto2fid(obj)), comp_id);
390
391                 return -EINVAL;
392         }
393
394         return 0;
395 }
396
397 static int lfsck_layout_verify_header_foreign(struct dt_object *obj,
398                                               struct lov_foreign_md *lfm,
399                                               size_t len)
400 {
401         /* magic has been verified already */
402         __u32 value_len = le32_to_cpu(lfm->lfm_length);
403         /* type and flags are not checked for instance */
404
405         CDEBUG(D_INFO,
406                "foreign LOV EA, magic %x, len %u, type %x, flags %x, for file "DFID"\n",
407                le32_to_cpu(lfm->lfm_magic), value_len,
408                le32_to_cpu(lfm->lfm_type), le32_to_cpu(lfm->lfm_flags),
409                PFID(lfsck_dto2fid(obj)));
410
411         if (len != value_len + offsetof(typeof(*lfm), lfm_value))
412                 CDEBUG(D_LFSCK,
413                        "foreign LOV EA internal size %u does not match EA full size %zu for file "DFID"\n",
414                        value_len, len, PFID(lfsck_dto2fid(obj)));
415
416         /* nothing to repair */
417         return -ENODATA;
418 }
419
420 static int lfsck_layout_verify_header(struct dt_object *obj,
421                                       struct lov_mds_md_v1 *lmm, size_t len)
422 {
423         bool p_dom = false;
424         int rc = 0;
425
426         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1 ||
427             le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_SEL) {
428                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
429                 bool p_zero = false;
430                 int i;
431                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
432
433                 if (unlikely(count == 0)) {
434                         CDEBUG(D_LFSCK,
435                                "the PFL file "DFID" contains invalid components count 0\n",
436                                PFID(lfsck_dto2fid(obj)));
437
438                         return -EINVAL;
439                 }
440
441                 for (i = 0; i < count && !rc; i++) {
442                         struct lov_comp_md_entry_v1 *lcme =
443                                                 &lcm->lcm_entries[i];
444                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
445                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
446                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
447                         struct lov_mds_md_v1 *v1;
448                         bool ext, inited, zero;
449                         __u32 flags;
450
451                         if (unlikely(comp_id == LCME_ID_INVAL ||
452                                      comp_id > LCME_ID_MAX)) {
453                                 CDEBUG(D_LFSCK,
454                                        "found invalid PFL ID %u for the file "DFID" at idx %d\n",
455                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
456
457                                 return -EINVAL;
458                         }
459
460                         flags = le32_to_cpu(lcme->lcme_flags);
461                         ext = flags & LCME_FL_EXTENSION;
462                         inited = flags & LCME_FL_INIT;
463                         zero = !!(start == end);
464
465                         if ((i == 0) && zero) {
466                                 CDEBUG(D_LFSCK,
467                                        "invalid PFL comp %d: [%llu - %llu) for "DFID"\n",
468                                        i, start, end, PFID(lfsck_dto2fid(obj)));
469                                 return -EINVAL;
470                         }
471
472                         if ((zero && (inited || (i + 1 == count))) ||
473                             (start > end)) {
474                                 CDEBUG(D_LFSCK,
475                                        "invalid PFL comp %d/%d: [%llu, %llu) for "DFID", %sinited\n",
476                                        i, count, start, end,
477                                        PFID(lfsck_dto2fid(obj)),
478                                        inited ? "" : "NOT ");
479                                 return -EINVAL;
480                         }
481
482                         if (!ext && p_zero) {
483                                 CDEBUG(D_LFSCK,
484                                        "invalid PFL comp %d: [%llu, %llu) for "DFID": NOT extension after 0-length component\n",
485                                        i, start, end, PFID(lfsck_dto2fid(obj)));
486                                 return -EINVAL;
487                         }
488
489                         if (ext && (inited || p_dom || zero)) {
490                                 CDEBUG(D_LFSCK,
491                                        "invalid PFL comp %d: [%llu, %llu) for "DFID": %s\n",
492                                        i, start, end, PFID(lfsck_dto2fid(obj)),
493                                        inited ? "inited extension" :
494                                        p_dom ? "extension follows DOM" :
495                                        zero ? "zero length extension" : "");
496                                 return -EINVAL;
497                         }
498
499                         v1 = (struct lov_mds_md_v1 *)((char *)lmm +
500                                                 le32_to_cpu(lcme->lcme_offset));
501                         if (le32_to_cpu(v1->lmm_magic) == LOV_MAGIC_FOREIGN)
502                                 rc = lfsck_layout_verify_header_foreign(
503                                         obj, (struct lov_foreign_md *)v1,
504                                         le32_to_cpu(lcme->lcme_size));
505                         else
506                                 rc = lfsck_layout_verify_header_v1v3(obj, v1,
507                                         start, end, comp_id, ext, &p_dom);
508
509                         p_zero = zero;
510                 }
511         } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_FOREIGN) {
512                 rc = lfsck_layout_verify_header_foreign(obj,
513                                                 (struct lov_foreign_md *)lmm,
514                                                 len);
515         } else {
516                 rc = lfsck_layout_verify_header_v1v3(obj, lmm, 0, LUSTRE_EOF,
517                                                      0, false, &p_dom);
518         }
519
520         return rc;
521 }
522
523 static int lfsck_layout_get_lovea(const struct lu_env *env,
524                                   struct dt_object *obj, struct lu_buf *buf)
525 {
526         int rc;
527         int rc1;
528
529 again:
530         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
531         if (rc == -ERANGE) {
532                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
533                 if (rc <= 0)
534                         return !rc ? -ENODATA : rc;
535
536                 lu_buf_realloc(buf, rc);
537                 if (buf->lb_buf == NULL)
538                         return -ENOMEM;
539
540                 goto again;
541         }
542
543         if (rc <= 0)
544                 return !rc ? -ENODATA : rc;
545
546         if (unlikely(buf->lb_buf == NULL)) {
547                 lu_buf_alloc(buf, rc);
548                 if (buf->lb_buf == NULL)
549                         return -ENOMEM;
550
551                 goto again;
552         }
553
554         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf, rc);
555
556         return rc1 ? rc1 : rc;
557 }
558
/*
 * Geometry of the bitmaps hanging off every node of the layout rbtree:
 * a bitmap takes LFSCK_RBTREE_BITMAP_SIZE bytes (one page), that is
 * LFSCK_RBTREE_BITMAP_WIDTH bits, and LFSCK_RBTREE_BITMAP_MASK extracts the
 * bit index within a bitmap from an object ID.
 */
#define LFSCK_RBTREE_BITMAP_SIZE        PAGE_SIZE
#define LFSCK_RBTREE_BITMAP_WIDTH       (LFSCK_RBTREE_BITMAP_SIZE << 3)
#define LFSCK_RBTREE_BITMAP_MASK        (LFSCK_RBTREE_BITMAP_WIDTH - 1)
562
/*
 * One node of the layout rbtree. It covers the LFSCK_RBTREE_BITMAP_WIDTH
 * consecutive object IDs starting at lrn_first_oid within sequence lrn_seq,
 * with one bit per object in each of the two bitmaps.
 */
struct lfsck_rbtree_node {
        /* link into lfsck_layout_slave_data::llsd_rb_root. */
        struct rb_node   lrn_node;
        /* The FID sequence covered by this node. */
        __u64            lrn_seq;
        /* The first object ID covered, aligned to the bitmap width. */
        __u32            lrn_first_oid;
        /* Count of the bits set in lrn_known_bitmap. */
        atomic_t         lrn_known_count;
        /* Count of the bits set in lrn_accessed_bitmap. */
        atomic_t         lrn_accessed_count;
        /* Bitmap of the known objects, LFSCK_RBTREE_BITMAP_SIZE bytes. */
        void            *lrn_known_bitmap;
        /* Bitmap of the accessed objects, LFSCK_RBTREE_BITMAP_SIZE bytes. */
        void            *lrn_accessed_bitmap;
};
572
573 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
574                                    __u64 seq, __u32 oid)
575 {
576         if (seq < lrn->lrn_seq)
577                 return -1;
578
579         if (seq > lrn->lrn_seq)
580                 return 1;
581
582         if (oid < lrn->lrn_first_oid)
583                 return -1;
584
585         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
586                 return 1;
587
588         return 0;
589 }
590
591 /* The caller should hold llsd->llsd_rb_lock. */
592 static struct lfsck_rbtree_node *
593 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
594                     const struct lu_fid *fid, bool *exact)
595 {
596         struct rb_node *node = llsd->llsd_rb_root.rb_node;
597         struct rb_node *prev = NULL;
598         struct lfsck_rbtree_node *lrn = NULL;
599         int rc = 0;
600
601         if (exact != NULL)
602                 *exact = true;
603
604         while (node != NULL) {
605                 prev = node;
606                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
607                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
608                 if (rc < 0)
609                         node = node->rb_left;
610                 else if (rc > 0)
611                         node = node->rb_right;
612                 else
613                         return lrn;
614         }
615
616         if (exact == NULL)
617                 return NULL;
618
619         /* If there is no exactly matched one, then to the next valid one. */
620         *exact = false;
621
622         /* The rbtree is empty. */
623         if (rc == 0)
624                 return NULL;
625
626         if (rc < 0)
627                 return lrn;
628
629         node = rb_next(prev);
630
631         /* The end of the rbtree. */
632         if (node == NULL)
633                 return NULL;
634
635         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
636
637         return lrn;
638 }
639
640 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
641                                                   const struct lu_fid *fid)
642 {
643         struct lfsck_rbtree_node *lrn;
644
645         OBD_ALLOC_PTR(lrn);
646         if (lrn == NULL)
647                 return ERR_PTR(-ENOMEM);
648
649         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
650         if (lrn->lrn_known_bitmap == NULL) {
651                 OBD_FREE_PTR(lrn);
652
653                 return ERR_PTR(-ENOMEM);
654         }
655
656         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
657         if (lrn->lrn_accessed_bitmap == NULL) {
658                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
659                 OBD_FREE_PTR(lrn);
660
661                 return ERR_PTR(-ENOMEM);
662         }
663
664         RB_CLEAR_NODE(&lrn->lrn_node);
665         lrn->lrn_seq = fid_seq(fid);
666         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
667         atomic_set(&lrn->lrn_known_count, 0);
668         atomic_set(&lrn->lrn_accessed_count, 0);
669
670         return lrn;
671 }
672
673 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
674 {
675         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
676         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
677         OBD_FREE_PTR(lrn);
678 }
679
680 /* The caller should hold lock. */
681 static struct lfsck_rbtree_node *
682 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
683                     struct lfsck_rbtree_node *lrn)
684 {
685         struct rb_node **pos = &llsd->llsd_rb_root.rb_node;
686         struct rb_node *parent = NULL;
687         struct lfsck_rbtree_node  *tmp;
688         int rc;
689
690         while (*pos != NULL) {
691                 parent = *pos;
692                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
693                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
694                 if (rc < 0)
695                         pos = &(*pos)->rb_left;
696                 else if (rc > 0)
697                         pos = &(*pos)->rb_right;
698                 else
699                         return tmp;
700         }
701
702         rb_link_node(&lrn->lrn_node, parent, pos);
703         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
704
705         return lrn;
706 }
707
/*
 * Forward declaration: lfsck_rbtree_setup() below installs these index
 * operations on the in-RAM rbtree object. The initialiser is expected
 * further down in the file.
 */
static const struct dt_index_operations lfsck_orphan_index_ops;
709
710 static int lfsck_rbtree_setup(const struct lu_env *env,
711                               struct lfsck_component *com)
712 {
713         struct lu_fid *fid = &lfsck_env_info(env)->lti_fid;
714         struct lfsck_instance *lfsck = com->lc_lfsck;
715         struct dt_device *dev = lfsck->li_bottom;
716         struct lfsck_layout_slave_data *llsd = com->lc_data;
717         struct dt_object *obj;
718
719         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
720         fid->f_oid = lfsck_dev_idx(lfsck);
721         fid->f_ver = 0;
722         obj = dt_locate(env, dev, fid);
723         if (IS_ERR(obj))
724                 RETURN(PTR_ERR(obj));
725
726         /* Generate an in-RAM object to stand for the layout rbtree.
727          * Scanning the layout rbtree will be via the iteration over
728          * the object. In the future, the rbtree may be written onto
729          * disk with the object.
730          *
731          * Mark the object to be as exist.
732          */
733         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
734         obj->do_index_ops = &lfsck_orphan_index_ops;
735         llsd->llsd_rb_obj = obj;
736         llsd->llsd_rbtree_valid = 1;
737         dev->dd_record_fid_accessed = 1;
738
739         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
740                lfsck_lfsck2name(lfsck));
741
742         return 0;
743 }
744
745 static void lfsck_rbtree_cleanup(const struct lu_env *env,
746                                  struct lfsck_component *com)
747 {
748         struct lfsck_instance *lfsck = com->lc_lfsck;
749         struct lfsck_layout_slave_data *llsd  = com->lc_data;
750         struct rb_node *node  = rb_first(&llsd->llsd_rb_root);
751         struct rb_node *next;
752         struct lfsck_rbtree_node *lrn;
753
754         lfsck->li_bottom->dd_record_fid_accessed = 0;
755         /* Invalid the rbtree, then no others will use it. */
756         down_write(&llsd->llsd_rb_rwsem);
757         llsd->llsd_rbtree_valid = 0;
758         up_write(&llsd->llsd_rb_rwsem);
759
760         while (node != NULL) {
761                 next = rb_next(node);
762                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
763                 rb_erase(node, &llsd->llsd_rb_root);
764                 lfsck_rbtree_free(lrn);
765                 node = next;
766         }
767
768         if (llsd->llsd_rb_obj != NULL) {
769                 lfsck_object_put(env, llsd->llsd_rb_obj);
770                 llsd->llsd_rb_obj = NULL;
771         }
772
773         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
774                lfsck_lfsck2name(lfsck));
775 }
776
/*
 * Record the object @fid in the layout rbtree as known and, if @accessed is
 * set, as accessed as well.
 *
 * Insane FIDs, LAST_ID FIDs and FIDs that are neither IDIF nor normal ones
 * are ignored. The node covering @fid is created on demand. If that fails
 * while recording an access, the layout LFSCK is flagged as LF_INCOMPLETE
 * and the whole rbtree is torn down, since the bitmaps cannot be trusted
 * any longer.
 */
static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
                                       struct lfsck_component *com,
                                       const struct lu_fid *fid,
                                       bool accessed)
{
        struct lfsck_layout_slave_data *llsd    = com->lc_data;
        struct lfsck_rbtree_node *lrn;
        /* Whether the write lock (instead of the read lock) is held. */
        bool insert = false;
        int idx;
        int rc = 0;

        ENTRY;
        if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
                RETURN_EXIT;

        if (!fid_is_idif(fid) && !fid_is_norm(fid))
                RETURN_EXIT;

        down_read(&llsd->llsd_rb_rwsem);
        if (!llsd->llsd_rbtree_valid)
                GOTO(unlock, rc = 0);

        lrn = lfsck_rbtree_search(llsd, fid, NULL);
        if (lrn == NULL) {
                struct lfsck_rbtree_node *tmp;

                LASSERT(!insert);

                /* No node covers the FID yet. Drop the read lock, allocate
                 * a node without any lock held, then take the write lock
                 * for the insertion.
                 */
                up_read(&llsd->llsd_rb_rwsem);
                tmp = lfsck_rbtree_new(env, fid);
                if (IS_ERR(tmp))
                        GOTO(out, rc = PTR_ERR(tmp));

                insert = true;
                down_write(&llsd->llsd_rb_rwsem);
                /* The rbtree may have been invalidated while no lock was
                 * held.
                 */
                if (!llsd->llsd_rbtree_valid) {
                        lfsck_rbtree_free(tmp);
                        GOTO(unlock, rc = 0);
                }

                /* Somebody else may have inserted a node for the same range
                 * in the meantime; use that one and drop ours.
                 */
                lrn = lfsck_rbtree_insert(llsd, tmp);
                if (lrn != tmp)
                        lfsck_rbtree_free(tmp);
        }

        idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
        /* Any accessed object must be a known object. */
        if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
                atomic_inc(&lrn->lrn_known_count);
        if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
                atomic_inc(&lrn->lrn_accessed_count);

        GOTO(unlock, rc = 0);

unlock:
        /* Release whichever side of the rwsem is held at this point. */
        if (insert)
                up_write(&llsd->llsd_rb_rwsem);
        else
                up_read(&llsd->llsd_rb_rwsem);
out:
        /* Reached with no lock held. A failure only matters when an access
         * had to be recorded.
         */
        if (rc != 0 && accessed) {
                struct lfsck_layout *lo = com->lc_file_ram;

                CDEBUG(D_LFSCK,
                       "%s: fail to update OST-objects accessing bitmap, and will cause incorrect LFSCK OST-object handling, so disable it to cancel orphan handling for related device. rc = %d\n",
                       lfsck_lfsck2name(com->lc_lfsck), rc);

                lo->ll_flags |= LF_INCOMPLETE;
                lfsck_rbtree_cleanup(env, com);
        }
}
848
849 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
850                                   const struct lfsck_layout_dangling_key *src)
851 {
852         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
853         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
854         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
855 }
856
857 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
858                                   const struct lfsck_layout_dangling_key *src)
859 {
860         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
861         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
862         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
863 }
864
865 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
866                                   const struct lfsck_layout_dangling_key *src)
867 {
868         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
869         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
870         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
871 }
872
873 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
874                                   const struct lfsck_layout_dangling_key *src)
875 {
876         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
877         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
878         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
879 }
880
881 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
882                                    const struct lfsck_layout *src)
883 {
884         int i;
885
886         des->ll_magic = le32_to_cpu(src->ll_magic);
887         des->ll_status = le32_to_cpu(src->ll_status);
888         des->ll_flags = le32_to_cpu(src->ll_flags);
889         des->ll_success_count = le32_to_cpu(src->ll_success_count);
890         des->ll_run_time_phase1 = le64_to_cpu(src->ll_run_time_phase1);
891         des->ll_run_time_phase2 = le64_to_cpu(src->ll_run_time_phase2);
892         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
893         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
894         des->ll_time_last_checkpoint =
895                                 le64_to_cpu(src->ll_time_last_checkpoint);
896         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
897         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
898         des->ll_pos_first_inconsistent =
899                         le64_to_cpu(src->ll_pos_first_inconsistent);
900         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
901         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
902         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
903         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
904         for (i = 0; i < LLIT_MAX; i++)
905                 des->ll_objs_repaired[i] =
906                                 le64_to_cpu(src->ll_objs_repaired[i]);
907         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
908         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
909         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
910                        &src->ll_lldk_latest_scanned_phase2);
911 }
912
913 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
914                                    const struct lfsck_layout *src)
915 {
916         int i;
917
918         des->ll_magic = cpu_to_le32(src->ll_magic);
919         des->ll_status = cpu_to_le32(src->ll_status);
920         des->ll_flags = cpu_to_le32(src->ll_flags);
921         des->ll_success_count = cpu_to_le32(src->ll_success_count);
922         des->ll_run_time_phase1 = cpu_to_le64(src->ll_run_time_phase1);
923         des->ll_run_time_phase2 = cpu_to_le64(src->ll_run_time_phase2);
924         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
925         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
926         des->ll_time_last_checkpoint =
927                                 cpu_to_le64(src->ll_time_last_checkpoint);
928         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
929         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
930         des->ll_pos_first_inconsistent =
931                         cpu_to_le64(src->ll_pos_first_inconsistent);
932         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
933         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
934         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
935         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
936         for (i = 0; i < LLIT_MAX; i++)
937                 des->ll_objs_repaired[i] =
938                                 cpu_to_le64(src->ll_objs_repaired[i]);
939         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
940         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
941         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
942                        &src->ll_lldk_latest_scanned_phase2);
943 }
944
945 /**
946  * Load the OST bitmap from the lfsck_layout trace file.
947  *
948  * \param[in] env       pointer to the thread context
949  * \param[in] com       pointer to the lfsck component
950  *
951  * \retval              0 for success
952  * \retval              negative error number on failure or data corruption
953  */
954 static int lfsck_layout_load_bitmap(const struct lu_env *env,
955                                     struct lfsck_component *com)
956 {
957         struct dt_object *obj = com->lc_obj;
958         struct lfsck_assistant_data *lad = com->lc_data;
959         struct lfsck_layout *lo = com->lc_file_ram;
960         unsigned long *bitmap = lad->lad_bitmap;
961         loff_t pos = com->lc_file_size;
962         ssize_t size;
963         __u32 nbits;
964         int rc;
965
966         ENTRY;
967         if (com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len > lo->ll_bitmap_size)
968                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_mask_len;
969         else
970                 nbits = lo->ll_bitmap_size;
971
972         if (unlikely(nbits < BITS_PER_LONG))
973                 nbits = BITS_PER_LONG;
974
975         if (nbits > lad->lad_bitmap_count) {
976                 u32 new_bits = lad->lad_bitmap_count;
977                 unsigned long *new_bitmap;
978
979                 while (new_bits < nbits)
980                         new_bits <<= 1;
981
982                 new_bitmap = bitmap_zalloc(new_bits, GFP_KERNEL);
983                 if (new_bitmap == NULL)
984                         RETURN(-ENOMEM);
985
986                 lad->lad_bitmap = new_bitmap;
987                 lad->lad_bitmap_count = new_bits;
988                 bitmap_free(bitmap);
989                 bitmap = new_bitmap;
990         }
991
992         if (lo->ll_bitmap_size == 0) {
993                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
994                 bitmap_zero(bitmap, lad->lad_bitmap_count);
995                 RETURN(0);
996         }
997
998         size = (lo->ll_bitmap_size + 7) >> 3;
999         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap, size), &pos);
1000         if (rc != size)
1001                 RETURN(rc >= 0 ? -EINVAL : rc);
1002
1003         if (bitmap_empty(bitmap, lad->lad_bitmap_count))
1004                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
1005         else
1006                 set_bit(LAD_INCOMPLETE, &lad->lad_flags);
1007
1008         RETURN(0);
1009 }
1010
1011 /**
1012  * Load the layout LFSCK trace file from disk.
1013  *
1014  * The layout LFSCK trace file records the layout LFSCK status information
1015  * and other statistics, such as how many objects have been scanned, and how
1016  * many objects have been repaired, and etc. It also contains the bitmap for
1017  * failed OSTs during the layout LFSCK. All these information will be loaded
1018  * from disk to RAM when the layout LFSCK component setup.
1019  *
1020  * \param[in] env       pointer to the thread context
1021  * \param[in] com       pointer to the lfsck component
1022  *
1023  * \retval              positive number for file data corruption, the caller
1024  *                      should reset the layout LFSCK trace file
1025  * \retval              0 for success
1026  * \retval              negative error number on failure
1027  */
1028 static int lfsck_layout_load(const struct lu_env *env,
1029                              struct lfsck_component *com)
1030 {
1031         struct lfsck_layout *lo = com->lc_file_ram;
1032         ssize_t size = com->lc_file_size;
1033         loff_t pos = 0;
1034         int rc;
1035
1036         rc = dt_read(env, com->lc_obj,
1037                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
1038         if (rc == 0) {
1039                 return -ENOENT;
1040         } else if (rc < 0) {
1041                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
1042                        lfsck_lfsck2name(com->lc_lfsck), rc);
1043                 return rc;
1044         } else if (rc != size) {
1045                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
1046                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
1047                 return 1;
1048         }
1049
1050         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
1051         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
1052                 CDEBUG(D_LFSCK,
1053                        "%s: invalid lfsck_layout magic %#x != %#x, to be reset\n",
1054                        lfsck_lfsck2name(com->lc_lfsck), lo->ll_magic,
1055                        LFSCK_LAYOUT_MAGIC);
1056                 return 1;
1057         }
1058
1059         return 0;
1060 }
1061
1062 /**
1063  * Store the layout LFSCK trace file on disk.
1064  *
1065  * The layout LFSCK trace file records the layout LFSCK status information
1066  * and other statistics, such as how many objects have been scanned, and how
1067  * many objects have been repaired, and etc. It also contains the bitmap for
1068  * failed OSTs during the layout LFSCK. All these information will be synced
1069  * from RAM to disk periodically.
1070  *
1071  * \param[in] env       pointer to the thread context
1072  * \param[in] com       pointer to the lfsck component
1073  *
1074  * \retval              0 for success
1075  * \retval              negative error number on failure
1076  */
1077 static int lfsck_layout_store(const struct lu_env *env,
1078                               struct lfsck_component *com)
1079 {
1080         struct dt_object *obj = com->lc_obj;
1081         struct lfsck_instance *lfsck = com->lc_lfsck;
1082         struct lfsck_layout *lo_ram = com->lc_file_ram;
1083         struct lfsck_layout *lo = com->lc_file_disk;
1084         struct thandle *th;
1085         struct dt_device *dev = lfsck_obj2dev(obj);
1086         unsigned long *bitmap = NULL;
1087         loff_t pos;
1088         ssize_t size = com->lc_file_size;
1089         __u32 nbits = 0;
1090         int rc;
1091
1092         ENTRY;
1093         if (lfsck->li_master) {
1094                 struct lfsck_assistant_data *lad = com->lc_data;
1095
1096                 bitmap = lad->lad_bitmap;
1097                 nbits = lad->lad_bitmap_count;
1098
1099                 LASSERT(nbits > 0);
1100                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1101         }
1102
1103         lo_ram->ll_bitmap_size = nbits;
1104         lfsck_layout_cpu_to_le(lo, lo_ram);
1105         th = dt_trans_create(env, dev);
1106         if (IS_ERR(th))
1107                 GOTO(log, rc = PTR_ERR(th));
1108
1109         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1110                                      (loff_t)0, th);
1111         if (rc != 0)
1112                 GOTO(out, rc);
1113
1114         if (bitmap != NULL) {
1115                 rc = dt_declare_record_write(env, obj,
1116                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1117                                 (loff_t)size, th);
1118                 if (rc != 0)
1119                         GOTO(out, rc);
1120         }
1121
1122         rc = dt_trans_start_local(env, dev, th);
1123         if (rc != 0)
1124                 GOTO(out, rc);
1125
1126         pos = 0;
1127         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1128         if (rc != 0)
1129                 GOTO(out, rc);
1130
1131         if (bitmap != NULL) {
1132                 pos = size;
1133                 rc = dt_record_write(env, obj,
1134                                 lfsck_buf_get(env, bitmap, nbits >> 3),
1135                                 &pos, th);
1136         }
1137
1138         GOTO(out, rc);
1139
1140 out:
1141         dt_trans_stop(env, dev, th);
1142
1143 log:
1144         if (rc != 0)
1145                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1146                        lfsck_lfsck2name(lfsck), rc);
1147
1148         return rc;
1149 }
1150
1151 static int lfsck_layout_init(const struct lu_env *env,
1152                              struct lfsck_component *com)
1153 {
1154         struct lfsck_layout *lo = com->lc_file_ram;
1155         int rc;
1156
1157         memset(lo, 0, com->lc_file_size);
1158         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1159         lo->ll_status = LS_INIT;
1160         down_write(&com->lc_sem);
1161         rc = lfsck_layout_store(env, com);
1162         if (rc == 0 && com->lc_lfsck->li_master)
1163                 rc = lfsck_load_sub_trace_files(env, com,
1164                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1165         up_write(&com->lc_sem);
1166
1167         return rc;
1168 }
1169
1170 static int fid_is_for_ostobj(const struct lu_env *env,
1171                              struct lfsck_instance *lfsck,
1172                              struct dt_object *obj, const struct lu_fid *fid)
1173 {
1174         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1175         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1176         struct lustre_ost_attrs *loa;
1177         int                      rc;
1178
1179         fld_range_set_any(range);
1180         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1181         if (rc == 0) {
1182                 if (fld_range_is_ost(range))
1183                         return 1;
1184
1185                 return 0;
1186         }
1187
1188         loa = &lfsck_env_info(env)->lti_loa;
1189         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1190                           XATTR_NAME_LMA);
1191         if (rc >= (int)sizeof(struct lustre_mdt_attrs)) {
1192                 lustre_lma_swab(&loa->loa_lma);
1193
1194                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1195         }
1196
1197         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1198
1199         return rc > 0;
1200 }
1201
1202 static struct lfsck_layout_seq *
1203 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1204 {
1205         struct lfsck_layout_seq *lls;
1206
1207         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1208                 if (lls->lls_seq == seq)
1209                         return lls;
1210
1211                 if (lls->lls_seq > seq)
1212                         return NULL;
1213         }
1214
1215         return NULL;
1216 }
1217
1218 static void
1219 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1220                         struct lfsck_layout_seq *lls)
1221 {
1222         struct lfsck_layout_seq *tmp;
1223         struct list_head        *pos = &llsd->llsd_seq_list;
1224
1225         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1226                 if (lls->lls_seq < tmp->lls_seq) {
1227                         pos = &tmp->lls_list;
1228                         break;
1229                 }
1230         }
1231         list_add_tail(&lls->lls_list, pos);
1232 }
1233
1234 static int
1235 lfsck_layout_lastid_create(const struct lu_env *env,
1236                            struct lfsck_instance *lfsck,
1237                            struct dt_object *obj)
1238 {
1239         struct lfsck_thread_info *info = lfsck_env_info(env);
1240         struct lu_attr *la = &info->lti_la;
1241         struct dt_object_format *dof = &info->lti_dof;
1242         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
1243         struct dt_device *dt = lfsck_obj2dev(obj);
1244         struct thandle *th;
1245         __u64 lastid = 0;
1246         loff_t pos = 0;
1247         int rc;
1248
1249         ENTRY;
1250         if (bk->lb_param & LPF_DRYRUN)
1251                 return 0;
1252
1253         memset(la, 0, sizeof(*la));
1254         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1255         la->la_valid = LA_MODE | LA_UID | LA_GID;
1256         memset(dof, 0, sizeof(*dof));
1257         dof->dof_type = dt_mode_to_dft(S_IFREG);
1258
1259         th = lfsck_trans_create(env, dt, lfsck);
1260         if (IS_ERR(th))
1261                 GOTO(log, rc = PTR_ERR(th));
1262
1263         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1264         if (rc != 0)
1265                 GOTO(stop, rc);
1266
1267         rc = dt_declare_record_write(env, obj,
1268                                      lfsck_buf_get(env, &lastid,
1269                                                    sizeof(lastid)),
1270                                      pos, th);
1271         if (rc != 0)
1272                 GOTO(stop, rc);
1273
1274         rc = dt_trans_start_local(env, dt, th);
1275         if (rc != 0)
1276                 GOTO(stop, rc);
1277
1278         dt_write_lock(env, obj, 0);
1279         if (likely(dt_object_exists(obj) == 0)) {
1280                 rc = dt_create(env, obj, la, NULL, dof, th);
1281                 if (rc == 0)
1282                         rc = dt_record_write(env, obj,
1283                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1284                                 &pos, th);
1285         }
1286         dt_write_unlock(env, obj);
1287
1288         GOTO(stop, rc);
1289
1290 stop:
1291         dt_trans_stop(env, dt, th);
1292
1293 log:
1294         CDEBUG(D_LFSCK,
1295                "%s: layout LFSCK will create LAST_ID for <seq> %#llx: rc = %d\n",
1296                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1297
1298         return rc;
1299 }
1300
1301 static int
1302 lfsck_layout_lastid_reload(const struct lu_env *env,
1303                            struct lfsck_component *com,
1304                            struct lfsck_layout_seq *lls)
1305 {
1306         __u64   lastid;
1307         loff_t  pos     = 0;
1308         int     rc;
1309
1310         dt_read_lock(env, lls->lls_lastid_obj, 0);
1311         rc = dt_record_read(env, lls->lls_lastid_obj,
1312                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1313         dt_read_unlock(env, lls->lls_lastid_obj);
1314         if (unlikely(rc != 0))
1315                 return rc;
1316
1317         lastid = le64_to_cpu(lastid);
1318         if (lastid < lls->lls_lastid_known) {
1319                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1320                 struct lfsck_layout     *lo     = com->lc_file_ram;
1321
1322                 lls->lls_lastid = lls->lls_lastid_known;
1323                 lls->lls_dirty = 1;
1324                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1325                         LASSERT(lfsck->li_out_notify != NULL);
1326
1327                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1328                                              LE_LASTID_REBUILDING);
1329                         lo->ll_flags |= LF_CRASHED_LASTID;
1330
1331                         CDEBUG(D_LFSCK,
1332                                "%s: layout LFSCK finds crashed LAST_ID file (1) for the sequence %#llx, old value %llu, known value %llu\n",
1333                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1334                                lastid, lls->lls_lastid);
1335                 }
1336         } else if (lastid >= lls->lls_lastid) {
1337                 lls->lls_lastid = lastid;
1338                 lls->lls_dirty = 0;
1339         }
1340
1341         return 0;
1342 }
1343
1344 static int
1345 lfsck_layout_lastid_store(const struct lu_env *env,
1346                           struct lfsck_component *com)
1347 {
1348         struct lfsck_instance *lfsck  = com->lc_lfsck;
1349         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
1350         struct dt_device *dt = lfsck->li_bottom;
1351         struct lfsck_layout_slave_data *llsd = com->lc_data;
1352         struct lfsck_layout_seq *lls;
1353         struct thandle *th;
1354         __u64 lastid;
1355         int rc = 0;
1356         int rc1 = 0;
1357
1358         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1359                 loff_t pos = 0;
1360
1361                 if (!lls->lls_dirty)
1362                         continue;
1363
1364                 CDEBUG(D_LFSCK,
1365                        "%s: layout LFSCK will sync the LAST_ID for <seq> %#llx as <oid> %llu\n",
1366                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1367
1368                 if (bk->lb_param & LPF_DRYRUN) {
1369                         lls->lls_dirty = 0;
1370                         continue;
1371                 }
1372
1373                 th = lfsck_trans_create(env, dt, lfsck);
1374                 if (IS_ERR(th)) {
1375                         rc1 = PTR_ERR(th);
1376                         CDEBUG(D_LFSCK,
1377                                "%s: layout LFSCK failed to store the LAST_ID for <seq> %#llx(1): rc = %d\n",
1378                                lfsck_lfsck2name(com->lc_lfsck),
1379                                lls->lls_seq, rc1);
1380                         continue;
1381                 }
1382
1383                 lastid = cpu_to_le64(lls->lls_lastid);
1384                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1385                                              lfsck_buf_get(env, &lastid,
1386                                                            sizeof(lastid)),
1387                                              pos, th);
1388                 if (rc != 0)
1389                         goto stop;
1390
1391                 rc = dt_trans_start_local(env, dt, th);
1392                 if (rc != 0)
1393                         goto stop;
1394
1395                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1396                 rc = dt_record_write(env, lls->lls_lastid_obj,
1397                                      lfsck_buf_get(env, &lastid,
1398                                      sizeof(lastid)), &pos, th);
1399                 dt_write_unlock(env, lls->lls_lastid_obj);
1400                 if (rc == 0)
1401                         lls->lls_dirty = 0;
1402
1403 stop:
1404                 dt_trans_stop(env, dt, th);
1405                 if (rc != 0) {
1406                         rc1 = rc;
1407                         CDEBUG(D_LFSCK,
1408                                "%s: layout LFSCK failed to store the LAST_ID for <seq> %#llx(2): rc = %d\n",
1409                                lfsck_lfsck2name(com->lc_lfsck),
1410                                lls->lls_seq, rc1);
1411                 }
1412         }
1413
1414         return rc1;
1415 }
1416
1417 static int
1418 lfsck_layout_lastid_load(const struct lu_env *env,
1419                          struct lfsck_component *com,
1420                          struct lfsck_layout_seq *lls)
1421 {
1422         struct lfsck_instance *lfsck = com->lc_lfsck;
1423         struct lfsck_layout *lo = com->lc_file_ram;
1424         struct lu_fid *fid = &lfsck_env_info(env)->lti_fid;
1425         struct dt_object *obj;
1426         loff_t pos = 0;
1427         int rc;
1428
1429         ENTRY;
1430         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1431         obj = dt_locate(env, lfsck->li_bottom, fid);
1432         if (IS_ERR(obj))
1433                 RETURN(PTR_ERR(obj));
1434
1435         /* LAST_ID crashed, to be rebuilt */
1436         if (dt_object_exists(obj) == 0) {
1437                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1438                         LASSERT(lfsck->li_out_notify != NULL);
1439
1440                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1441                                              LE_LASTID_REBUILDING);
1442                         lo->ll_flags |= LF_CRASHED_LASTID;
1443
1444                         CDEBUG(D_LFSCK,
1445                                "%s: layout LFSCK cannot find the LAST_ID file for sequence %#llx\n",
1446                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1447
1448                         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1449                             cfs_fail_val > 0) {
1450                                 struct ptlrpc_thread *thread =
1451                                         &lfsck->li_thread;
1452
1453                                 up_write(&com->lc_sem);
1454                                 wait_event_idle_timeout(
1455                                         thread->t_ctl_waitq,
1456                                         !thread_is_running(thread),
1457                                         cfs_time_seconds(cfs_fail_val));
1458                                 down_write(&com->lc_sem);
1459                         }
1460                 }
1461
1462                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1463         } else {
1464                 dt_read_lock(env, obj, 0);
1465                 rc = dt_read(env, obj,
1466                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1467                         &pos);
1468                 dt_read_unlock(env, obj);
1469                 if (rc != 0 && rc != sizeof(__u64))
1470                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1471
1472                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1473                         LASSERT(lfsck->li_out_notify != NULL);
1474
1475                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1476                                              LE_LASTID_REBUILDING);
1477                         lo->ll_flags |= LF_CRASHED_LASTID;
1478
1479                         CDEBUG(D_LFSCK,
1480                                "%s: layout LFSCK finds invalid LAST_ID file for the sequence %#llx: rc = %d\n",
1481                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1482                 }
1483
1484                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1485                 rc = 0;
1486         }
1487
1488         GOTO(out, rc);
1489
1490 out:
1491         if (rc != 0)
1492                 lfsck_object_put(env, obj);
1493         else
1494                 lls->lls_lastid_obj = obj;
1495
1496         return rc;
1497 }
1498
1499 static void lfsck_layout_record_failure(const struct lu_env *env,
1500                                         struct lfsck_instance *lfsck,
1501                                         struct lfsck_layout *lo)
1502 {
1503         __u64 cookie;
1504
1505         lo->ll_objs_failed_phase1++;
1506         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1507                                                         lfsck->li_di_oit);
1508         if (lo->ll_pos_first_inconsistent == 0 ||
1509             lo->ll_pos_first_inconsistent < cookie) {
1510                 lo->ll_pos_first_inconsistent = cookie;
1511
1512                 CDEBUG(D_LFSCK,
1513                        "%s: layout LFSCK hit first non-repaired inconsistency at the pos [%llu]\n",
1514                        lfsck_lfsck2name(lfsck),
1515                        lo->ll_pos_first_inconsistent);
1516         }
1517 }
1518
1519 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1520                                            struct lfsck_component *com,
1521                                            int rc)
1522 {
1523         struct lfsck_instance *lfsck = com->lc_lfsck;
1524         struct lfsck_layout *lo = com->lc_file_ram;
1525
1526         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1527                lfsck_lfsck2name(lfsck), rc);
1528
1529         down_write(&com->lc_sem);
1530         lo->ll_run_time_phase2 += ktime_get_seconds() -
1531                                   com->lc_time_last_checkpoint;
1532         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
1533         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1534
1535         if (rc > 0) {
1536                 if (lo->ll_flags & LF_INCOMPLETE) {
1537                         lo->ll_status = LS_PARTIAL;
1538                 } else {
1539                         if (lfsck->li_master) {
1540                                 struct lfsck_assistant_data *lad = com->lc_data;
1541
1542                                 if (test_bit(LAD_INCOMPLETE, &lad->lad_flags))
1543                                         lo->ll_status = LS_PARTIAL;
1544                                 else
1545                                         lo->ll_status = LS_COMPLETED;
1546                         } else {
1547                                 lo->ll_status = LS_COMPLETED;
1548                         }
1549                 }
1550                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1551                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1552                         lo->ll_flags &= ~LF_INCONSISTENT;
1553                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1554                 lo->ll_success_count++;
1555         } else if (rc == 0) {
1556                 if (lfsck->li_status != 0)
1557                         lo->ll_status = lfsck->li_status;
1558                 else
1559                         lo->ll_status = LS_STOPPED;
1560         } else {
1561                 lo->ll_status = LS_FAILED;
1562         }
1563
1564         rc = lfsck_layout_store(env, com);
1565         up_write(&com->lc_sem);
1566
1567         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1568                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1569
1570         return rc;
1571 }
1572
1573 static int lfsck_layout_trans_stop(const struct lu_env *env,
1574                                    struct dt_device *dev,
1575                                    struct thandle *handle, int result)
1576 {
1577         int rc;
1578
1579         /* XXX: If there is something worng or it needs to repair nothing,
1580          *      then notify the lower to stop the modification. Currently,
1581          *      we use th_result for such purpose, that may be replaced by
1582          *      some rollback mechanism in the future.
1583          */
1584         handle->th_result = result;
1585         rc = dt_trans_stop(env, dev, handle);
1586         if (result != 0)
1587                 return result > 0 ? 0 : result;
1588
1589         return rc == 0 ? 1 : rc;
1590 }
1591
1592 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1593                                          struct lfsck_component *com,
1594                                          const struct lu_fid *pfid,
1595                                          const struct lu_fid *cfid,
1596                                          __u32 comp_id, __u32 ea_off,
1597                                          __u32 ost_idx)
1598 {
1599         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1600         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1601         struct dt_device *dev;
1602         struct dt_object *obj;
1603         struct thandle *th = NULL;
1604         int idx;
1605         int rc = 0;
1606
1607         ENTRY;
1608         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
1609                 GOTO(log, rc = 0);
1610
1611         idx = lfsck_sub_trace_file_fid2idx(pfid);
1612         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1613         dev = lfsck_obj2dev(obj);
1614
1615         fid_cpu_to_be(&key->lldk_fid, pfid);
1616         key->lldk_comp_id = cpu_to_be32(comp_id);
1617         key->lldk_ea_off = cpu_to_be32(ea_off);
1618
1619         fid_cpu_to_be(rec, cfid);
1620         rec->f_ver = cpu_to_be32(ost_idx);
1621
1622         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1623
1624         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1625         if (IS_ERR(th))
1626                 GOTO(unlock, rc = PTR_ERR(th));
1627
1628         rc = dt_declare_insert(env, obj,
1629                                (const struct dt_rec *)rec,
1630                                (const struct dt_key *)key, th);
1631         if (rc)
1632                 GOTO(unlock, rc);
1633
1634         rc = dt_trans_start_local(env, dev, th);
1635         if (rc)
1636                 GOTO(unlock, rc);
1637
1638         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1639                        (const struct dt_key *)key, th);
1640
1641         GOTO(unlock, rc);
1642
1643 unlock:
1644         if (th && !IS_ERR(th))
1645                 dt_trans_stop(env, dev, th);
1646
1647         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1648
1649 log:
1650         CDEBUG(D_LFSCK,
1651                "%s: insert the paris "DFID" => "DFID", comp_id = %u, ea_off = %u, ost_idx = %u, into the trace file for further dangling check: rc = %d\n",
1652                lfsck_lfsck2name(com->lc_lfsck), PFID(pfid), PFID(cfid), comp_id,
1653                ea_off, ost_idx, rc);
1654
1655         return rc;
1656 }
1657
1658 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1659                                          struct lfsck_component *com,
1660                                          const struct lu_fid *fid,
1661                                          __u32 comp_id, __u32 ea_off)
1662 {
1663         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1664         struct dt_device *dev;
1665         struct dt_object *obj;
1666         struct thandle *th = NULL;
1667         int idx;
1668         int rc = 0;
1669
1670         ENTRY;
1671         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
1672                 GOTO(log, rc = 0);
1673
1674         idx = lfsck_sub_trace_file_fid2idx(fid);
1675         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1676         dev = lfsck_obj2dev(obj);
1677
1678         fid_cpu_to_be(&key->lldk_fid, fid);
1679         key->lldk_comp_id = cpu_to_be32(comp_id);
1680         key->lldk_ea_off = cpu_to_be32(ea_off);
1681
1682         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1683
1684         th = lfsck_trans_create(env, dev, com->lc_lfsck);
1685         if (IS_ERR(th))
1686                 GOTO(unlock, rc = PTR_ERR(th));
1687
1688         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1689         if (rc)
1690                 GOTO(unlock, rc);
1691
1692         rc = dt_trans_start_local(env, dev, th);
1693         if (rc)
1694                 GOTO(unlock, rc);
1695
1696         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1697
1698         GOTO(unlock, rc);
1699
1700 unlock:
1701         if (th && !IS_ERR(th))
1702                 dt_trans_stop(env, dev, th);
1703
1704         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1705
1706 log:
1707         CDEBUG(D_LFSCK,
1708                "%s: delete the dangling record for "DFID", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1709                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1710
1711         return rc;
1712 }
1713
1714 /**
1715  * Get the system default stripe size.
1716  *
1717  * \param[in] env       pointer to the thread context
1718  * \param[in] lfsck     pointer to the lfsck instance
1719  * \param[out] size     pointer to the default stripe size
1720  *
1721  * \retval              0 for success
1722  * \retval              negative error number on failure
1723  */
1724 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1725                                            struct lfsck_instance *lfsck,
1726                                            __u32 *size)
1727 {
1728         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1729         struct dt_object        *root;
1730         int                      rc;
1731
1732         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1733         if (IS_ERR(root))
1734                 return PTR_ERR(root);
1735
1736         /* Get the default stripe size via xattr_get on the backend root. */
1737         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1738                           XATTR_NAME_LOV);
1739         if (rc > 0) {
1740                 /* The lum->lmm_stripe_size is LE mode. The *size also
1741                  * should be LE mode. So it is unnecessary to convert.
1742                  */
1743                 *size = lum->lmm_stripe_size;
1744                 rc = 0;
1745         } else if (unlikely(rc == 0)) {
1746                 rc = -EINVAL;
1747         }
1748
1749         lfsck_object_put(env, root);
1750
1751         return rc;
1752 }
1753
1754 /**
1755  * \retval       +1: repaired
1756  * \retval        0: did nothing
1757  * \retval      -ve: on error
1758  */
1759 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1760                                      struct lfsck_instance *lfsck,
1761                                      struct thandle *handle,
1762                                      struct dt_object *parent,
1763                                      const struct lu_fid *cfid,
1764                                      struct lu_buf *buf,
1765                                      struct lov_mds_md_v1 *lmm,
1766                                      struct lov_ost_data_v1 *slot,
1767                                      int fl, __u32 ost_idx, int size)
1768 {
1769         struct ost_id *oi = &lfsck_env_info(env)->lti_oi;
1770         struct lu_buf ea_buf;
1771         int rc;
1772         __u32 magic;
1773         __u32 pattern;
1774         __u16 count;
1775
1776         ENTRY;
1777         magic = le32_to_cpu(lmm->lmm_magic);
1778         pattern = le32_to_cpu(lmm->lmm_pattern);
1779         count = le16_to_cpu(lmm->lmm_stripe_count);
1780
1781         fid_to_ostid(cfid, oi);
1782         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1783         slot->l_ost_gen = cpu_to_le32(0);
1784         slot->l_ost_idx = cpu_to_le32(ost_idx);
1785
1786         if (pattern & LOV_PATTERN_F_HOLE) {
1787                 struct lov_ost_data_v1 *objs;
1788                 int                     i;
1789
1790                 if (magic == LOV_MAGIC_V1)
1791                         objs = &lmm->lmm_objects[0];
1792                 else
1793                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1794                 for (i = 0; i < count; i++, objs++) {
1795                         if (lovea_slot_is_dummy(objs))
1796                                 break;
1797                 }
1798
1799                 /* If the @slot is the last dummy slot to be refilled,
1800                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern.
1801                  */
1802                 if (i == count) {
1803                         lmm->lmm_pattern =
1804                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1805
1806                         CDEBUG(D_LFSCK,
1807                                "%s: remove layout HOLE for "DFID": parent "DFID"\n",
1808                                lfsck_lfsck2name(lfsck), PFID(cfid),
1809                                PFID(lfsck_dto2fid(parent)));
1810                 }
1811         }
1812
1813         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1814         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1815         if (rc == 0)
1816                 rc = 1;
1817
1818         RETURN(rc);
1819 }
1820
1821 static struct lov_ost_data_v1 *
1822 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1823                             const struct lu_fid *pfid,
1824                             __u32 stripe_size, __u32 ea_off,
1825                             __u32 pattern, __u16 count)
1826 {
1827         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1828         lmm->lmm_pattern = cpu_to_le32(pattern);
1829         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1830         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1831         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1832         lmm->lmm_stripe_count = cpu_to_le16(count);
1833         lmm->lmm_layout_gen = cpu_to_le16(1);
1834         memset(&lmm->lmm_objects[0], 0,
1835                sizeof(struct lov_ost_data_v1) * count);
1836
1837         return &lmm->lmm_objects[ea_off];
1838 }
1839
1840 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1841                                      struct lfsck_instance *lfsck,
1842                                      struct ost_layout *ol,
1843                                      struct dt_object *parent,
1844                                      struct lu_buf *buf, __u32 ea_off,
1845                                      struct lov_mds_md_v1 **lmm,
1846                                      struct lov_ost_data_v1 **objs)
1847 {
1848         int size;
1849         __u32 stripe_size = ol->ol_stripe_size;
1850         __u32 pattern = LOV_PATTERN_RAID0;
1851         __u16 count;
1852
1853         if (ol->ol_stripe_count != 0)
1854                 count = ol->ol_stripe_count;
1855         else
1856                 count = ea_off + 1;
1857
1858         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1859         LASSERTF(buf->lb_len >= size,
1860                  "buffer len %d is less than real size %d\n",
1861                  (int)buf->lb_len, size);
1862
1863         if (stripe_size == 0) {
1864                 int rc;
1865
1866                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1867                 if (rc)
1868                         return rc;
1869         }
1870
1871         *lmm = buf->lb_buf;
1872         if (ol->ol_stripe_count > 1 ||
1873             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1874                 pattern |= LOV_PATTERN_F_HOLE;
1875                 memset(&(*lmm)->lmm_objects[0], 0,
1876                        count * sizeof(struct lov_ost_data_v1));
1877         }
1878
1879         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1880                                 stripe_size, ea_off, pattern, count);
1881
1882         return size;
1883 }
1884
1885 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1886                                        struct lu_orphan_rec_v3 *rec,
1887                                        struct dt_object *parent,
1888                                        struct lu_buf *buf, __u32 ea_off,
1889                                        struct lov_mds_md_v1 **lmm,
1890                                        struct lov_ost_data_v1 **objs)
1891 {
1892         struct ost_layout *ol = &rec->lor_layout;
1893         struct lov_comp_md_v1 *lcm;
1894         struct lov_comp_md_entry_v1 *lcme;
1895         __u32 pattern = LOV_PATTERN_RAID0;
1896         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1897         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1898         int size = offset + lcme_size;
1899
1900         LASSERTF(buf->lb_len >= size,
1901                  "buffer len %d is less than real size %d\n",
1902                  (int)buf->lb_len, size);
1903
1904         lcm = buf->lb_buf;
1905         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1906         lcm->lcm_size = cpu_to_le32(size);
1907         if (rec->lor_range) {
1908                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1909                                                   rec->lor_range);
1910                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1911         } else if (rec->lor_layout_version) {
1912                 lcm->lcm_layout_gen = cpu_to_le32(rec->lor_layout_version +
1913                                                   rec->lor_range);
1914                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1915         } else {
1916                 /*
1917                  * if OST doesn't provide layout version, then try
1918                  * to inherit one from MDS's layout, but increment
1919                  * it so the client notices and applies modified
1920                  * layout
1921                  */
1922                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
1923                 lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
1924         }
1925         lcm->lcm_entry_count = cpu_to_le16(1);
1926         /* Currently, we do not know how many mirrors will be, set it as zero
1927          * at the beginning. It will be updated when more mirrors are found.
1928          */
1929         lcm->lcm_mirror_count = 0;
1930
1931         lcme = &lcm->lcm_entries[0];
1932         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1933         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1934         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1935         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1936         lcme->lcme_offset = cpu_to_le32(offset);
1937         lcme->lcme_size = cpu_to_le32(lcme_size);
1938         lcme->lcme_layout_gen = lcm->lcm_layout_gen;
1939         if (ol->ol_stripe_count > 1)
1940                 pattern |= LOV_PATTERN_F_HOLE;
1941
1942         *lmm = buf->lb_buf + offset;
1943         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1944                                             ol->ol_stripe_size, ea_off,
1945                                             pattern, ol->ol_stripe_count);
1946
1947         return size;
1948 }
1949
1950 static void lfsck_layout_update_lcm(struct lov_comp_md_v1 *lcm,
1951                                     struct lov_comp_md_entry_v1 *lcme,
1952                                     __u32 version, __u32 range)
1953 {
1954         struct lov_comp_md_entry_v1 *tmp;
1955         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
1956         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
1957         __u32 gen = version + range;
1958         __u32 tmp_gen;
1959         int i;
1960         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1961         __u16 flags = le16_to_cpu(lcm->lcm_flags);
1962
1963         if (!gen)
1964                 gen = 1;
1965         lcme->lcme_layout_gen = cpu_to_le32(gen);
1966         if (le32_to_cpu(lcm->lcm_layout_gen) < gen)
1967                 lcm->lcm_layout_gen = cpu_to_le32(gen);
1968
1969         if (range)
1970                 lcm->lcm_flags = cpu_to_le16(LCM_FL_WRITE_PENDING);
1971         else if (flags == LCM_FL_NONE && le16_to_cpu(lcm->lcm_mirror_count) > 0)
1972                 lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
1973
1974         for (i = 0; i < count; i++) {
1975                 tmp = &lcm->lcm_entries[i];
1976                 if (le64_to_cpu(tmp->lcme_extent.e_end) <= start)
1977                         continue;
1978
1979                 if (le64_to_cpu(tmp->lcme_extent.e_start) >= end)
1980                         continue;
1981
1982                 if (le32_to_cpu(tmp->lcme_flags) & LCME_FL_STALE)
1983                         continue;
1984
1985                 tmp_gen = le32_to_cpu(tmp->lcme_layout_gen);
1986                 /* "lcme_layout_gen == 0" but without LCME_FL_STALE flag,
1987                  * then it should be the latest version of all mirrors.
1988                  */
1989                 if (tmp_gen == 0 || tmp_gen > gen) {
1990                         lcme->lcme_flags = cpu_to_le32(
1991                                 le32_to_cpu(lcme->lcme_flags) | LCME_FL_STALE);
1992                         break;
1993                 }
1994
1995                 if (tmp_gen < gen)
1996                         tmp->lcme_flags = cpu_to_le32(
1997                                 le32_to_cpu(tmp->lcme_flags) | LCME_FL_STALE);
1998         }
1999 }
2000
2001 static int lfsck_layout_add_comp(const struct lu_env *env,
2002                                  struct lfsck_instance *lfsck,
2003                                  struct thandle *handle,
2004                                  struct lu_orphan_rec_v3 *rec,
2005                                  struct dt_object *parent,
2006                                  const struct lu_fid *cfid,
2007                                  struct lu_buf *buf, __u32 ost_idx,
2008                                  __u32 ea_off, int pos, bool new_mirror)
2009 {
2010         struct ost_layout *ol = &rec->lor_layout;
2011         struct lov_comp_md_v1 *lcm = buf->lb_buf;
2012         struct lov_comp_md_entry_v1 *lcme;
2013         struct lov_mds_md_v1 *lmm;
2014         struct lov_ost_data_v1 *objs;
2015         int added = sizeof(*lcme) +
2016                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2017         int size = le32_to_cpu(lcm->lcm_size) + added;
2018         int rc;
2019         int i;
2020         __u32 offset;
2021         __u32 pattern = LOV_PATTERN_RAID0;
2022         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
2023
2024         ENTRY;
2025         lu_buf_check_and_grow(buf, size);
2026         /* set the lcm again because lu_buf_check_and_grow() may
2027          * have reallocated the buf.
2028          */
2029         lcm = buf->lb_buf;
2030         lcm->lcm_size = cpu_to_le32(size);
2031         lcm->lcm_entry_count = cpu_to_le16(count + 1);
2032         if (new_mirror)
2033                 le16_add_cpu(&lcm->lcm_mirror_count, 1);
2034
2035         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
2036          *    with distance of 'added'.
2037          */
2038         if (pos < count) {
2039                 size = 0;
2040                 for (i = pos; i < count; i++) {
2041                         lcme = &lcm->lcm_entries[i];
2042                         size += le32_to_cpu(lcme->lcme_size);
2043                 }
2044
2045                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
2046                 memmove(buf->lb_buf + offset + added,
2047                         buf->lb_buf + offset, size);
2048         }
2049
2050         size = 0;
2051         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
2052          *    of 'sizeof(struct lov_comp_md_entry_v1)'
2053          */
2054         if (pos > 0) {
2055                 for (i = 0; i < pos; i++) {
2056                         lcme = &lcm->lcm_entries[i];
2057                         size += le32_to_cpu(lcme->lcme_size);
2058                 }
2059
2060                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
2061                 memmove(buf->lb_buf + offset + sizeof(*lcme),
2062                         buf->lb_buf + offset, size);
2063         }
2064
2065         /* 3. Recalculate the enter offset for the component [pos, count-1] */
2066         for (i = count - 1; i >= pos; i--) {
2067                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
2068                 lcm->lcm_entries[i + 1].lcme_offset =
2069                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
2070                                                 lcme_offset) + added);
2071         }
2072
2073         /* 4. Recalculate the enter offset for the component [0, pos) */
2074         for (i = 0; i < pos; i++) {
2075                 lcm->lcm_entries[i].lcme_offset =
2076                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
2077                                                 lcme_offset) + sizeof(*lcme));
2078         }
2079
2080         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
2081         /* 4. Insert the new component header (entry) at the slot 'pos'. */
2082         lcme = &lcm->lcm_entries[pos];
2083         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
2084         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
2085         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
2086         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
2087         lcme->lcme_offset = cpu_to_le32(offset);
2088         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
2089                                                       LOV_MAGIC_V1));
2090
2091         if (ol->ol_stripe_count > 1)
2092                 pattern |= LOV_PATTERN_F_HOLE;
2093
2094         lmm = buf->lb_buf + offset;
2095         /* 5. Insert teh new component body at the 'offset'. */
2096         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
2097                                            ol->ol_stripe_size, ea_off,
2098                                            pattern, ol->ol_stripe_count);
2099
2100         /* 6. Update mirror related flags and version. */
2101         lfsck_layout_update_lcm(lcm, lcme, rec->lor_layout_version,
2102                                 rec->lor_range);
2103
2104         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2105                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
2106                                        le32_to_cpu(lcm->lcm_size));
2107
2108         CERROR("%s: Five five five five five  five file Hello "DFID" and "DFID"d: rc = %d\n",
2109                "five", PFID(cfid), PFID(cfid), 0);
2110
2111         CDEBUG(D_LFSCK,
2112                "%s: layout LFSCK assistant add new COMP for "DFID": parent "DFID", OST-index %u, stripe-index %u, stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, comp_end %llu, layout version %u, range %u, %s LOV EA hole: rc = %d\n",
2113                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2114                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2115                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2116                rec->lor_layout_version, rec->lor_range,
2117                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2118                "with" : "without", rc);
2119
2120         RETURN(rc);
2121 }
2122
2123 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
2124                                           struct lfsck_instance *lfsck,
2125                                           struct thandle *handle,
2126                                           struct ost_layout *ol,
2127                                           struct dt_object *parent,
2128                                           const struct lu_fid *cfid,
2129                                           struct lu_buf *buf, __u32 ost_idx,
2130                                           __u32 ea_off)
2131 {
2132         struct lov_mds_md_v1 *lmm = buf->lb_buf;
2133         struct lov_ost_data_v1 *objs;
2134         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
2135         __u32 magic = le32_to_cpu(lmm->lmm_magic);
2136         int size;
2137         int gap;
2138         int rc;
2139
2140         ENTRY;
2141         /* The original LOVEA maybe re-generated via old filter_fid, at
2142          * that time, we do not know the stripe count and stripe size.
2143          */
2144         if (ol->ol_stripe_count > count)
2145                 count = ol->ol_stripe_count;
2146         if (ol->ol_stripe_size != 0 &&
2147             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
2148                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
2149
2150         if (magic == LOV_MAGIC_V1)
2151                 objs = &lmm->lmm_objects[count];
2152         else
2153                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
2154
2155         gap = ea_off - count;
2156         if (gap >= 0)
2157                 count = ea_off + 1;
2158
2159         size = lov_mds_md_size(count, magic);
2160         LASSERTF(buf->lb_len >= size,
2161                  "buffer len %d is less than real size %d\n",
2162                  (int)buf->lb_len, size);
2163
2164         if (gap > 0) {
2165                 memset(objs, 0, gap * sizeof(*objs));
2166                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
2167         }
2168
2169         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2170         lmm->lmm_stripe_count = cpu_to_le16(count);
2171         objs += gap;
2172
2173         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
2174                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
2175
2176         CDEBUG(D_LFSCK,
2177                "%s: layout LFSCK assistant extend layout EA for "DFID": parent "DFID", OST-index %u, stripe-index %u, stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, comp_end %llu, %s LOV EA hole: rc = %d\n",
2178                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2179                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2180                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2181                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2182                "with" : "without", rc);
2183
2184         RETURN(rc);
2185 }
2186
2187 /**
2188  * \retval       +1: repaired
2189  * \retval        0: did nothing
2190  * \retval      -ve: on error
2191  */
2192 static int lfsck_layout_update_lovea(const struct lu_env *env,
2193                                      struct lfsck_instance *lfsck,
2194                                      struct thandle *handle,
2195                                      struct lu_orphan_rec_v3 *rec,
2196                                      struct dt_object *parent,
2197                                      const struct lu_fid *cfid,
2198                                      struct lu_buf *buf, int fl,
2199                                      __u32 ost_idx, __u32 ea_off)
2200 {
2201         struct ost_layout *ol = &rec->lor_layout;
2202         struct lov_mds_md_v1 *lmm = NULL;
2203         struct lov_ost_data_v1 *objs = NULL;
2204         int rc = 0;
2205
2206         ENTRY;
2207         if (ol->ol_comp_id != 0)
2208                 rc = lfsck_layout_new_comp_lovea(env, rec, parent, buf, ea_off,
2209                                                  &lmm, &objs);
2210         else
2211                 rc = lfsck_layout_new_v1_lovea(env, lfsck, &rec->lor_layout,
2212                                                parent, buf, ea_off, &lmm,
2213                                                &objs);
2214         if (rc > 0)
2215                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2216                                                buf, lmm, objs, fl, ost_idx, rc);
2217
2218         CDEBUG(D_LFSCK,
2219                "%s: layout LFSCK assistant created layout EA for "DFID": parent "DFID", OST-index %u, stripe-index %u, stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, comp_end %llu, layout version %u, range %u, fl %d, %s LOV EA hole: rc = %d\n",
2220                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2221                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2222                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2223                rec->lor_layout_version, rec->lor_range, fl,
2224                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2225                "with" : "without", rc);
2226
2227         RETURN(rc);
2228 }
2229
2230 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2231                                       struct lfsck_component *com,
2232                                       struct dt_object *child,
2233                                       const struct lu_fid *pfid,
2234                                       const struct ost_layout *ol, __u32 offset,
2235                                       __u32 version, __u32 range)
2236 {
2237         struct dt_device *dev = lfsck_obj2dev(child);
2238         struct filter_fid *ff = &lfsck_env_info(env)->lti_ff;
2239         struct thandle *handle;
2240         struct lu_buf buf = { NULL };
2241         int rc;
2242
2243         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
2244                 RETURN(0);
2245
2246         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2247         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2248         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2249          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2250          * parent MDT-object's layout EA.
2251          */
2252         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2253         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2254         ff->ff_layout_version = cpu_to_le32(version);
2255         ff->ff_range = cpu_to_le32(range);
2256         lfsck_buf_init(&buf, ff, sizeof(*ff));
2257
2258         if (!dt_object_exists(child) || lfsck_is_dead_obj(child))
2259                 return 0;
2260
2261         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
2262         if (IS_ERR(handle))
2263                 RETURN(PTR_ERR(handle));
2264
2265         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2266         if (rc != 0)
2267                 GOTO(stop, rc);
2268
2269         rc = dt_trans_start_local(env, dev, handle);
2270         if (rc != 0)
2271                 GOTO(stop, rc);
2272
2273         dt_write_lock(env, child, 0);
2274         if (dt_object_exists(child) && !lfsck_is_dead_obj(child))
2275                 rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2276         dt_write_unlock(env, child);
2277
2278         GOTO(stop, rc);
2279
2280 stop:
2281         dt_trans_stop(env, dev, handle);
2282
2283         return rc;
2284 }
2285
2286 /**
2287  * \retval       +1: repaired
2288  * \retval        0: did nothing
2289  * \retval      -ve: on error
2290  */
2291 static int lfsck_layout_update_pfid(const struct lu_env *env,
2292                                     struct lfsck_component *com,
2293                                     struct dt_object *parent,
2294                                     struct lu_fid *cfid,
2295                                     struct dt_device *cdev,
2296                                     struct lu_orphan_rec_v3 *rec, __u32 ea_off)
2297 {
2298         struct dt_object *child;
2299         int rc = 0;
2300
2301         ENTRY;
2302         child = lfsck_object_find_by_dev(env, cdev, cfid);
2303         if (IS_ERR(child))
2304                 RETURN(PTR_ERR(child));
2305
2306         rc = __lfsck_layout_update_pfid(env, com, child,
2307                                         lu_object_fid(&parent->do_lu),
2308                                         &rec->lor_layout, ea_off,
2309                                         rec->lor_layout_version,
2310                                         rec->lor_range);
2311         lfsck_object_put(env, child);
2312
2313         RETURN(rc == 0 ? 1 : rc);
2314 }
2315
2316 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2317 {
2318         if (ol->ol_comp_id != 0)
2319                 return sizeof(struct lov_comp_md_v1) +
2320                        sizeof(struct lov_comp_md_entry_v1) +
2321                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2322
2323         if (ol->ol_stripe_count != 0)
2324                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2325
2326         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2327 }
2328
2329 /**
2330  * This function will create the MDT-object with the given (partial) LOV EA.
2331  *
2332  * Under some data corruption cases, the MDT-object of the file may be lost,
2333  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2334  * re-create the MDT-object with the orphan OST-object(s) information.
2335  *
2336  * On the other hand, the LFSCK may has created some OST-object for repairing
2337  * dangling LOV EA reference, but as the LFSCK processing, it may find that
2338  * the old OST-object is there and should replace the former new created OST
2339  * object. Unfortunately, some others have modified such newly created object.
2340  * To keep the data (both new and old), the LFSCK will create MDT-object with
2341  * new FID to reference the original OST-object.
2342  *
2343  * \param[in] env       pointer to the thread context
2344  * \param[in] com       pointer to the lfsck component
2345  * \param[in] ltd       pointer to target device descriptor
2346  * \param[in] rec       pointer to the record for the orphan OST-object
2347  * \param[in] cfid      pointer to FID for the orphan OST-object
2348  * \param[in] infix     additional information, such as the FID for original
2349  *                      MDT-object and the stripe offset in the LOV EA
2350  * \param[in] type      the type for describing why the orphan MDT-object is
2351  *                      created. The rules are as following:
2352  *
2353  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2354  *                      same slot in the layout EA. Then the LFSCK will create
2355  *                      new MDT-object(s) to hold the conflict OST-object(s).
2356  *
2357  *  type "N":           The orphan OST-object does not know which one was the
2358  *                      real parent MDT-object, so the LFSCK uses new FID for
2359  *                      its parent MDT-object.
2360  *
2361  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2362  *                      but does not know the position (the file name) in the
2363  *                      layout.
2364  *
2365  *  type "D":           The MDT-object is a directory, it may knows its parent
2366  *                      but because there is no valid linkEA, the LFSCK cannot
2367  *                      know where to put it back to the namespace.
2368  *  type "O":           The MDT-object has no linkEA, and there is no name
2369  *                      entry that references the MDT-object.
2370  *
2371  *  type "P":           The orphan object to be created was a parent directory
2372  *                      of some MDT-object which linkEA shows that the @orphan
2373  *                      object is missing.
2374  *
2375  * The orphan name will be like:
2376  * ${FID}-${infix}-${type}-${conflict_version}
2377  *
2378  * \param[in] ea_off    the stripe offset in the LOV EA
2379  *
2380  * \retval              positive on repaired something
2381  * \retval              0 if needs to repair nothing
2382  * \retval              negative error number on failure
2383  */
2384 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2385                                         struct lfsck_component *com,
2386                                         struct lfsck_tgt_desc *ltd,
2387                                         struct lu_orphan_rec_v3 *rec,
2388                                         struct lu_fid *cfid,
2389                                         const char *infix,
2390                                         const char *type,
2391                                         __u32 ea_off)
2392 {
2393         struct lfsck_thread_info *info = lfsck_env_info(env);
2394         struct dt_insert_rec *dtrec = &info->lti_dt_rec;
2395         char *name = info->lti_key;
2396         struct lu_attr *la = &info->lti_la2;
2397         struct dt_object_format *dof = &info->lti_dof;
2398         struct lfsck_instance *lfsck = com->lc_lfsck;
2399         struct lu_fid *pfid = &rec->lor_rec.lor_fid;
2400         struct lu_fid *tfid = &info->lti_fid3;
2401         struct dt_device *dev = lfsck->li_bottom;
2402         struct dt_object *lpf = lfsck->li_lpf_obj;
2403         struct dt_object *pobj = NULL;
2404         struct dt_object *cobj = NULL;
2405         struct thandle *th = NULL;
2406         struct lu_buf *ea_buf = &info->lti_big_buf;
2407         struct lu_buf lov_buf;
2408         struct lfsck_lock_handle *llh = &info->lti_llh;
2409         struct linkea_data ldata = { NULL };
2410         struct lu_buf linkea_buf;
2411         const struct lu_name *pname;
2412         int size = 0;
2413         int idx = 0;
2414         int rc = 0;
2415
2416         ENTRY;
2417         if (lfsck_is_dryrun(lfsck))
2418                 GOTO(log, rc = 0);
2419
2420         if (unlikely(lpf == NULL))
2421                 GOTO(log, rc = -ENXIO);
2422
2423         /* We use two separated transactions to repair the inconsistency.
2424          *
2425          * 1) create the MDT-object locally.
2426          * 2) update the OST-object's PFID EA if necessary.
2427          *
2428          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2429          * updated when the layout LFSCK run next time.
2430          *
2431          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2432          * when the layout LFSCK run next time.
2433          */
2434         if (fid_is_zero(pfid)) {
2435                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2436                 if (rc != 0)
2437                         GOTO(log, rc);
2438
2439                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2440                 if (IS_ERR(cobj))
2441                         GOTO(log, rc = PTR_ERR(cobj));
2442         }
2443
2444         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2445         if (IS_ERR(pobj))
2446                 GOTO(log, rc = PTR_ERR(pobj));
2447
2448         LASSERT(infix != NULL);
2449         LASSERT(type != NULL);
2450
2451         memset(la, 0, sizeof(*la));
2452         la->la_uid = rec->lor_rec.lor_uid;
2453         la->la_gid = rec->lor_rec.lor_gid;
2454         la->la_mode = S_IFREG | S_IRUSR;
2455         la->la_valid = LA_MODE | LA_UID | LA_GID;
2456
2457         memset(dof, 0, sizeof(*dof));
2458         dof->dof_type = dt_mode_to_dft(S_IFREG);
2459         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2460          * the stripe(s). The LFSCK will specify the LOV EA via
2461          * lfsck_layout_update_lovea().
2462          */
2463         size = lfsck_lovea_size(&rec->lor_layout, ea_off);
2464         if (ea_buf->lb_len < size) {
2465                 lu_buf_realloc(ea_buf, size);
2466                 if (ea_buf->lb_buf == NULL)
2467                         GOTO(log, rc = -ENOMEM);
2468         }
2469
2470 again:
2471         do {
2472                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2473                          type, idx++);
2474                 rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2475                 if (rc != 0 && rc != -ENOENT)
2476                         GOTO(log, rc);
2477         } while (rc == 0);
2478
2479         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2480                         MDS_INODELOCK_UPDATE, LCK_PW);
2481         if (rc != 0)
2482                 GOTO(log, rc);
2483
2484         /* Recheck whether the name conflict with othrs after taken ldlm lock */
2485         rc = dt_lookup_dir(env, lfsck->li_lpf_obj, name, tfid);
2486         if (unlikely(rc == 0)) {
2487                 lfsck_unlock(llh);
2488                 goto again;
2489         }
2490
2491         if (rc != -ENOENT)
2492                 GOTO(unlock, rc);
2493
2494         pname = lfsck_name_get_const(env, name, strlen(name));
2495         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2496                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2497         if (rc != 0)
2498                 GOTO(unlock, rc);
2499
2500         /* The 1st transaction. */
2501         th = lfsck_trans_create(env, dev, lfsck);
2502         if (IS_ERR(th))
2503                 GOTO(unlock, rc = PTR_ERR(th));
2504
2505         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2506         if (rc != 0)
2507                 GOTO(stop, rc);
2508
2509         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2510         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2511                                   LU_XATTR_CREATE, th);
2512         if (rc != 0)
2513                 GOTO(stop, rc);
2514
2515         dtrec->rec_fid = pfid;
2516         dtrec->rec_type = S_IFREG;
2517         rc = dt_declare_insert(env, lpf,
2518                                (const struct dt_rec *)dtrec,
2519                                (const struct dt_key *)name, th);
2520         if (rc != 0)
2521                 GOTO(stop, rc);
2522
2523         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2524                        ldata.ld_leh->leh_len);
2525         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2526                                   XATTR_NAME_LINK, 0, th);
2527         if (rc != 0)
2528                 GOTO(stop, rc);
2529
2530         rc = dt_trans_start_local(env, dev, th);
2531         if (rc != 0)
2532                 GOTO(stop, rc);
2533
2534         dt_write_lock(env, pobj, 0);
2535         rc = dt_create(env, pobj, la, NULL, dof, th);
2536         if (rc == 0)
2537                 rc = lfsck_layout_update_lovea(env, lfsck, th, rec, pobj, cfid,
2538                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2539         dt_write_unlock(env, pobj);
2540         if (rc < 0)
2541                 GOTO(stop, rc);
2542
2543         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2544                        (const struct dt_key *)name, th);
2545         if (rc != 0)
2546                 GOTO(stop, rc);
2547
2548         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2549         if (rc == 0 && cobj != NULL) {
2550                 dt_trans_stop(env, dev, th);
2551                 th = NULL;
2552
2553                 /* The 2nd transaction. */
2554                 rc = __lfsck_layout_update_pfid(env, com, cobj, pfid,
2555                                                 &rec->lor_layout, ea_off,
2556                                                 rec->lor_layout_version,
2557                                                 rec->lor_range);
2558         }
2559
2560         GOTO(stop, rc);
2561
2562 stop:
2563         if (th != NULL)
2564                 dt_trans_stop(env, dev, th);
2565
2566 unlock:
2567         lfsck_unlock(llh);
2568
2569 log:
2570         if (cobj != NULL && !IS_ERR(cobj))
2571                 lfsck_object_put(env, cobj);
2572         if (pobj != NULL && !IS_ERR(pobj))
2573                 lfsck_object_put(env, pobj);
2574
2575         if (rc < 0)
2576                 CDEBUG(D_LFSCK,
2577                        "%s layout LFSCK assistant failed to recreate the lost MDT-object: parent "DFID", child "DFID", OST-index %u, stripe-index %u, infix %s, type %s: rc = %d\n",
2578                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2579                        ltd->ltd_index, ea_off, infix, type, rc);
2580
2581         return rc >= 0 ? 1 : rc;
2582 }
2583
2584 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2585                                                    struct lfsck_component *com,
2586                                                    const struct lu_fid *fid,
2587                                                    __u32 index)
2588 {
2589         struct lfsck_thread_info *info = lfsck_env_info(env);
2590         struct lfsck_request *lr = &info->lti_lr;
2591         struct lfsck_instance *lfsck = com->lc_lfsck;
2592         struct lfsck_tgt_desc *ltd;
2593         struct ptlrpc_request *req;
2594         struct lfsck_request *tmp;
2595         struct obd_export *exp;
2596         int rc = 0;
2597
2598         ENTRY;
2599         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2600         if (unlikely(ltd == NULL))
2601                 RETURN(-ENXIO);
2602
2603         exp = ltd->ltd_exp;
2604         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2605                 GOTO(put, rc = -EOPNOTSUPP);
2606
2607         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2608         if (req == NULL)
2609                 GOTO(put, rc = -ENOMEM);
2610
2611         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2612         if (rc != 0) {
2613                 ptlrpc_request_free(req);
2614
2615                 GOTO(put, rc);
2616         }
2617
2618         memset(lr, 0, sizeof(*lr));
2619         lr->lr_event = LE_CONDITIONAL_DESTROY;
2620         lr->lr_active = LFSCK_TYPE_LAYOUT;
2621         lr->lr_fid = *fid;
2622
2623         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2624         *tmp = *lr;
2625         ptlrpc_request_set_replen(req);
2626
2627         rc = ptlrpc_queue_wait(req);
2628         ptlrpc_req_finished(req);
2629
2630         GOTO(put, rc);
2631
2632 put:
2633         lfsck_tgt_put(ltd);
2634
2635         return rc;
2636 }
2637
2638 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2639                                                   struct lfsck_component *com,
2640                                                   struct lfsck_request *lr)
2641 {
2642         struct lfsck_thread_info *info = lfsck_env_info(env);
2643         struct lu_attr *la = &info->lti_la;
2644         union ldlm_policy_data *policy = &info->lti_policy;
2645         struct ldlm_res_id *resid = &info->lti_resid;
2646         struct lfsck_instance *lfsck = com->lc_lfsck;
2647         struct dt_device *dev = lfsck->li_bottom;
2648         struct lu_fid *fid = &lr->lr_fid;
2649         struct dt_object *obj;
2650         struct thandle *th = NULL;
2651         struct lustre_handle lh = { 0 };
2652         __u64 flags = 0;
2653         int rc = 0;
2654
2655         ENTRY;
2656         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
2657                 RETURN(0);
2658
2659         obj = lfsck_object_find_by_dev(env, dev, fid);
2660         if (IS_ERR(obj))
2661                 RETURN(PTR_ERR(obj));
2662
2663         dt_read_lock(env, obj, 0);
2664         if (dt_object_exists(obj) == 0 ||
2665             lfsck_is_dead_obj(obj)) {
2666                 dt_read_unlock(env, obj);
2667
2668                 GOTO(put, rc = -ENOENT);
2669         }
2670
2671         /* Get obj's attr without lock firstly. */
2672         rc = dt_attr_get(env, obj, la);
2673         dt_read_unlock(env, obj);
2674         if (rc != 0)
2675                 GOTO(put, rc);
2676
2677         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2678                 GOTO(put, rc = -ETXTBSY);
2679
2680         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2681         LASSERT(lfsck->li_namespace != NULL);
2682
2683         memset(policy, 0, sizeof(*policy));
2684         policy->l_extent.end = OBD_OBJECT_EOF;
2685         ost_fid_build_resid(fid, resid);
2686
2687         /* LU-17344: check the validity of the ldlm_res_id */
2688         if (unlikely(resid->name[0] == 0)) {
2689                 CERROR("%s: the res_id "DLDLMRES" built by the FID "DFID" (the obj %p) is invalid\n",
2690                        lfsck_lfsck2name(lfsck), resid->name[0], resid->name[1],
2691                        resid->name[2], resid->name[3], PFID(fid), obj);
2692
2693                 dump_stack();
2694                 return -EIO;
2695         }
2696
2697         rc = ldlm_cli_enqueue_local(env, lfsck->li_namespace, resid,
2698                                     LDLM_EXTENT, policy, LCK_EX, &flags,
2699                                     ldlm_blocking_ast, ldlm_completion_ast,
2700                                     NULL, NULL, 0, LVB_T_NONE, NULL, &lh);
2701         if (rc != ELDLM_OK)
2702                 GOTO(put, rc = -EIO);
2703
2704         dt_write_lock(env, obj, 0);
2705         /* Get obj's attr within lock again. */
2706         rc = dt_attr_get(env, obj, la);
2707         if (rc != 0)
2708                 GOTO(unlock, rc);
2709
2710         if (la->la_ctime != 0)
2711                 GOTO(unlock, rc = -ETXTBSY);
2712
2713         th = lfsck_trans_create(env, dev, lfsck);
2714         if (IS_ERR(th))
2715                 GOTO(unlock, rc = PTR_ERR(th));
2716
2717         rc = dt_declare_ref_del(env, obj, th);
2718         if (rc != 0)
2719                 GOTO(stop, rc);
2720
2721         rc = dt_declare_destroy(env, obj, th);
2722         if (rc != 0)
2723                 GOTO(stop, rc);
2724
2725         rc = dt_trans_start_local(env, dev, th);
2726         if (rc != 0)
2727                 GOTO(stop, rc);
2728
2729         rc = dt_ref_del(env, obj, th);
2730         if (rc != 0)
2731                 GOTO(stop, rc);
2732
2733         rc = dt_destroy(env, obj, th);
2734         if (rc == 0)
2735                 CDEBUG(D_LFSCK,
2736                        "%s: layout LFSCK destroyed the empty OST-object "DFID" that was created for reparing dangling referenced case. But the original missing OST-object is found now.\n",
2737                        lfsck_lfsck2name(lfsck), PFID(fid));
2738
2739         GOTO(stop, rc);
2740
2741 stop:
2742         dt_trans_stop(env, dev, th);
2743
2744 unlock:
2745         dt_write_unlock(env, obj);
2746         ldlm_lock_decref(&lh, LCK_EX);
2747
2748 put:
2749         lfsck_object_put(env, obj);
2750
2751         return rc;
2752 }
2753
2754 /**
2755  * Some OST-object has occupied the specified layout EA slot.
2756  * Such OST-object may be generated by the LFSCK when repair
2757  * dangling referenced MDT-object, which can be indicated by
2758  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2759  * is true and such OST-object has not been modified yet, we
2760  * will replace it with the orphan OST-object; otherwise the
2761  * LFSCK will create new MDT-object to reference the orphan.
2762  *
2763  * \retval       +1: repaired
2764  * \retval        0: did nothing
2765  * \retval      -ve: on error
2766  */
2767 static int lfsck_layout_conflict_create(const struct lu_env *env,
2768                                         struct lfsck_component *com,
2769                                         struct lfsck_tgt_desc *ltd,
2770                                         struct lu_orphan_rec_v3 *rec,
2771                                         struct dt_object *parent,
2772                                         struct lu_fid *cfid,
2773                                         struct lu_buf *ea_buf,
2774                                         struct lov_mds_md_v1 *lmm,
2775                                         struct lov_ost_data_v1 *slot,
2776                                         __u32 ea_off, int lovea_size)
2777 {
2778         struct lfsck_thread_info *info = lfsck_env_info(env);
2779         struct lu_fid *cfid2 = &info->lti_fid2;
2780         struct ost_id *oi = &info->lti_oi;
2781         struct dt_device *dev = lfsck_obj2dev(parent);
2782         struct thandle *th = NULL;
2783         struct lustre_handle lh = { 0 };
2784         __u32 ost_idx2 = le32_to_cpu(slot->l_ost_idx);
2785         int rc = 0;
2786
2787         ENTRY;
2788         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2789                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2790                         RETURN(0);
2791         }
2792
2793         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2794         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2795         if (rc != 0)
2796                 GOTO(out, rc);
2797
2798         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2799                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2800                               LCK_EX);
2801         if (rc != 0)
2802                 GOTO(out, rc);
2803
2804         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2805
2806         /* If the conflict OST-obejct is not created for fixing dangling
2807          * referenced MDT-object in former LFSCK check/repair, or it has
2808          * been modified by others, then we cannot destroy it. Re-create
2809          * a new MDT-object for the orphan OST-object.
2810          */
2811         if (rc == -ETXTBSY) {
2812                 /* No need the layout lock on the original parent. */
2813                 lfsck_ibits_unlock(&lh, LCK_EX);
2814
2815                 fid_zero(&rec->lor_rec.lor_fid);
2816                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2817                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2818                          ea_off);
2819                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2820                                                 info->lti_tmpbuf, "C", ea_off);
2821
2822                 RETURN(rc);
2823         }
2824
2825         if (rc != 0 && rc != -ENOENT)
2826                 GOTO(unlock, rc);
2827
2828         if (lfsck_is_dryrun(com->lc_lfsck))
2829                 GOTO(unlock, rc = 0);
2830
2831         th = lfsck_trans_create(env, dev, com->lc_lfsck);
2832         if (IS_ERR(th))
2833                 GOTO(unlock, rc = PTR_ERR(th));
2834
2835         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2836                                   LU_XATTR_REPLACE, th);
2837         if (rc != 0)
2838                 GOTO(stop, rc);
2839
2840         rc = dt_trans_start_local(env, dev, th);
2841         if (rc != 0)
2842                 GOTO(stop, rc);
2843
2844         dt_write_lock(env, parent, 0);
2845         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2846         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2847                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2848                                        ltd->ltd_index, lovea_size);
2849         dt_write_unlock(env, parent);
2850
2851         GOTO(stop, rc);
2852
2853 stop:
2854         dt_trans_stop(env, dev, th);
2855
2856 unlock:
2857         lfsck_ibits_unlock(&lh, LCK_EX);
2858
2859 out:
2860         CDEBUG(D_LFSCK,
2861                "%s: layout LFSCK assistant replaced the conflict OST-object "DFID" on the OST %x with the orphan "DFID" on the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2862                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2863                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2864                ea_off, rc);
2865
2866         return rc >= 0 ? 1 : rc;
2867 }
2868
2869 /**
2870  * \retval       +1: repaired
2871  * \retval        0: did nothing
2872  * \retval      -ve: on error
2873  */
2874 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2875                                        struct lfsck_component *com,
2876                                        struct lfsck_tgt_desc *ltd,
2877                                        struct lu_orphan_rec_v3 *rec,
2878                                        struct dt_object *parent,
2879                                        struct lu_fid *cfid,
2880                                        __u32 ost_idx, __u32 ea_off)
2881 {
2882         struct lfsck_thread_info *info = lfsck_env_info(env);
2883         struct lu_buf *buf = &info->lti_big_buf;
2884         struct lu_fid *fid = &info->lti_fid2;
2885         struct ost_id *oi = &info->lti_oi;
2886         struct lfsck_instance *lfsck = com->lc_lfsck;
2887         struct dt_device *dt = lfsck_obj2dev(parent);
2888         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
2889         struct ost_layout *ol = &rec->lor_layout;
2890         struct lov_comp_md_v1 *lcm = NULL;
2891         struct lov_comp_md_entry_v1 *lcme = NULL;
2892         struct thandle *handle = NULL;
2893         size_t lovea_size;
2894         struct lov_mds_md_v1 *lmm;
2895         struct lov_ost_data_v1 *objs;
2896         struct lustre_handle lh = { 0 };
2897         __u32 magic;
2898         __u32 flags = 0;
2899         int fl = 0;
2900         int rc = 0;
2901         int rc1;
2902         int i;
2903         int pos = 0;
2904         __u16 count;
2905         bool locked = false;
2906         bool new_mirror = true;
2907
2908         ENTRY;
2909         if (lfsck_is_dryrun(lfsck))
2910                 RETURN(0);
2911
2912         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2913                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2914                               LCK_EX);
2915         if (rc != 0) {
2916                 CDEBUG(D_LFSCK,
2917                        "%s: layout LFSCK assistant failed to recreate LOV EA for "DFID": parent "DFID", OST-index %u, stripe-index %u, comp_id %u, comp_start %llu, comp_end %llu, layout version %u, range %u: rc = %d\n",
2918                        lfsck_lfsck2name(lfsck), PFID(cfid),
2919                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2920                        ol->ol_comp_id, ol->ol_comp_start,
2921                        ol->ol_comp_end, rec->lor_layout_version,
2922                        rec->lor_range, rc);
2923
2924                 RETURN(rc);
2925         }
2926
2927 again:
2928         if (locked) {
2929                 dt_write_unlock(env, parent);
2930                 locked = false;
2931         }
2932
2933         if (handle != NULL) {
2934                 dt_trans_stop(env, dt, handle);
2935                 handle = NULL;
2936         }
2937
2938         if (rc < 0)
2939                 GOTO(unlock_layout, rc);
2940
2941         lovea_size = rc;
2942         if (buf->lb_len < lovea_size) {
2943                 lu_buf_realloc(buf, lovea_size);
2944                 if (buf->lb_buf == NULL)
2945                         GOTO(unlock_layout, rc = -ENOMEM);
2946         }
2947
2948         if (!(bk->lb_param & LPF_DRYRUN)) {
2949                 handle = lfsck_trans_create(env, dt, lfsck);
2950                 if (IS_ERR(handle))
2951                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2952
2953                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2954                                           fl, handle);
2955                 if (rc != 0)
2956                         GOTO(stop, rc);
2957
2958                 rc = dt_trans_start_local(env, dt, handle);
2959                 if (rc != 0)
2960                         GOTO(stop, rc);
2961         }
2962
2963         dt_write_lock(env, parent, 0);
2964         locked = true;
2965         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2966         if (rc == -ERANGE) {
2967                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2968                 LASSERT(rc != 0);
2969                 goto again;
2970         } else if (rc == -ENODATA || rc == 0) {
2971                 lovea_size = lfsck_lovea_size(ol, ea_off);
2972                 /* If the declared is not big enough, re-try. */
2973                 if (buf->lb_len < lovea_size) {
2974                         rc = lovea_size;
2975                         goto again;
2976                 }
2977                 fl = LU_XATTR_CREATE;
2978         } else if (rc < 0) {
2979                 GOTO(unlock_parent, rc);
2980         } else if (unlikely(buf->lb_len == 0)) {
2981                 goto again;
2982         } else {
2983                 fl = LU_XATTR_REPLACE;
2984                 lovea_size = rc;
2985         }
2986
2987         if (fl == LU_XATTR_CREATE) {
2988                 if (bk->lb_param & LPF_DRYRUN)
2989                         GOTO(unlock_parent, rc = 1);
2990
2991                 LASSERT(buf->lb_len >= lovea_size);
2992
2993                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
2994                                                cfid, buf, fl, ost_idx, ea_off);
2995
2996                 GOTO(unlock_parent, rc);
2997         }
2998
2999         lmm = buf->lb_buf;
3000         rc1 = lfsck_layout_verify_header(parent, lmm, lovea_size);
3001
3002         /* If the LOV EA crashed, the rebuild it. */
3003         if (rc1 == -EINVAL) {
3004                 if (bk->lb_param & LPF_DRYRUN)
3005                         GOTO(unlock_parent, rc = 1);
3006
3007                 LASSERT(buf->lb_len >= lovea_size);
3008
3009                 rc = lfsck_layout_update_lovea(env, lfsck, handle, rec, parent,
3010                                                cfid, buf, fl, ost_idx, ea_off);
3011
3012                 GOTO(unlock_parent, rc);
3013         }
3014
3015         /* For other unknown magic/pattern, keep the current LOV EA. */
3016         if (rc1 == -EOPNOTSUPP)
3017                 GOTO(unlock_parent, rc1 = 0);
3018
3019         if (rc1)
3020                 GOTO(unlock_parent, rc = rc1);
3021
3022         magic = le32_to_cpu(lmm->lmm_magic);
3023         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3024                 __u64 start;
3025                 __u64 end;
3026                 __u16 mirror_id0 = mirror_id_of(ol->ol_comp_id);
3027                 __u16 mirror_id1;
3028
3029                 if (bk->lb_param & LPF_DRYRUN)
3030                         GOTO(unlock_parent, rc = 1);
3031
3032                 lcm = buf->lb_buf;
3033                 count = le16_to_cpu(lcm->lcm_entry_count);
3034                 for (i = 0; i < count; pos = ++i) {
3035                         lcme = &lcm->lcm_entries[i];
3036                         start = le64_to_cpu(lcme->lcme_extent.e_start);
3037                         end = le64_to_cpu(lcme->lcme_extent.e_end);
3038                         mirror_id1 = mirror_id_of(le32_to_cpu(lcme->lcme_id));
3039
3040                         if (mirror_id0 > mirror_id1)
3041                                 continue;
3042
3043                         if (mirror_id0 < mirror_id1)
3044                                 break;
3045
3046                         new_mirror = false;
3047                         if (end <= ol->ol_comp_start)
3048                                 continue;
3049
3050                         if (start >= ol->ol_comp_end)
3051                                 break;
3052
3053                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
3054                         magic = le32_to_cpu(lmm->lmm_magic);
3055                         flags = le32_to_cpu(lcme->lcme_flags);
3056                         goto further;
3057                 }
3058
3059                 rc = lfsck_layout_add_comp(env, lfsck, handle, rec, parent,
3060                                 cfid, buf, ost_idx, ea_off, pos, new_mirror);
3061
3062                 GOTO(unlock_parent, rc);
3063         }
3064
3065 further:
3066         count = le16_to_cpu(lmm->lmm_stripe_count);
3067         if (count == 0)
3068                 GOTO(unlock_parent, rc = -EINVAL);
3069         LASSERT(count > 0);
3070
3071         /* Exceed the current end of MDT-object layout EA. Then extend it. */
3072         if (count <= ea_off) {
3073                 if (bk->lb_param & LPF_DRYRUN)
3074                         GOTO(unlock_parent, rc = 1);
3075
3076                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
3077                 /* If the declared is not big enough, re-try. */
3078                 if (buf->lb_len < lovea_size) {
3079                         rc = lovea_size;
3080                         goto again;
3081                 }
3082
3083                 if (lcm) {
3084                         LASSERT(lcme);
3085
3086                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
3087                         lfsck_layout_update_lcm(lcm, lcme,
3088                                                 rec->lor_layout_version,
3089                                                 rec->lor_range);
3090                 }
3091
3092                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
3093                                         parent, cfid, buf, ost_idx, ea_off);
3094
3095                 GOTO(unlock_parent, rc);
3096         }
3097
3098         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
3099
3100         if (magic == LOV_MAGIC_V1) {
3101                 objs = &lmm->lmm_objects[0];
3102         } else {
3103                 LASSERT(magic == LOV_MAGIC_V3);
3104                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
3105         }
3106
3107         for (i = 0; i < count; i++, objs++) {
3108                 /* The MDT-object was created via lfsck_layout_recover_create()
3109                  * by others before, and we fill the dummy layout EA.
3110                  */
3111                 if ((lcme && !(flags & LCME_FL_INIT)) ||
3112                      lovea_slot_is_dummy(objs)) {
3113                         if (i != ea_off)
3114                                 continue;
3115
3116                         if (bk->lb_param & LPF_DRYRUN)
3117                                 GOTO(unlock_parent, rc = 1);
3118
3119                         lmm->lmm_layout_gen =
3120                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3121                         if (lcme) {
3122                                 LASSERT(lcm);
3123
3124                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
3125                                         ol->ol_stripe_size ||
3126                                     le16_to_cpu(lmm->lmm_stripe_count) !=
3127                                         ol->ol_stripe_count ||
3128                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
3129                                         ol->ol_comp_start ||
3130                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
3131                                         ol->ol_comp_end) {
3132                                         CDEBUG(D_LFSCK,
3133                                                "%s: found invalid component for "DFID ": parent "DFID", stripe-index %u, stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, comp_end %llu, cur_stripe_size %u, cur_stripe_count %u, cur_comp_start %llu, cur_comp_end %llu\n",
3134                                         lfsck_lfsck2name(lfsck), PFID(cfid),
3135                                         PFID(lfsck_dto2fid(parent)), ea_off,
3136                                         ol->ol_stripe_size,
3137                                         ol->ol_stripe_count, ol->ol_comp_id,
3138                                         ol->ol_comp_start, ol->ol_comp_end,
3139                                         le32_to_cpu(lmm->lmm_stripe_size),
3140                                         le16_to_cpu(lmm->lmm_stripe_count),
3141                                         le64_to_cpu(lcme->lcme_extent.e_start),
3142                                         le64_to_cpu(lcme->lcme_extent.e_end));
3143
3144                                         GOTO(unlock_parent, rc = -EINVAL);
3145                                 }
3146
3147                                 lovea_size = le32_to_cpu(lcm->lcm_size);
3148                                 lcme->lcme_flags = cpu_to_le32(flags |
3149                                                                LCME_FL_INIT);
3150                                 lfsck_layout_update_lcm(lcm, lcme,
3151                                                         rec->lor_layout_version,
3152                                                         rec->lor_range);
3153                         }
3154
3155                         LASSERTF(buf->lb_len >= lovea_size,
3156                                  "buffer len %d is less than real size %d\n",
3157                                  (int)buf->lb_len, (int)lovea_size);
3158
3159                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
3160                                                 parent, cfid, buf, lmm, objs,
3161                                                 fl, ost_idx, lovea_size);
3162
3163                         CDEBUG(D_LFSCK,
3164                                "%s layout LFSCK assistant fill dummy layout slot for "DFID": parent "DFID", OST-index %u, stripe-index %u: rc = %d\n",
3165                                lfsck_lfsck2name(lfsck), PFID(cfid),
3166                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
3167
3168                         GOTO(unlock_parent, rc);
3169                 }
3170
3171                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3172                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
3173                 if (rc != 0) {
3174                         CDEBUG(D_LFSCK,
3175                                "%s: the parent "DFID" contains invalid layout EA at the slot %d, index %u\n",
3176                                lfsck_lfsck2name(lfsck),
3177                                PFID(lfsck_dto2fid(parent)), i,
3178                                le32_to_cpu(objs->l_ost_idx));
3179
3180                         GOTO(unlock_parent, rc);
3181                 }
3182
3183                 /* It should be rare case, the slot is there, but the LFSCK
3184                  * does not handle it during the first-phase cycle scanning.
3185                  */
3186                 if (unlikely(lu_fid_eq(fid, cfid))) {
3187                         if (i == ea_off) {
3188                                 GOTO(unlock_parent, rc = 0);
3189                         } else {
3190                                 /* Rare case that the OST-object index
3191                                  * does not match the parent MDT-object
3192                                  * layout EA. We trust the later one.
3193                                  */
3194                                 if (bk->lb_param & LPF_DRYRUN)
3195                                         GOTO(unlock_parent, rc = 1);
3196
3197                                 dt_write_unlock(env, parent);
3198                                 if (handle != NULL)
3199                                         dt_trans_stop(env, dt, handle);
3200                                 lfsck_ibits_unlock(&lh, LCK_EX);
3201                                 rc = lfsck_layout_update_pfid(env, com, parent,
3202                                                         cfid, ltd->ltd_tgt,
3203                                                         rec, i);
3204
3205                                 CDEBUG(D_LFSCK,
3206                                        "%s layout LFSCK assistant updated OST-object's pfid for "DFID": parent "DFID", OST-index %u, stripe-index %u: rc = %d\n",
3207                                        lfsck_lfsck2name(lfsck), PFID(cfid),
3208                                        PFID(lfsck_dto2fid(parent)),
3209                                        ltd->ltd_index, i, rc);
3210
3211                                 RETURN(rc);
3212                         }
3213                 }
3214         }
3215
3216         /* MDT-obj exists, but related layout EA slot is occupied by others */
3217         if (bk->lb_param & LPF_DRYRUN)
3218                 GOTO(unlock_parent, rc = 1);
3219
3220         dt_write_unlock(env, parent);
3221         if (handle != NULL)
3222                 dt_trans_stop(env, dt, handle);
3223         lfsck_ibits_unlock(&lh, LCK_EX);
3224         if (magic == LOV_MAGIC_V1)
3225                 objs = &lmm->lmm_objects[ea_off];
3226         else
3227                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3228         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3229                                           buf, lmm, objs, ea_off, lovea_size);
3230
3231         RETURN(rc);
3232
3233 unlock_parent:
3234         if (locked)
3235                 dt_write_unlock(env, parent);
3236
3237 stop:
3238         if (handle != NULL)
3239                 dt_trans_stop(env, dt, handle);
3240
3241 unlock_layout:
3242         lfsck_ibits_unlock(&lh, LCK_EX);
3243
3244         return rc;
3245 }
3246
3247 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3248                                         struct lfsck_component *com,
3249                                         struct lfsck_tgt_desc *ltd,
3250                                         struct lu_orphan_rec_v3 *rec,
3251                                         struct lu_fid *cfid)
3252 {
3253         struct lfsck_layout *lo = com->lc_file_ram;
3254         struct lu_fid *pfid = &rec->lor_rec.lor_fid;
3255         struct dt_object *parent = NULL;
3256         __u32 ea_off = pfid->f_stripe_idx;
3257         int rc = 0;
3258
3259         ENTRY;
3260         if (!fid_is_sane(cfid))
3261                 GOTO(out, rc = -EINVAL);
3262
3263         pfid->f_ver = 0;
3264         if (fid_is_zero(pfid)) {
3265                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3266                                                   "", "N", ea_off);
3267                 GOTO(out, rc);
3268         }
3269
3270         if (!fid_is_sane(pfid))
3271                 GOTO(out, rc = -EINVAL);
3272
3273         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3274         if (IS_ERR(parent))
3275                 GOTO(out, rc = PTR_ERR(parent));
3276
3277         if (unlikely(dt_object_remote(parent) != 0))
3278                 GOTO(put, rc = -EXDEV);
3279
3280         if (dt_object_exists(parent) == 0) {
3281                 lfsck_object_put(env, parent);
3282                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3283                                                   "", "R", ea_off);
3284                 GOTO(out, rc);
3285         }
3286
3287         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3288                 GOTO(put, rc = -EISDIR);
3289
3290         /* The orphan OST-object claims to be the parent's stripe, then
3291          * related dangling record in the trace file is meaningless.
3292          */
3293         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3294                                            rec->lor_layout.ol_comp_id, ea_off);
3295         if (rc && rc != -ENOENT)
3296                 GOTO(put, rc);
3297
3298         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3299                                          ltd->ltd_index, ea_off);
3300
3301         GOTO(put, rc);
3302
3303 put:
3304         if (rc <= 0)
3305                 lfsck_object_put(env, parent);
3306         else
3307                 /* The layout EA is changed, need to be reloaded next time. */
3308                 dt_object_put_nocache(env, parent);
3309
3310 out:
3311         down_write(&com->lc_sem);
3312         com->lc_new_scanned++;
3313         com->lc_new_checked++;
3314         if (rc > 0) {
3315                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3316                 rc = 0;
3317         } else if (rc < 0) {
3318                 lo->ll_objs_failed_phase2++;
3319         }
3320         up_write(&com->lc_sem);
3321
3322         return rc;
3323 }
3324
3325 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3326                                     struct lfsck_component *com,
3327                                     struct lfsck_tgt_desc *ltd)
3328 {
3329         struct lfsck_assistant_data *lad = com->lc_data;
3330         struct lfsck_instance *lfsck = com->lc_lfsck;
3331         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
3332         struct lfsck_thread_info *info = lfsck_env_info(env);
3333         struct lu_fid *fid = &info->lti_fid;
3334         struct dt_object *obj;
3335         const struct dt_it_ops *iops;
3336         struct dt_it *di;
3337         int rc = 0;
3338
3339         ENTRY;
3340         CDEBUG(D_LFSCK,
3341                "%s: layout LFSCK assistant starts the orphan scanning for OST%04x\n",
3342                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3343
3344         if (test_bit(ltd->ltd_index, lad->lad_bitmap)) {
3345                 CDEBUG(D_LFSCK,
3346                        "%s: layout LFSCK assistant skip the orphan scanning for OST%04x\n",
3347                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3348
3349                 RETURN(0);
3350         }
3351
3352         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3353         fid->f_oid = fid->f_ver = 0;
3354
3355         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3356         if (unlikely(IS_ERR(obj)))
3357                 GOTO(log, rc = PTR_ERR(obj));
3358
3359         rc = obj->do_ops->do_index_try(env, obj,
3360                                        &dt_lfsck_layout_orphan_features);
3361         if (rc != 0)
3362                 GOTO(put, rc);
3363
3364         iops = &obj->do_index_ops->dio_it;
3365         di = iops->init(env, obj, 0);
3366         if (IS_ERR(di))
3367                 GOTO(put, rc = PTR_ERR(di));
3368
3369         rc = iops->load(env, di, 0);
3370         if (rc == -ESRCH) {
3371                 /* -ESRCH means that the orphan OST-objects rbtree has been
3372                  * cleanup because of the OSS server restart or other errors.
3373                  */
3374                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3375                 GOTO(fini, rc);
3376         }
3377
3378         if (rc == 0)
3379                 rc = iops->next(env, di);
3380         else if (rc > 0)
3381                 rc = 0;
3382
3383         if (rc < 0)
3384                 GOTO(fini, rc);
3385
3386         if (rc > 0)
3387                 GOTO(fini, rc = 0);
3388
3389         do {
3390                 struct dt_key           *key;
3391                 struct lu_orphan_rec_v3 *rec = &info->lti_rec;
3392
3393                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3394                     unlikely(!thread_is_running(&lfsck->li_thread)))
3395                         break;
3396
3397                 key = iops->key(env, di);
3398                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3399                 /* Remote target OST may be runnning old LFSCK */
3400                 memset(rec, 0, sizeof(*rec));
3401                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3402                 if (rc == 0)
3403                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3404                                         &com->lc_fid_latest_scanned_phase2);
3405                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3406                         GOTO(fini, rc);
3407
3408                 lfsck_control_speed_by_self(com);
3409                 do {
3410                         rc = iops->next(env, di);
3411                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3412         } while (rc == 0);
3413
3414         GOTO(fini, rc);
3415
3416 fini:
3417         iops->put(env, di);
3418         iops->fini(env, di);
3419 put:
3420         lfsck_object_put(env, obj);
3421
3422 log:
3423         CDEBUG(D_LFSCK,
3424                "%s: layout LFSCK assistant finished the orphan scanning for OST%04x: rc = %d\n",
3425                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3426
3427         return rc > 0 ? 0 : rc;
3428 }
3429
3430 static int lfsck_lov2layout(struct lov_mds_md_v1 *lmm, struct filter_fid *ff,
3431                             __u32 comp_id)
3432 {
3433         struct ost_layout *ol = &ff->ff_layout;
3434         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3435         int rc = 0;
3436
3437         ENTRY;
3438         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3439                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3440                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3441                 ol->ol_comp_start = 0;
3442                 ol->ol_comp_end = 0;
3443                 ol->ol_comp_id = 0;
3444                 ff->ff_layout_version = 0;
3445                 ff->ff_range = 0;
3446         } else if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3447                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3448                 struct lov_comp_md_entry_v1 *lcme = NULL;
3449                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3450                 int i;
3451
3452                 for (i = 0; i < count; i++) {
3453                         lcme = &lcm->lcm_entries[i];
3454                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3455                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3456                                         LCME_FL_INIT);
3457
3458                                 break;
3459                         }
3460                 }
3461
3462                 /* The comp has been removed, do nothing. */
3463                 if (i == count)
3464                         GOTO(out, rc = 1);
3465
3466                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3467                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3468                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3469                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3470                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3471                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3472                 ff->ff_layout_version = le32_to_cpu(lcme->lcme_layout_gen);
3473                 ff->ff_range = 0;
3474         } else {
3475                 GOTO(out, rc = -EINVAL);
3476         }
3477
3478         EXIT;
3479
3480 out:
3481         return rc;
3482 }
3483
3484 /**
3485  * Repair the MDT-object with dangling LOV EA reference.
3486  *
3487  * we need to repair the inconsistency according to the users' requirement:
3488  *
3489  * 1) Keep the inconsistency there and report the inconsistency case,
3490  *    then give the chance to the application to find related issues,
3491  *    and the users can make the decision about how to handle it with
3492  *    more human knownledge. (by default)
3493  *
3494  * 2) Re-create the missing OST-object with the FID/owner information.
3495  *
3496  * \param[in] env       pointer to the thread context
3497  * \param[in] com       the layout LFSCK component
3498  * \param[in] parent    the MDT-object with dangling LOV EA reference
3499  * \param[in] child     the OST-object to be created
3500  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3501  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3502  * \param[in] ost_idx   the index of OST on which the OST-object resides
3503  *
3504  * \retval              +1 for repair successfully
3505  * \retval              0 for did nothing
3506  * \retval              negative error number on failure
3507  */
3508 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3509                                           struct lfsck_component *com,
3510                                           struct dt_object *parent,
3511                                           struct dt_object *child,
3512                                           __u32 comp_id, __u32 ea_off,
3513                                           __u32 ost_idx, bool log)
3514 {
3515         struct lfsck_thread_info *info = lfsck_env_info(env);
3516         struct filter_fid *ff = &info->lti_ff;
3517         struct dt_object_format *dof = &info->lti_dof;
3518         struct lu_attr *la = &info->lti_la;
3519         struct lfsck_instance *lfsck = com->lc_lfsck;
3520         struct dt_device *dev = lfsck_obj2dev(child);
3521         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3522         const struct lu_fid *cfid = lfsck_dto2fid(child);
3523         struct lu_buf *tbuf = &info->lti_big_buf;
3524         struct thandle *handle;
3525         struct lu_buf *buf;
3526         struct lustre_handle lh = { 0 };
3527         int rc;
3528
3529         ENTRY;
3530         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3531                 GOTO(log, rc = 1);
3532
3533         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3534                 GOTO(log, rc = 1);
3535
3536         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3537                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3538                               LCK_EX);
3539         if (rc != 0)
3540                 GOTO(log, rc);
3541
3542         rc = dt_attr_get(env, parent, la);
3543         if (rc != 0)
3544                 GOTO(unlock1, rc);
3545
3546         la->la_mode = S_IFREG | 0666;
3547         la->la_atime = la->la_mtime = la->la_ctime = 0;
3548         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3549                        LA_ATIME | LA_MTIME | LA_CTIME;
3550         memset(dof, 0, sizeof(*dof));
3551         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3552         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3553         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3554          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3555          * parent MDT-object's layout EA.
3556          */
3557         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3558
3559         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3560         if (unlikely(rc == -ENODATA))
3561                 rc = 0;
3562         if (rc <= 0)
3563                 GOTO(unlock1, rc);
3564
3565         rc = lfsck_lov2layout(tbuf->lb_buf, ff, comp_id);
3566         if (rc)
3567                 GOTO(unlock1, rc);
3568
3569         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3570         handle = lfsck_trans_create(env, dev, lfsck);
3571         if (IS_ERR(handle))
3572                 GOTO(unlock1, rc = PTR_ERR(handle));
3573
3574         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3575         if (rc != 0)
3576                 GOTO(stop, rc);
3577
3578         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3579                                   LU_XATTR_CREATE, handle);
3580         if (rc != 0)
3581                 GOTO(stop, rc);
3582
3583         rc = dt_trans_start_local(env, dev, handle);
3584         if (rc != 0)
3585                 GOTO(stop, rc);
3586
3587         dt_read_lock(env, parent, 0);
3588         if (unlikely(lfsck_is_dead_obj(parent)))
3589                 GOTO(unlock2, rc = 0);
3590
3591         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3592                 struct ost_id *oi = &info->lti_oi;
3593                 struct lu_fid *tfid = &info->lti_fid2;
3594                 struct lu_buf *lovea = &info->lti_big_buf;
3595                 struct lov_mds_md_v1 *lmm;
3596                 struct lov_ost_data_v1 *objs;
3597                 __u32 magic;
3598                 int count;
3599                 int idx2;
3600
3601                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3602                 if (unlikely(rc == -ENODATA))
3603                         rc = 0;
3604                 if (rc <= 0)
3605                         GOTO(unlock2, rc);
3606
3607                 lmm = lovea->lb_buf;
3608                 magic = le32_to_cpu(lmm->lmm_magic);
3609                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
3610                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3611                         struct lov_comp_md_entry_v1 *lcme;
3612                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3613                         int i;
3614
3615                         for (i = 0; i < count; i++) {
3616                                 lcme = &lcm->lcm_entries[i];
3617                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3618                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3619                                                 LCME_FL_INIT);
3620
3621                                         lmm = lovea->lb_buf +
3622                                                 le32_to_cpu(lcme->lcme_offset);
3623                                         magic = le32_to_cpu(lmm->lmm_magic);
3624                                         goto check;
3625                                 }
3626                         }
3627
3628                         /* Someone removed the component, do nothing. */
3629                         GOTO(unlock2, rc = 0);
3630                 }
3631
3632 check:
3633                 count = le16_to_cpu(lmm->lmm_stripe_count);
3634                 /* Someone changed the LOV EA, do nothing. */
3635                 if (count <= ea_off)
3636                         GOTO(unlock2, rc = 0);
3637
3638                 if (magic == LOV_MAGIC_V1) {
3639                         objs = &lmm->lmm_objects[ea_off];
3640                 } else {
3641                         LASSERT(magic == LOV_MAGIC_V3);
3642
3643                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3644                                                         lmm_objects[ea_off];
3645                 }
3646
3647                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3648                 idx2 = le32_to_cpu(objs->l_ost_idx);
3649                 rc = ostid_to_fid(tfid, oi, idx2);
3650                 /* Someone changed the LOV EA, do nothing. */
3651                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3652                         GOTO(unlock2, rc);
3653         }
3654
3655         rc = dt_create(env, child, la, NULL, dof, handle);
3656         if (rc != 0)
3657                 GOTO(unlock2, rc);
3658
3659         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3660                           handle);
3661
3662         GOTO(unlock2, rc);
3663
3664 unlock2:
3665         dt_read_unlock(env, parent);
3666
3667 stop:
3668         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3669
3670 unlock1:
3671         lfsck_ibits_unlock(&lh, LCK_EX);
3672
3673 log:
3674         if (rc && log)
3675                 CDEBUG(D_LFSCK,
3676                        "%s: layout LFSCK assistant found dangling reference for: parent "DFID", child "DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3677                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3678                        comp_id, ea_off, ost_idx,
3679                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3680                                 "Create the lost OST-object as required" :
3681                                 "Keep the MDT-object there by default", rc);
3682
3683         return rc;
3684 }
3685
3686 /**
3687  * Repair the MDT-object with dangling LOV EA reference.
3688  *
3689  * Prepare parameters and call __lfsck_layout_repair_dangling()
3690  * to repair the dangling LOV EA reference.
3691  *
3692  * \param[in] env       pointer to the thread context
3693  * \param[in] com       the layout LFSCK component
3694  * \param[in] pfid      the MDT-object's FID
3695  * \param[in] cfid      the FID for the OST-object to be created
3696  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3697  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3698  * \param[in] ost_idx   the index of OST on which the OST-object resides
3699  *
3700  * \retval              +1 for repair successfully
3701  * \retval              0 for did nothing
3702  * \retval              negative error number on failure
3703  */
3704 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3705                                         struct lfsck_component *com,
3706                                         const struct lu_fid *pfid,
3707                                         const struct lu_fid *cfid,
3708                                         __u32 comp_id, __u32 ea_off,
3709                                         __u32 ost_idx)
3710 {
3711         struct lfsck_instance *lfsck = com->lc_lfsck;
3712         struct dt_object *parent = NULL;
3713         struct dt_object *child = NULL;
3714         struct lfsck_tgt_desc *ltd;
3715         int rc;
3716
3717         ENTRY;
3718         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3719         if (IS_ERR(parent))
3720                 GOTO(log, rc = PTR_ERR(parent));
3721
3722         /* The MDT-object has been removed. */
3723         if (dt_object_exists(parent) == 0)
3724                 GOTO(log, rc = 0);
3725
3726         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3727         if (unlikely(ltd == NULL))
3728                 GOTO(log, rc = -ENODEV);
3729
3730         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3731         if (IS_ERR(child))
3732                 GOTO(log, rc = PTR_ERR(child));
3733
3734         /* The OST-object has been created. */
3735         if (unlikely(dt_object_exists(child) != 0))
3736                 GOTO(log, rc = 0);
3737
3738         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3739                                             comp_id, ea_off, ost_idx, false);
3740
3741         GOTO(log, rc);
3742
3743 log:
3744         if (child != NULL && !IS_ERR(child))
3745                 lfsck_object_put(env, child);
3746
3747         if (parent != NULL && !IS_ERR(parent))
3748                 lfsck_object_put(env, parent);
3749
3750         if (rc)
3751                 CDEBUG(D_LFSCK,
3752                        "%s: layout LFSCK assistant found dangling reference for: parent "DFID", child "DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3753                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3754                        comp_id, ea_off, ost_idx,
3755                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3756                                 "Create the lost OST-object as required" :
3757                                 "Keep the MDT-object there by default", rc);
3758
3759         return rc;
3760 }
3761
3762 /* If the OST-object does not recognize the MDT-object as its parent, and
3763  * no other MDT-object claims to be its parent, then just trust the given
3764  * MDT-object as its parent and update the OST-object's filter_fid xattr.
      *
      * Under an EX ibits lock (LAYOUT | XATTR) on @parent, a filter_fid is built
      * in the per-thread info->lti_ff from @parent's FID, the stripe index
      * llr->llr_lov_idx and the parent's LOV EA. Then, inside one local
      * transaction on the child's device and with @parent write-locked, it is
      * stored on llr->llr_child as XATTR_NAME_FID and the parent's current
      * uid/gid are copied to the child.
      *
      * @la is scratch space supplied by the caller: it is overwritten with the
      * parent's attributes and its la_valid is reduced to LA_UID | LA_GID.
      *
      * Returns 0 in dry-run mode (LPF_DRYRUN) or when lfsck_layout_get_lovea()
      * reports 0 or -ENODATA for the parent, and a negative errno when the lock,
      * the LOV EA load, lfsck_lov2layout() or the transaction creation fails.
      * Once the transaction handle exists, the result is whatever
      * lfsck_layout_trans_stop() derives from the accumulated rc (rc is 1 when
      * the parent turned out to be dead); that helper is not visible here, so
      * presumably a positive value means "handled" - confirm.
      * Any non-zero result is logged through CDEBUG(D_LFSCK).
3765  */
3766 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3767                                               struct lfsck_component *com,
3768                                               struct dt_object *parent,
3769                                               struct lfsck_layout_req *llr,
3770                                               struct lu_attr *la)
3771 {
3772         struct lfsck_thread_info *info = lfsck_env_info(env);
3773         struct filter_fid *ff = &info->lti_ff;
3774         struct dt_object *child  = llr->llr_child;
3775         struct dt_device *dev = lfsck_obj2dev(child);
3776         const struct lu_fid *tfid = lu_object_fid(&parent->do_lu);
3777         struct lu_buf *tbuf = &info->lti_big_buf;
3778         struct thandle *handle;
3779         struct lu_buf *buf;
3780         struct lustre_handle lh = { 0 };
3781         int rc;
3782
3783         ENTRY;
             /* Dry-run: nothing is locked or modified. */
3784         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3785                 GOTO(log, rc = 0);
3786
3787         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3788                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3789                               LCK_EX);
3790         if (rc != 0)
3791                 GOTO(log, rc);
3792
             /* The filter_fid is stored little-endian. */
3793         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3794         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3795         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3796          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3797          * parent MDT-object's layout EA.
3798          */
3799         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3800
             /* Load the parent's LOV EA into the per-thread big buffer; with no
              * LOV EA (0 or -ENODATA) there is nothing to repair.
              */
3801         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3802         if (unlikely(rc == -ENODATA))
3803                 rc = 0;
3804         if (rc <= 0)
3805                 GOTO(unlock1, rc);
3806
             /* Presumably fills the layout part of @ff for component
              * llr->llr_comp_id from the LOV EA - confirm against the helper.
              */
3807         rc = lfsck_lov2layout(tbuf->lb_buf, ff, llr->llr_comp_id);
3808         if (rc)
3809                 GOTO(unlock1, rc);
3810
3811         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3812
3813         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
3814         if (IS_ERR(handle))
3815                 GOTO(unlock1, rc = PTR_ERR(handle));
3816
3817         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3818         if (rc != 0)
3819                 GOTO(stop, rc);
3820
             /* This read only feeds the declaration below; the owner is read
              * again under the parent's write lock before it is applied.
              */
3821         rc = dt_attr_get(env, parent, la);
3822         if (rc != 0)
3823                 GOTO(stop, rc);
3824
3825         la->la_valid = LA_UID | LA_GID;
3826         rc = dt_declare_attr_set(env, child, la, handle);
3827         if (rc != 0)
3828                 GOTO(stop, rc);
3829
3830         rc = dt_trans_start_local(env, dev, handle);
3831         if (rc != 0)
3832                 GOTO(stop, rc);
3833
             /* A parent that died in the meantime is left alone (rc = 1, no
              * xattr or owner is written).
              */
3834         dt_write_lock(env, parent, 0);
3835         if (unlikely(lfsck_is_dead_obj(parent)))
3836                 GOTO(unlock2, rc = 1);
3837
3838         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3839         if (rc != 0)
3840                 GOTO(unlock2, rc);
3841
3842         /* Get the latest parent's owner. */
3843         rc = dt_attr_get(env, parent, la);
3844         if (rc != 0)
3845                 GOTO(unlock2, rc);
3846
3847         la->la_valid = LA_UID | LA_GID;
3848         rc = dt_attr_set(env, child, la, handle);
3849
3850         GOTO(unlock2, rc);
3851
             /* Cleanup labels release resources in reverse acquisition order. */
3852 unlock2:
3853         dt_write_unlock(env, parent);
3854
3855 stop:
             /* The helper stops the transaction and produces the final rc. */
3856         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3857
3858 unlock1:
3859         lfsck_ibits_unlock(&lh, LCK_EX);
3860
3861 log:
             /* Logged for every non-zero rc, i.e. for positive results as well
              * as for errors.
              */
3862         if (rc)
3863                 CDEBUG(D_LFSCK,
3864                        "%s: layout LFSCK assistant repaired unmatched MDT-OST pair for: parent "DFID", child "DFID", comp_id %u, OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
3865                        lfsck_lfsck2name(com->lc_lfsck),
3866                        PFID(lfsck_dto2fid(parent)),
3867                        PFID(lfsck_dto2fid(child)),
3868                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3869                        la->la_uid, la->la_gid, rc);
3870
3871         return rc;
3872 }
3873
3874 /* If more than one MDT-object claims to be the OST-object's parent, and
3875  * the OST-object only recognizes one of them, then we need to generate
3876  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s).
      *
      * A new object is created on the device of llr->llr_child, owned by the
      * uid/gid of @parent, and the slot llr->llr_lov_idx of the parent's LOV EA
      * (inside component llr->llr_comp_id for composite layouts) is rewritten to
      * point at it.
      *
      * @la is scratch space supplied by the caller: it is overwritten with the
      * parent's attributes and its la_valid is reduced to LA_UID | LA_GID.
      *
      * Returns 0 in dry-run mode (LPF_DRYRUN), and also when the parent is dead,
      * has no LOV EA, has no component with the requested id, or when the slot
      * no longer references llr->llr_child. Returns 1 when the LOV EA has been
      * replaced, and a negative errno on failure. Any non-zero result is logged
      * through CDEBUG(D_LFSCK).
3877  */
3878 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3879                                                    struct lfsck_component *com,
3880                                                    struct dt_object *parent,
3881                                                    struct lfsck_layout_req *llr,
3882                                                    struct lu_attr *la)
3883 {
3884         struct lfsck_thread_info *info = lfsck_env_info(env);
3885         struct dt_allocation_hint *hint = &info->lti_hint;
3886         struct dt_object_format *dof = &info->lti_dof;
3887         struct ost_id *oi = &info->lti_oi;
3888         struct lu_buf *buf = &info->lti_big_buf;
3889         struct lfsck_instance *lfsck = com->lc_lfsck;
3890         struct dt_device *dev;
3891         struct lu_device *d = &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3892         struct lu_object *o;
3893         struct lu_object *n;
3894         struct dt_object *child = NULL;
3895         struct thandle *handle = NULL;
3896         struct lov_mds_md_v1 *lmm;
3897         struct lov_ost_data_v1 *objs;
             /* Captured now because @parent is re-bound before the 2nd
              * transaction; the log message uses this FID.
              */
3898         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3899         struct lu_fid tfid;
3900         struct lustre_handle lh = { 0 };
3901         __u32 magic;
3902         __u32 index;
3903         int rc;
3904
3905         ENTRY;
3906         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
3907                 RETURN(0);
3908
3909         /* We use two separate transactions to repair the inconsistency.
3910          *
3911          * 1) create the child (OST-object).
3912          * 2) update the parent LOV EA according to the child's FID.
3913          *
3914          * If 1) succeeds, but 2) fails or is aborted, then such OST-object
3915          * will be handled as orphan when the layout LFSCK runs next time.
3916          *
3917          * If 1) fails, but 2) succeeds, then such OST-object will be re-created
3918          * as dangling referenced case when the layout LFSCK runs next time.
3919          */
3920
3921         /* The 1st transaction. */
             /* Obtain a new object from the child's device and locate the
              * slice that belongs to that device type.
              */
3922         o = lu_object_anon(env, d, NULL);
3923         if (IS_ERR(o))
3924                 GOTO(log, rc = PTR_ERR(o));
3925
3926         n = lu_object_locate(o->lo_header, d->ld_type);
3927         if (unlikely(n == NULL)) {
                     /* 'child' is still NULL here, so the reference has to be
                      * dropped explicitly before leaving.
                      */
3928                 lu_object_put_nocache(env, o);
3929
3930                 GOTO(log, rc = -EINVAL);
3931         }
3932
3933         child = container_of(n, struct dt_object, do_lu);
3934         memset(hint, 0, sizeof(*hint));
             /* The new OST-object is created with the parent's owner. */
3935         rc = dt_attr_get(env, parent, la);
3936         if (rc != 0)
3937                 GOTO(log, rc);
3938
3939         la->la_valid = LA_UID | LA_GID;
3940         memset(dof, 0, sizeof(*dof));
3941
3942         dev = lfsck_obj2dev(child);
3943         handle = lfsck_trans_create(env, dev, lfsck);
3944         if (IS_ERR(handle))
3945                 GOTO(log, rc = PTR_ERR(handle));
3946
3947         rc = dt_declare_create(env, child, la, hint, dof, handle);
3948         if (rc != 0)
3949                 GOTO(stop, rc);
3950
3951         rc = dt_trans_start_local(env, dev, handle);
3952         if (rc != 0)
3953                 GOTO(stop, rc);
3954
3955         rc = dt_create(env, child, la, hint, dof, handle);
             /* The 1st transaction ends here regardless of rc; the handle is
              * cleared so that the shared 'stop' label cannot stop it twice.
              */
3956         dt_trans_stop(env, dev, handle);
3957         handle = NULL;
3958         if (rc != 0)
3959                 GOTO(log, rc);
3960
3961         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3962                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3963                               LCK_EX);
3964         if (rc != 0)
3965                 GOTO(log, rc);
3966
3967         /* The 2nd transaction. */
3968
3969         /* XXX: Generally, we should use bottom device (OSD) to update parent
3970          *      LOV EA. But because the LOD-object still references the wrong
3971          *      OSP-object that should be detached after the parent's LOV EA
3972          *      refreshed. Unfortunately, there is no suitable API for that.
3973          *      So we have to make the LOD to re-load the OSP-object(s) via
3974          *      replacing the LOV EA against the LOD-object.
3975          *
3976          *      Once the DNE2 patches have been landed, we can replace the
3977          *      LOD device with the OSD device. LU-6230.
3978          */
3979
             /* From here on 'dev' and 'parent' refer to lfsck->li_next. */
3980         dev = lfsck->li_next;
3981         parent = lfsck_object_locate(dev, parent);
3982         if (IS_ERR(parent))
3983                 GOTO(log, rc = PTR_ERR(parent));
3984
3985         handle = lfsck_trans_create(env, dev, lfsck);
3986         if (IS_ERR(handle))
3987                 GOTO(log, rc = PTR_ERR(handle));
3988
             /* NOTE(review): 'buf' is the per-thread big buffer and is only
              * refilled with the parent's LOV EA after the transaction has
              * started, so this declaration sees whatever the buffer holds at
              * this point - confirm that is sufficient for the declare.
              */
3989         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3990                                   LU_XATTR_REPLACE, handle);
3991         if (rc != 0)
3992                 GOTO(stop, rc);
3993
3994         rc = dt_trans_start_local(env, dev, handle);
3995         if (rc != 0)
3996                 GOTO(stop, rc);
3997
3998         dt_write_lock(env, parent, 0);
3999         if (unlikely(lfsck_is_dead_obj(parent)))
4000                 GOTO(unlock, rc = 0);
4001
             /* Re-read the LOV EA under the lock; a parent without LOV EA
              * (0 or -ENODATA) needs no repair.
              */
4002         rc = lfsck_layout_get_lovea(env, parent, buf);
4003         if (unlikely(rc == -ENODATA))
4004                 rc = 0;
4005         if (rc <= 0)
4006                 GOTO(unlock, rc);
4007
4008         lmm = buf->lb_buf;
4009         magic = le32_to_cpu(lmm->lmm_magic);
4010         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
                     /* Composite layout: find the component with id
                      * llr->llr_comp_id and continue with its embedded
                      * plain layout.
                      */
4011                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4012                 struct lov_comp_md_entry_v1 *lcme;
4013                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
4014                 int i;
4015
4016                 LASSERT(llr->llr_comp_id != 0);
4017
4018                 for (i = 0; i < count; i++) {
4019                         lcme = &lcm->lcm_entries[i];
4020                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
4021                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
4022                                         LCME_FL_INIT);
4023
                                     /* The composite header's generation is
                                      * bumped before the stripe is validated.
                                      */
4024                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
4025                                 lmm = buf->lb_buf +
4026                                         le32_to_cpu(lcme->lcme_offset);
4027                                 magic = le32_to_cpu(lmm->lmm_magic);
4028                                 goto set;
4029                         }
4030                 }
4031
                     /* No component carries the requested id any more. */
4032                 GOTO(unlock, rc = 0);
4033         }
4034
4035 set:
             /* V1 and V3 keep the stripe array at different offsets. */
4036         if (magic == LOV_MAGIC_V1) {
4037                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
4038         } else {
4039                 LASSERT(magic == LOV_MAGIC_V3);
4040                 objs =
4041                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
4042         }
4043
4044         ostid_le_to_cpu(&objs->l_ost_oi, oi);
4045         index = le32_to_cpu(objs->l_ost_idx);
4046         rc = ostid_to_fid(&tfid, oi, index);
4047         /* Someone changed layout during the LFSCK, no need to repair then.
              * Note that a failed ostid_to_fid() (rc != 0) does not stop the
              * repair: the slot is overwritten below in that case.
              */
4048         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
4049                 GOTO(unlock, rc = 0);
4050
             /* Point the slot at the newly created object and replace the
              * whole LOV EA.
              */
4051         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
4052         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
4053         ostid_cpu_to_le(oi, &objs->l_ost_oi);
4054         objs->l_ost_gen = cpu_to_le32(0);
4055         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
4056         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
4057                           LU_XATTR_REPLACE, handle);
4058
             /* A successful replacement is reported as 1. */
4059         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
4060
4061 unlock:
4062         dt_write_unlock(env, parent);
4063
4064 stop:
             /* handle is NULL once the 1st transaction has been stopped. */
4065         if (handle != NULL)
4066                 dt_trans_stop(env, dev, handle);
4067
4068 log:
             /* Reached also on paths that never took the ibits lock ('lh' is
              * still zeroed then); presumably unlocking an unused handle is a
              * no-op - confirm against lfsck_ibits_unlock().
              */
4069         lfsck_ibits_unlock(&lh, LCK_EX);
4070         if (child != NULL)
4071                 lfsck_object_put(env, child);
4072
4073         if (rc)
4074                 CDEBUG(D_LFSCK,
4075                        "%s: layout LFSCK assistant repaired multiple references for: parent "DFID", comp_id %u, OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
4076                        lfsck_lfsck2name(lfsck), PFID(pfid),
4077                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
4078                        la->la_uid, la->la_gid, rc);
4079
4080         return rc;
4081 }
4082
4083 /* If the MDT-object and the OST-object have different owner information,
4084  * then trust the MDT-object, because the normal chown/chgrp handle order
4085  * is from MDT to OST, and it is possible that some chown/chgrp operation
4086  * is partly done.
      *
      * The uid/gid found in @pla on entry are applied to llr->llr_child in one
      * local transaction on the child's device, with @parent read-locked.
      * @pla is refreshed from @parent under that lock; if the owner changed in
      * the meantime the child is left untouched. @cla (the child's attributes
      * as seen by the caller) is only used for the log message.
      *
      * Returns 0 in dry-run mode (LPF_DRYRUN) and a negative errno when the
      * transaction cannot be created. Once the transaction handle exists, the
      * result is whatever lfsck_layout_trans_stop() derives from the
      * accumulated rc (rc is 1 when the parent is dead or the repair is
      * skipped); that helper is not visible here - confirm its mapping.
4087  */
4088 static int lfsck_layout_repair_owner(const struct lu_env *env,
4089                                      struct lfsck_component *com,
4090                                      struct dt_object *parent,
4091                                      struct lfsck_layout_req *llr,
4092                                      struct lu_attr *pla,
4093                                      const struct lu_attr *cla)
4094 {
4095         struct lfsck_thread_info *info = lfsck_env_info(env);
4096         struct lu_attr *tla = &info->lti_la2;
4097         struct dt_object *child  = llr->llr_child;
4098         struct dt_device *dev = lfsck_obj2dev(child);
4099         struct thandle *handle;
4100         int rc;
4101         dt_obj_version_t version;
4102
4103         ENTRY;
4104         if (com->lc_lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
4105                 RETURN(0);
4106
             /* Snapshot the owner to apply; only uid/gid are marked valid. */
4107         tla->la_uid = pla->la_uid;
4108         tla->la_gid = pla->la_gid;
4109         tla->la_valid = LA_UID | LA_GID;
4110         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4111         if (IS_ERR(handle))
4112                 GOTO(log, rc = PTR_ERR(handle));
4113
4114         rc = dt_declare_attr_set(env, child, tla, handle);
4115         if (rc != 0)
4116                 GOTO(stop, rc);
4117
4118         rc = dt_trans_start_local(env, dev, handle);
4119         if (rc != 0)
4120                 GOTO(stop, rc);
4121
4122         /* Use the dt_object lock to serialize with destroy and attr_set. */
4123         dt_read_lock(env, parent, 0);
4124         if (unlikely(lfsck_is_dead_obj(parent)))
4125                 GOTO(unlock, rc = 1);
4126
             /* A device without version support is treated as version 0. */
4127         version = dt_version_get(env, child);
4128         if (version == -EOPNOTSUPP)
4129                 version = 0;
4130
4131         /* Get the latest parent's owner. */
4132         rc = dt_attr_get(env, parent, pla);
4133         if (rc != 0)
4134                 GOTO(unlock, rc);
4135
4136         /* Some others chown/chgrp during the LFSCK, needs to do nothing.
              *
              * NOTE(review): tla->la_ctime is tested here but never assigned in
              * this function (tla is the per-thread info->lti_la2), so it holds
              * whatever an earlier user left there - confirm whether the
              * child's ctime (@cla) was intended.
              */
4137         if (unlikely((!version && tla->la_ctime == 0) ||
4138                      tla->la_uid != pla->la_uid || tla->la_gid != pla->la_gid))
4139                 rc = 1;
4140         else
4141                 rc = dt_attr_set(env, child, tla, handle);
4142
4143         GOTO(unlock, rc);
4144
4145 unlock:
4146         dt_read_unlock(env, parent);
4147
4148 stop:
             /* The helper stops the transaction and produces the final rc. */
4149         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4150
4151 log:
             /* "old owner" is the child's owner seen by the caller (@cla),
              * "new owner" is the value this function tried to apply.
              */
4152         if (rc != 0)
4153                 CDEBUG(D_LFSCK,
4154                        "%s: layout LFSCK assistant repaired inconsistent file owner for: parent "DFID", child "DFID", OST-index %u, stripe-index %u, old owner %u/%u, new owner %u/%u: rc = %d\n",
4155                        lfsck_lfsck2name(com->lc_lfsck),
4156                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4157                        llr->llr_ost_idx, llr->llr_lov_idx,
4158                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
4159
4160         return rc;
4161 }
4162
     /* Emit a D_LFSCK debug message explaining why an OST-object and the parent
      * it points back to are treated as an unmatched pair. The message carries
      * the LFSCK instance name, lso->lso_fid, @pfid, @cfid, XATTR_NAME_FID and
      * the free-form reason @msg.
      *
      * NOTE(review): the expansion already ends with ';' and dereferences @lso
      * without parentheses, so use it only as a complete statement inside
      * braces and pass a plain pointer identifier for @lso.
      */
4163 #define CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid, msg)               \
4164         CDEBUG(D_LFSCK, "%s:("DFID"|"DFID")/"DFID":XATTR %s: %s\n",      \
4165                lfsck_lfsck2name(lfsck), PFID(&lso->lso_fid), PFID(pfid), \
4166                PFID(cfid), XATTR_NAME_FID, msg);
4167
4168 /* Check whether the OST-object correctly back points to the
4169  * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid).
      *
      * @ff is the OST-object's filter_fid (already in CPU byte order as far as
      * the field accesses here show), @cfid is the OST-object's FID, @lso
      * describes the MDT-object that references it and @llr carries the
      * expected stripe index. @cla is not used by this function.
      *
      * Returns:
      *   0                         the back pointer names lso->lso_fid with the
      *                             expected stripe index;
      *   LLIT_UNMATCHED_PAIR       the back pointer is invalid, has the wrong
      *                             stripe index, or names an object that is
      *                             missing, dead, not a regular file, or whose
      *                             layout does not contain @cfid;
      *   LLIT_MULTIPLE_REFERENCED  the other object named by the back pointer
      *                             exists and its layout does contain @cfid;
      *   LLIT_NONE                 loading the other object's LOV EA returned
      *                             -EOPNOTSUPP;
      *   negative errno            on other failures.
      * The per-thread info->lti_fid, lti_fid2, lti_oi and lti_big_buf are used
      * as scratch space.
4170  */
4171 static int lfsck_layout_check_parent(const struct lu_env *env,
4172                                      struct lfsck_component *com,
4173                                      struct lfsck_assistant_object *lso,
4174                                      struct filter_fid *ff,
4175                                      const struct lu_fid *cfid,
4176                                      const struct lu_attr *cla,
4177                                      struct lfsck_layout_req *llr)
4178 {
4179         struct lfsck_thread_info *info = lfsck_env_info(env);
4180         struct lu_buf *buf = &info->lti_big_buf;
4181         struct lu_fid *pfid   = &info->lti_fid;
4182         struct dt_object *tobj;
4183         struct lov_mds_md_v1 *lmm;
4184         struct lov_ost_data_v1 *objs;
4185         struct lustre_handle lh = { 0 };
4186         struct lfsck_instance *lfsck  = com->lc_lfsck;
4187         int rc;
4188         int i;
4189         __u32 magic;
4190         __u32 idx;
4191         __u16 count;
4192
4193         ENTRY;
             /* Take a private copy of the back pointer, remember its stripe
              * index, then clear f_ver so that the copy can be compared as a
              * FID (presumably f_stripe_idx shares storage with f_ver -
              * confirm against the lu_fid definition).
              */
4194         *pfid = ff->ff_parent;
4195         idx = pfid->f_stripe_idx;
4196         pfid->f_ver = 0;
4197
4198         if (unlikely(!fid_is_sane(pfid))) {
4199                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4200                                       "the parent FID is invalid");
4201
4202                 RETURN(LLIT_UNMATCHED_PAIR);
4203         }
4204
             /* The back pointer names the expected MDT-object: only the stripe
              * index remains to be verified.
              */
4205         if (lu_fid_eq(pfid, &lso->lso_fid)) {
4206                 if (likely(llr->llr_lov_idx == idx))
4207                         RETURN(0);
4208
4209                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4210                                       "the stripe index is unmatched");
4211
4212                 RETURN(LLIT_UNMATCHED_PAIR);
4213         }
4214
             /* The back pointer names a different object: inspect it. Every
              * exit from here on goes through 'out' to drop this reference.
              */
4215         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4216         if (IS_ERR(tobj))
4217                 RETURN(PTR_ERR(tobj));
4218
4219         if (dt_object_exists(tobj) == 0) {
4220                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4221                                       "the parent is nonexistent");
4222
4223                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4224         }
4225
4226         if (lfsck_is_dead_obj(tobj)) {
4227                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4228                                       "the parent is dead object");
4229
4230                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4231         }
4232
4233         if (!S_ISREG(lfsck_object_type(tobj))) {
4234                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4235                                       "the parent is not a regular file");
4236
4237                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4238         }
4239
4240         /* Load the tobj's layout EA, regardless of whether it is a local
4241          * MDT-object or a remote one on another MDT. Then check whether the
4242          * given OST-object is in such layout. If yes, it is the multiple
4243          * referenced case, otherwise it is the unmatched referenced case.
4244          */
4245         rc = lfsck_layout_get_lovea(env, tobj, buf);
4246         if (rc == 0 || rc == -ENODATA || rc == -ENOENT) {
4247                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4248                                       "the parent has no stripe data");
4249
4250                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4251         }
4252
4253         if (unlikely(rc == -EOPNOTSUPP))
4254                 GOTO(out, rc = LLIT_NONE);
4255
4256         if (rc < 0)
4257                 GOTO(out, rc);
4258
4259         lmm = buf->lb_buf;
4260         magic = le32_to_cpu(lmm->lmm_magic);
4261         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
                     /* Composite layout: select the component named by the
                      * filter_fid's comp_id and continue with its embedded
                      * plain layout.
                      */
4262                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4263                 struct lov_comp_md_entry_v1 *lcme;
4264
4265                 if (ff->ff_layout.ol_comp_id == 0) {
4266                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4267                                               "the parent has incorrect comp_id");
4268
4269                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4270                 }
4271
4272                 count = le16_to_cpu(lcm->lcm_entry_count);
4273                 for (i = 0; i < count; i++) {
4274                         lcme = &lcm->lcm_entries[i];
4275                         if (le32_to_cpu(lcme->lcme_id) ==
4276                             ff->ff_layout.ol_comp_id) {
4277                                 lmm = buf->lb_buf +
4278                                         le32_to_cpu(lcme->lcme_offset);
4279                                 magic = le32_to_cpu(lmm->lmm_magic);
4280                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4281                                       LCME_FL_INIT)) {
4282                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4283                                                               cfid,
4284                                                               "the parent has uninitialized component");
4285
4286                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4287                                 }
4288
4289                                 goto further;
4290                         }
4291                 }
4292
4293                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4294                                       "the parent has no matched comp_id");
4295
4296                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4297         }
4298
4299 further:
             /* V1 and V3 keep the stripe array at different offsets. */
4300         if (magic == LOV_MAGIC_V1) {
4301                 objs = &lmm->lmm_objects[0];
4302         } else {
4303                 LASSERT(magic == LOV_MAGIC_V3);
4304                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4305         }
4306
             /* Walk every stripe slot looking for one that references @cfid. */
4307         count = le16_to_cpu(lmm->lmm_stripe_count);
4308         for (i = 0; i < count; i++, objs++) {
4309                 struct lu_fid           *tfid   = &info->lti_fid2;
4310                 struct ost_id           *oi     = &info->lti_oi;
4311                 __u32                    idx2;
4312
4313                 if (lovea_slot_is_dummy(objs))
4314                         continue;
4315
4316                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4317                 idx2 = le32_to_cpu(objs->l_ost_idx);
4318                 rc = ostid_to_fid(tfid, oi, idx2);
4319                 if (rc != 0) {
4320                         CDEBUG(D_LFSCK,
4321                                "%s: the parent "DFID" contains invalid layout EA at the slot %d, index %u\n",
4322                                lfsck_lfsck2name(com->lc_lfsck),
4323                                PFID(pfid), i, idx2);
4324
4325                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4326                 }
4327
4328                 if (lu_fid_eq(cfid, tfid)) {
                             /* The other object references @cfid as well; take
                              * the locks and verify that it still exists before
                              * reporting a multiple reference.
                              */
4329                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4330                                               MDS_INODELOCK_UPDATE |
4331                                               MDS_INODELOCK_LAYOUT |
4332                                               MDS_INODELOCK_XATTR,
4333                                               LCK_EX);
4334                         if (rc != 0)
4335                                 GOTO(out, rc);
4336
4337                         dt_read_lock(env, tobj, 0);
4338
4339                         /* For local MDT-object, re-check existence
4340                          * after taken the lock.
4341                          */
4342                         if (!dt_object_remote(tobj)) {
4343                                 if (dt_object_exists(tobj) == 0 ||
4344                                     lfsck_is_dead_obj(tobj)) {
4345                                         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid,
4346                                                               cfid,
4347                                                               "the parent doesn't exist anymore after lock");
4348
4349                                         rc = LLIT_UNMATCHED_PAIR;
4350                                 } else {
4351                                         rc = LLIT_MULTIPLE_REFERENCED;
4352                                 }
4353
4354                                 GOTO(unlock, rc);
4355                         }
4356
4357                         /* For migration case, the new MDT-object and old
4358                          * MDT-object may reference the same OST-object at
4359                          * some migration internal time.
4360                          *
4361                          * For remote MDT-object, the local MDT may not know
4362                          * whether it has been removed or not.  Try checking
4363                          * for a non-existent xattr to check if this object
4364                          * has been removed or not.
                              *
                              * -ENODATA means the object answered and therefore
                              * exists; -ENOENT or a non-negative result is
                              * treated as nonexistent; any other error is
                              * returned to the caller unchanged.
4365                          */
4366                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4367                                           XATTR_NAME_DUMMY);
4368                         if (unlikely(rc == -ENOENT || rc >= 0)) {
4369                                 CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4370                                                       "the parent is remote object and nonexistent after lock");
4371
4372                                 rc = LLIT_UNMATCHED_PAIR;
4373                         } else if (rc == -ENODATA) {
4374                                 rc = LLIT_MULTIPLE_REFERENCED;
4375                         }
4376
4377                         GOTO(unlock, rc);
4378                 }
4379         }
4380
4381         CDEBUG_UNMATCHED_PAIR(lfsck, lso, pfid, cfid,
4382                               "the parent has no matched stripe");
4383
4384         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4385
4386 unlock:
             /* The dt read lock is only held when the ibits lock was granted,
              * so both are released together.
              */
4387         if (lustre_handle_is_used(&lh)) {
4388                 dt_read_unlock(env, tobj);
4389                 lfsck_ibits_unlock(&lh, LCK_EX);
4390         }
4391
4392 out:
4393         lfsck_object_put(env, tobj);
4394
4395         return rc;
4396 }
4397
4398 /*
4399  * If the MDT-object has the LUSTRE_ENCRYPT_FL flag, it needs to be set
4400  * on the OST-object as well.
      *
      * An attr_set with la_valid = LA_FLAGS and la_flags = LUSTRE_ENCRYPT_FL is
      * applied to llr->llr_child in one local transaction on the child's
      * device, with @parent read-locked. This function does not test
      * LPF_DRYRUN itself, so the caller must not invoke it in dry-run mode.
      *
      * Returns a negative errno when the transaction cannot be created.
      * Otherwise the result is whatever lfsck_layout_trans_stop() derives from
      * the accumulated rc (rc is 1 when the parent is dead); that helper is
      * not visible here - confirm its mapping. Any non-zero result is logged
      * through CDEBUG(D_LFSCK).
4401  */
4402 static int lfsck_layout_repair_encflag(const struct lu_env *env,
4403                                        struct lfsck_component *com,
4404                                        struct dt_object *parent,
4405                                        struct lfsck_layout_req *llr)
4406 {
4407         struct lfsck_thread_info *info = lfsck_env_info(env);
4408         struct lu_attr *tla = &info->lti_la2;
4409         struct dt_object *child = llr->llr_child;
4410         struct dt_device *dev = lfsck_obj2dev(child);
4411         struct thandle *handle;
4412         int rc;
4413
4414         ENTRY;
4415
             /* NOTE(review): la_flags carries only LUSTRE_ENCRYPT_FL; whether
              * the attr_set merges it with or replaces the child's existing
              * flags depends on the underlying device - confirm.
              */
4416         tla->la_valid = LA_FLAGS;
4417         tla->la_flags = LUSTRE_ENCRYPT_FL;
4418         handle = lfsck_trans_create(env, dev, com->lc_lfsck);
4419         if (IS_ERR(handle))
4420                 GOTO(log, rc = PTR_ERR(handle));
4421
4422         rc = dt_declare_attr_set(env, child, tla, handle);
4423         if (rc != 0)
4424                 GOTO(stop, rc);
4425
4426         rc = dt_trans_start_local(env, dev, handle);
4427         if (rc != 0)
4428                 GOTO(stop, rc);
4429
4430         /* Use the dt_object lock to serialize with destroy and attr_set. */
4431         dt_read_lock(env, parent, 0);
4432         if (unlikely(lfsck_is_dead_obj(parent)))
4433                 GOTO(unlock, rc = 1);
4434
4435         rc = dt_attr_set(env, child, tla, handle);
4436         GOTO(unlock, rc);
4437
4438 unlock:
4439         dt_read_unlock(env, parent);
4440
4441 stop:
             /* The helper stops the transaction and produces the final rc. */
4442         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
4443
4444 log:
4445         if (rc != 0)
4446                 CDEBUG(D_LFSCK,
4447                        "%s: layout LFSCK assistant repair of inconsistent file enc flag for: parent "DFID", child "DFID", OST-index %u, stripe-index %u: rc = %d\n",
4448                        lfsck_lfsck2name(com->lc_lfsck),
4449                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
4450                        llr->llr_ost_idx, llr->llr_lov_idx, rc);
4451
4452         return rc;
4453 }
4454
4455 static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
4456                                              struct lfsck_component *com,
4457                                              struct lfsck_assistant_req *lar)
4458 {
4459         struct lfsck_layout_req *llr = container_of(lar,
4460                                                     struct lfsck_layout_req,
4461                                                     llr_lar);
4462         struct lfsck_assistant_object *lso = lar->lar_parent;
4463         struct lfsck_layout *lo = com->lc_file_ram;
4464         struct lfsck_thread_info *info = lfsck_env_info(env);
4465         struct filter_fid *ff = &info->lti_ff;
4466         struct lu_buf buf = { .lb_buf = ff, .lb_len = sizeof(*ff) };
4467         struct dt_object *parent = NULL;
4468         struct dt_object *child  = llr->llr_child;
4469         struct lu_attr *pla = &lso->lso_attr;
4470         struct lu_attr *cla = &info->lti_la;
4471         struct lfsck_instance *lfsck = com->lc_lfsck;
4472         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4473         enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
4474         int rc;
4475
4476         ENTRY;
4477         if (lso->lso_dead)
4478                 RETURN(0);
4479
4480         CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ENGINE_DELAY, cfs_fail_val);
4481
4482         rc = dt_attr_get(env, child, cla);
4483         if (rc == -ENOENT) {
4484                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4485                 if (IS_ERR(parent)) {
4486                         rc = PTR_ERR(parent);
4487
4488                         RETURN(rc == -ENOENT ? 0 : rc);
4489                 }
4490
4491                 type = LLIT_DANGLING;
4492                 goto repair;
4493         }
4494
4495         if (rc != 0)
4496                 GOTO(out, rc);
4497
4498         if (!(bk->lb_param & LPF_DRYRUN) &&
4499             pla->la_valid & LA_FLAGS && pla->la_flags & LUSTRE_ENCRYPT_FL) {
4500                 /* MDT-inode is encrypted */
4501                 struct lu_buf lb = { .lb_buf = NULL, .lb_len = 0 };
4502
4503                 /* if OST-inode is missing encryption.c xattr, fix it */
4504                 if (dt_xattr_get(env, child, &lb,
4505                                  LL_XATTR_NAME_ENCRYPTION_CONTEXT) >= 0)
4506                         goto check_fid;
4507
4508                 if (parent == NULL)
4509                         parent = lfsck_assistant_object_load(env, lfsck, lso);
4510                 if (!IS_ERR_OR_NULL(parent))
4511                         rc = lfsck_layout_repair_encflag(env, com, parent, llr);
4512                 down_write(&com->lc_sem);
4513                 if (rc < 0)
4514                         lfsck_layout_record_failure(env, lfsck, lo);
4515                 else if (rc > 0)
4516                         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
4517                 up_write(&com->lc_sem);
4518         }
4519
4520 check_fid:
4521         lfsck_buf_init(&buf, ff, sizeof(*ff));
4522         rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
4523         if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
4524                 CDEBUG(D_LFSCK,
4525                        "%s:"DFID"/"DFID": the child object's %s is corrupted\n",
4526                        lfsck_lfsck2name(lfsck), PFID(&lso->lso_fid),
4527                        PFID(lu_object_fid(&child->do_lu)),
4528                        XATTR_NAME_FID);
4529
4530                 type = LLIT_UNMATCHED_PAIR;
4531                 goto repair;
4532         }
4533
4534         if (rc < 0 && rc != -ENODATA)
4535                 GOTO(out, rc);
4536
4537         if (rc == 0 || rc == -ENODATA)
4538                 GOTO(check_owner, rc = 0);
4539
4540         filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
4541         rc = lfsck_layout_check_parent(env, com, lso, ff,
4542                                        lu_object_fid(&child->do_lu), cla, llr);
4543         if (rc > 0) {
4544                 type = rc;
4545                 goto repair;
4546         }
4547
4548         if (rc < 0)
4549                 GOTO(out, rc);
4550
4551 check_owner:
4552         /* Someone may has changed the owner after the parent attr pre-loaded.
4553          * It can be handled later inside the lfsck_layout_repair_owner().
4554          */
4555         if (unlikely(cla->la_uid != pla->la_uid ||
4556                      cla->la_gid != pla->la_gid)) {
4557                 type = LLIT_INCONSISTENT_OWNER;
4558                 goto repair;
4559         }
4560
4561 repair:
4562         if (type == LLIT_NONE)
4563                 GOTO(out, rc = 0);
4564
4565         if (bk->lb_param & LPF_DRYRUN)
4566                 GOTO(out, rc = 1);
4567
4568         if (parent == NULL) {
4569                 parent = lfsck_assistant_object_load(env, lfsck, lso);
4570                 if (IS_ERR(parent)) {
4571                         rc = PTR_ERR(parent);
4572
4573                         if (rc == -ENOENT)
4574                                 RETURN(0);
4575
4576                         GOTO(out, rc);
4577                 }
4578         }
4579
4580         switch (type) {
4581         case LLIT_DANGLING:
4582                 if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
4583                         rc = lfsck_layout_ins_dangling_rec(env, com,
4584                                 lfsck_dto2fid(parent), lfsck_dto2fid(child),
4585                                 llr->llr_comp_id, llr->llr_lov_idx,
4586                                 llr->llr_ost_idx);
4587                 else
4588                         rc = __lfsck_layout_repair_dangling(env, com, parent,
4589                                                             llr->llr_child,
4590                                                             llr->llr_comp_id,
4591                                                             llr->llr_lov_idx,
4592                                                             llr->llr_ost_idx,
4593                                                             true);
4594                 break;
4595         case LLIT_UNMATCHED_PAIR:
4596                 rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
4597                                                         llr, pla);
4598                 break;
4599         case LLIT_MULTIPLE_REFERENCED:
4600                 rc = lfsck_layout_repair_multiple_references(env, com, parent,
4601                                                              llr, pla);
4602                 break;
4603         case LLIT_INCONSISTENT_OWNER:
4604                 rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
4605                 break;
4606         default:
4607                 rc = 0;
4608                 break;
4609         }
4610
4611         GOTO(out, rc);
4612
4613 out:
4614         down_write(&com->lc_sem);
4615         if (rc < 0) {
4616                 struct lfsck_assistant_data *lad = com->lc_data;
4617
4618                 if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags))) {
4619                         rc = 0;
4620                 } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
4621                            rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
4622                            rc == -EHOSTUNREACH) {
4623                         /* cannot touch target server? mark LFSCK INCOMPLETE */
4624                         CDEBUG(D_LFSCK,
4625                                "%s: layout LFSCK assistant fail to talk with OST %x: rc = %d\n",
4626                                lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
4627                         lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
4628                         lo->ll_objs_skipped++;
4629                         rc = 0;
4630                 } else {
4631                         lfsck_layout_record_failure(env, lfsck, lo);
4632                 }
4633         } else if (rc > 0 && (type != LLIT_DANGLING ||
4634                               !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
4635                 LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
4636                          "unknown type = %d\n", type);
4637
4638                 lo->ll_objs_repaired[type - 1]++;
4639                 if (bk->lb_param & LPF_DRYRUN &&
4640                     unlikely(lo->ll_pos_first_inconsistent == 0))
4641                         lo->ll_pos_first_inconsistent =
4642                         lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
4643                                                         lfsck->li_di_oit);
4644         }
4645         up_write(&com->lc_sem);
4646
4647         if (parent != NULL && !IS_ERR(parent))
4648                 lfsck_object_put(env, parent);
4649
4650         return rc;
4651 }
4652
4653 static int
4654 lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
4655                                         struct lfsck_component *com,
4656                                         struct dt_object *obj, bool first)
4657 {
4658         struct lfsck_instance *lfsck = com->lc_lfsck;
4659         struct ptlrpc_thread *thread = &lfsck->li_thread;
4660         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4661         struct lfsck_layout *lo = com->lc_file_ram;
4662         const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
4663         struct dt_it *di;
4664         struct dt_key *key;
4665         struct lfsck_layout_dangling_key *parent =
4666                                         &lfsck_env_info(env)->lti_lldk;
4667         struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
4668         __u32 ost_idx;
4669         int rc;
4670
4671         ENTRY;
4672         di = iops->init(env, obj, 0);
4673         if (IS_ERR(di))
4674                 RETURN(PTR_ERR(di));
4675
4676         if (first)
4677                 lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
4678         else
4679                 memset(parent, 0, sizeof(*parent));
4680         rc = iops->get(env, di, (const struct dt_key *)parent);
4681         if (rc < 0)
4682                 GOTO(fini, rc);
4683
4684         if (first) {
4685                 /* The start one either has been processed or does not exist,
4686                  * skip it.
4687                  */
4688                 rc = iops->next(env, di);
4689                 if (rc != 0)
4690                         GOTO(put, rc);
4691         }
4692
4693         do {
4694                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
4695                     unlikely(!thread_is_running(thread)))
4696                         GOTO(put, rc = 0);
4697
4698                 key = iops->key(env, di);
4699                 if (IS_ERR(key)) {
4700                         rc = PTR_ERR(key);
4701                         if (rc == -ENOENT)
4702                                 GOTO(put, rc = 1);
4703
4704                         goto checkpoint;
4705                 }
4706
4707                 lldk_be_to_cpu(parent,
4708                                 (const struct lfsck_layout_dangling_key *)key);
4709                 if (!fid_is_sane(&parent->lldk_fid)) {
4710                         rc = 0;
4711                         goto checkpoint;
4712                 }
4713
4714                 rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
4715                 if (rc == 0) {
4716                         fid_be_to_cpu(cfid, cfid);
4717                         ost_idx = cfid->f_ver;
4718                         cfid->f_ver = 0;
4719                         if (!fid_is_sane(cfid)) {
4720                                 rc = 0;
4721                                 goto checkpoint;
4722                         }
4723
4724                         rc = lfsck_layout_repair_dangling(env, com,
4725                                         &parent->lldk_fid, cfid,
4726                                         parent->lldk_comp_id,
4727                                         parent->lldk_ea_off, ost_idx);
4728                 }
4729
4730 checkpoint:
4731                 down_write(&com->lc_sem);
4732                 com->lc_new_checked++;
4733                 com->lc_new_scanned++;
4734                 if (rc >= 0)
4735                         lo->ll_lldk_latest_scanned_phase2 = *parent;
4736
4737                 if (rc > 0)
4738                         lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
4739                 else if (rc < 0)
4740                         lo->ll_objs_failed_phase2++;
4741                 up_write(&com->lc_sem);
4742
4743                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
4744                         GOTO(put, rc);
4745
4746                 if (unlikely(com->lc_time_next_checkpoint <=
4747                              ktime_get_seconds()) &&
4748                     com->lc_new_checked != 0) {
4749                         down_write(&com->lc_sem);
4750                         lo->ll_run_time_phase2 += ktime_get_seconds() -
4751                                                   com->lc_time_last_checkpoint;
4752                         lo->ll_time_last_checkpoint = ktime_get_real_seconds();
4753                         lo->ll_objs_checked_phase2 += com->lc_new_checked;
4754                         com->lc_new_checked = 0;
4755                         lfsck_layout_store(env, com);
4756                         up_write(&com->lc_sem);
4757
4758                         com->lc_time_last_checkpoint = ktime_get_seconds();
4759                         com->lc_time_next_checkpoint =
4760                                 com->lc_time_last_checkpoint +
4761                                 LFSCK_CHECKPOINT_INTERVAL;
4762                 }
4763
4764                 lfsck_control_speed_by_self(com);
4765                 if (unlikely(!thread_is_running(thread)))
4766                         GOTO(put, rc = 0);
4767
4768                 rc = iops->next(env, di);
4769         } while (rc == 0);
4770
4771         GOTO(put, rc);
4772
4773 put:
4774         iops->put(env, di);
4775
4776 fini:
4777         iops->fini(env, di);
4778
4779         return rc;
4780 }
4781
4782 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4783                                              struct lfsck_component *com)
4784 {
4785         struct lfsck_assistant_data *lad = com->lc_data;
4786         struct lfsck_instance *lfsck = com->lc_lfsck;
4787         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
4788         struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs;
4789         struct lfsck_tgt_desc *ltd;
4790         int rc = 0;
4791
4792         ENTRY;
4793         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4794                lfsck_lfsck2name(lfsck));
4795
4796         spin_lock(&ltds->ltd_lock);
4797         while (!list_empty(&lad->lad_ost_phase2_list)) {
4798                 ltd = list_first_entry(&lad->lad_ost_phase2_list,
4799                                        struct lfsck_tgt_desc,
4800                                        ltd_layout_phase_list);
4801                 list_del_init(&ltd->ltd_layout_phase_list);
4802                 if (bk->lb_param & LPF_OST_ORPHAN) {
4803                         spin_unlock(&ltds->ltd_lock);
4804                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4805                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4806                                 RETURN(rc);
4807
4808                         if (unlikely(test_bit(LAD_EXIT, &lad->lad_flags) ||
4809                                      !thread_is_running(&lfsck->li_thread)))
4810                                 RETURN(0);
4811                         spin_lock(&ltds->ltd_lock);
4812                 }
4813         }
4814
4815         if (list_empty(&lad->lad_ost_phase1_list))
4816                 rc = 1;
4817         else
4818                 rc = 0;
4819         spin_unlock(&ltds->ltd_lock);
4820
4821         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4822                 struct lfsck_layout *lo = com->lc_file_ram;
4823                 int i;
4824
4825                 com->lc_new_checked = 0;
4826                 com->lc_new_scanned = 0;
4827                 com->lc_time_last_checkpoint = ktime_get_seconds();
4828                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4829                                                LFSCK_CHECKPOINT_INTERVAL;
4830
4831                 i = lfsck_sub_trace_file_fid2idx(
4832                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4833                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4834                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4835                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4836                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4837                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4838
4839                 CDEBUG(D_LFSCK,
4840                        "%s: layout LFSCK phase2 scan dangling stop at the No. %d trace file: rc = %d\n",
4841                        lfsck_lfsck2name(lfsck), i, rc);
4842         }
4843
4844         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4845                lfsck_lfsck2name(lfsck), rc);
4846
4847         RETURN(rc);
4848 }
4849
4850 static int
4851 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4852                                    struct ptlrpc_request *req,
4853                                    void *args, int rc)
4854 {
4855         struct lfsck_layout_slave_async_args *llsaa = args;
4856         struct obd_export *exp = llsaa->llsaa_exp;
4857         struct lfsck_component *com = llsaa->llsaa_com;
4858         struct lfsck_layout_slave_target *llst = llsaa->llsaa_llst;
4859         struct lfsck_layout_slave_data *llsd = com->lc_data;
4860         struct lfsck_reply *lr = NULL;
4861         bool done = false;
4862
4863         if (rc != 0) {
4864                 /* It is probably caused by network trouble, or target crash,
4865                  * it will try several times (depends on the obd_timeout, and
4866                  * will not less than 3 times). But to make the LFSCK can go
4867                  * ahead, we should not try for ever. After some try but still
4868                  * hit failure, it will assume that the target exit the LFSCK
4869                  * prcoessing and stop try.
4870                  */
4871                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4872                         int max_try = max_t(int, obd_timeout / 30, 3);
4873
4874                         if (++(llst->llst_failures) > max_try)
4875                                 done = true;
4876                 } else {
4877                         done = true;
4878                 }
4879         } else {
4880                 llst->llst_failures = 0;
4881                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4882                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4883                     lr->lr_status != LS_SCANNING_PHASE2)
4884                         done = true;
4885         }
4886
4887         if (done) {
4888                 CDEBUG(D_LFSCK,
4889                        "%s: layout LFSCK slave gets the MDT %x status %d, failures_try %d\n",
4890                        lfsck_lfsck2name(com->lc_lfsck), llst->llst_index,
4891                        lr != NULL ? lr->lr_status : rc, llst->llst_failures);
4892
4893                 lfsck_layout_llst_del(llsd, llst);
4894         }
4895
4896         kref_put(&llst->llst_ref, lfsck_layout_llst_put);
4897         lfsck_component_put(env, com);
4898         class_export_put(exp);
4899
4900         return 0;
4901 }
4902
4903 static int lfsck_layout_async_query(const struct lu_env *env,
4904                                     struct lfsck_component *com,
4905                                     struct obd_export *exp,
4906                                     struct lfsck_layout_slave_target *llst,
4907                                     struct lfsck_request *lr,
4908                                     struct ptlrpc_request_set *set)
4909 {
4910         struct lfsck_layout_slave_async_args *llsaa;
4911         struct ptlrpc_request *req;
4912         struct lfsck_request *tmp;
4913         int rc;
4914
4915         ENTRY;
4916         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4917         if (req == NULL)
4918                 RETURN(-ENOMEM);
4919
4920         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4921         if (rc != 0) {
4922                 ptlrpc_request_free(req);
4923                 RETURN(rc);
4924         }
4925
4926         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4927         *tmp = *lr;
4928         ptlrpc_request_set_replen(req);
4929
4930         llsaa = ptlrpc_req_async_args(llsaa, req);
4931         llsaa->llsaa_exp = exp;
4932         llsaa->llsaa_com = lfsck_component_get(com);
4933         llsaa->llsaa_llst = llst;
4934         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4935         req->rq_allow_intr = 1;
4936         req->rq_no_delay = 1;
4937         ptlrpc_set_add_req(set, req);
4938
4939         RETURN(0);
4940 }
4941
4942 static int lfsck_layout_async_notify(const struct lu_env *env,
4943                                      struct obd_export *exp,
4944                                      struct lfsck_request *lr,
4945                                      struct ptlrpc_request_set *set)
4946 {
4947         struct ptlrpc_request *req;
4948         struct lfsck_request *tmp;
4949         int rc;
4950
4951         ENTRY;
4952         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4953         if (req == NULL)
4954                 RETURN(-ENOMEM);
4955
4956         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4957         if (rc != 0) {
4958                 ptlrpc_request_free(req);
4959                 RETURN(rc);
4960         }
4961
4962         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4963         *tmp = *lr;
4964         ptlrpc_request_set_replen(req);
4965         req->rq_allow_intr = 1;
4966         req->rq_no_delay = 1;
4967         ptlrpc_set_add_req(set, req);
4968
4969         RETURN(0);
4970 }
4971
4972 static int
4973 lfsck_layout_slave_query_master(const struct lu_env *env,
4974                                 struct lfsck_component *com)
4975 {
4976         struct lfsck_request *lr = &lfsck_env_info(env)->lti_lr;
4977         struct lfsck_instance *lfsck = com->lc_lfsck;
4978         struct lfsck_layout_slave_data *llsd  = com->lc_data;
4979         struct lfsck_layout_slave_target *llst;
4980         struct obd_export *exp;
4981         struct ptlrpc_request_set *set;
4982         int rc = 0;
4983         int rc1 = 0;
4984
4985         ENTRY;
4986         set = ptlrpc_prep_set();
4987         if (set == NULL)
4988                 GOTO(log, rc = -ENOMEM);
4989
4990         memset(lr, 0, sizeof(*lr));
4991         lr->lr_event = LE_QUERY;
4992         lr->lr_active = LFSCK_TYPE_LAYOUT;
4993
4994         llsd->llsd_touch_gen++;
4995         spin_lock(&llsd->llsd_lock);
4996         while (!list_empty(&llsd->llsd_master_list)) {
4997                 llst = list_first_entry(&llsd->llsd_master_list,
4998                                         struct lfsck_layout_slave_target,
4999                                         llst_list);
5000                 if (llst->llst_gen == llsd->llsd_touch_gen)
5001                         break;
5002
5003                 llst->llst_gen = llsd->llsd_touch_gen;
5004                 list_move_tail(&llst->llst_list,
5005                                &llsd->llsd_master_list);
5006                 kref_get(&llst->llst_ref);
5007                 spin_unlock(&llsd->llsd_lock);
5008
5009                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
5010                                                llst->llst_index);
5011                 if (exp == NULL) {
5012                         lfsck_layout_llst_del(llsd, llst);
5013                         kref_put(&llst->llst_ref, lfsck_layout_llst_put);
5014                         spin_lock(&llsd->llsd_lock);
5015                         continue;
5016                 }
5017
5018                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
5019                 if (rc != 0) {
5020                         CDEBUG(D_LFSCK,
5021                                "%s: layout LFSCK slave fail to query %s for layout: rc = %d\n",
5022                                lfsck_lfsck2name(lfsck), exp->exp_obd->obd_name,
5023                                rc);
5024
5025                         rc1 = rc;
5026                         kref_put(&llst->llst_ref, lfsck_layout_llst_put);
5027                         class_export_put(exp);
5028                 }
5029                 spin_lock(&llsd->llsd_lock);
5030         }
5031         spin_unlock(&llsd->llsd_lock);
5032
5033         rc = ptlrpc_set_wait(env, set);
5034         ptlrpc_set_destroy(set);
5035
5036         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
5037
5038 log:
5039         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
5040                lfsck_lfsck2name(com->lc_lfsck), rc);
5041
5042         return rc;
5043 }
5044
5045 static void
5046 lfsck_layout_slave_notify_master(const struct lu_env *env,
5047                                  struct lfsck_component *com,
5048                                  enum lfsck_events event, int result)
5049 {
5050         struct lfsck_layout *lo = com->lc_file_ram;
5051         struct lfsck_instance *lfsck = com->lc_lfsck;
5052         struct lfsck_layout_slave_data *llsd = com->lc_data;
5053         struct lfsck_request *lr = &lfsck_env_info(env)->lti_lr;
5054         struct lfsck_layout_slave_target *llst;
5055         struct obd_export *exp;
5056         struct ptlrpc_request_set *set;
5057         int rc;
5058
5059         ENTRY;
5060         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
5061                lfsck_lfsck2name(com->lc_lfsck));
5062
5063         set = ptlrpc_prep_set();
5064         if (set == NULL)
5065                 RETURN_EXIT;
5066
5067         memset(lr, 0, sizeof(*lr));
5068         lr->lr_event = event;
5069         lr->lr_flags = LEF_FROM_OST;
5070         lr->lr_status = result;
5071         lr->lr_index = lfsck_dev_idx(lfsck);
5072         lr->lr_active = LFSCK_TYPE_LAYOUT;
5073         lr->lr_flags2 = lo->ll_flags;
5074         llsd->llsd_touch_gen++;
5075         spin_lock(&llsd->llsd_lock);
5076         while (!list_empty(&llsd->llsd_master_list)) {
5077                 llst = list_first_entry(&llsd->llsd_master_list,
5078                                         struct lfsck_layout_slave_target,
5079                                         llst_list);
5080                 if (llst->llst_gen == llsd->llsd_touch_gen)
5081                         break;
5082
5083                 llst->llst_gen = llsd->llsd_touch_gen;
5084                 list_move_tail(&llst->llst_list,
5085                                &llsd->llsd_master_list);
5086                 kref_get(&llst->llst_ref);
5087                 spin_unlock(&llsd->llsd_lock);
5088
5089                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
5090                                                llst->llst_index);
5091                 if (exp == NULL) {
5092                         lfsck_layout_llst_del(llsd, llst);
5093                         kref_put(&llst->llst_ref, lfsck_layout_llst_put);
5094                         spin_lock(&llsd->llsd_lock);
5095                         continue;
5096                 }
5097
5098                 rc = lfsck_layout_async_notify(env, exp, lr, set);
5099                 if (rc != 0)
5100                         CDEBUG(D_LFSCK,
5101                                "%s: layout LFSCK slave fail to notify %s for layout: rc = %d\n",
5102                                lfsck_lfsck2name(lfsck),
5103                                exp->exp_obd->obd_name, rc);
5104
5105                 kref_put(&llst->llst_ref, lfsck_layout_llst_put);
5106                 class_export_put(exp);
5107                 spin_lock(&llsd->llsd_lock);
5108         }
5109         spin_unlock(&llsd->llsd_lock);
5110
5111         ptlrpc_set_wait(env, set);
5112         ptlrpc_set_destroy(set);
5113
5114         RETURN_EXIT;
5115 }
5116
5117 /*
5118  * \ret -ENODATA: unrecognized stripe
5119  * \ret = 0     : recognized stripe
5120  * \ret < 0     : other failures
5121  */
5122 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
5123                                            struct lfsck_component *com,
5124                                            struct lu_fid *cfid,
5125                                            struct lu_fid *pfid, __u32 comp_id)
5126 {
5127         struct lfsck_thread_info *info = lfsck_env_info(env);
5128         struct lu_buf *buf = &info->lti_big_buf;
5129         struct ost_id *oi = &info->lti_oi;
5130         struct dt_object *obj;
5131         struct lov_mds_md_v1 *lmm;
5132         struct lov_ost_data_v1 *objs;
5133         __u32 idx = pfid->f_stripe_idx;
5134         __u32 magic;
5135         int rc = 0;
5136         int i;
5137         __u16 count;
5138
5139         ENTRY;
5140         pfid->f_ver = 0;
5141         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
5142         if (IS_ERR(obj))
5143                 RETURN(PTR_ERR(obj));
5144
5145         dt_read_lock(env, obj, 0);
5146         if (unlikely(dt_object_exists(obj) == 0 ||
5147                      lfsck_is_dead_obj(obj)))
5148                 GOTO(unlock, rc = -ENOENT);
5149
5150         if (!S_ISREG(lfsck_object_type(obj)))
5151                 GOTO(unlock, rc = -ENODATA);
5152
5153         rc = lfsck_layout_get_lovea(env, obj, buf);
5154         if (rc < 0)
5155                 GOTO(unlock, rc);
5156
5157         lmm = buf->lb_buf;
5158         magic = le32_to_cpu(lmm->lmm_magic);
5159         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5160                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
5161                 struct lov_comp_md_entry_v1 *lcme;
5162
5163                 if (comp_id == 0)
5164                         GOTO(unlock, rc = -ENODATA);
5165
5166                 count = le16_to_cpu(lcm->lcm_entry_count);
5167                 for (i = 0; i < count; i++) {
5168                         lcme = &lcm->lcm_entries[i];
5169                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
5170                                 lmm = buf->lb_buf +
5171                                         le32_to_cpu(lcme->lcme_offset);
5172                                 magic = le32_to_cpu(lmm->lmm_magic);
5173                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5174                                       LCME_FL_INIT))
5175                                         GOTO(unlock, rc = -ENODATA);
5176
5177                                 goto further;
5178                         }
5179                 }
5180
5181                 GOTO(unlock, rc = -ENODATA);
5182         }
5183
5184 further:
5185         if (magic == LOV_MAGIC_V1) {
5186                 objs = &lmm->lmm_objects[0];
5187         } else {
5188                 LASSERT(magic == LOV_MAGIC_V3);
5189                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5190         }
5191
5192         fid_to_ostid(cfid, oi);
5193         count = le16_to_cpu(lmm->lmm_stripe_count);
5194         for (i = 0; i < count; i++, objs++) {
5195                 struct ost_id oi2;
5196
5197                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
5198                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
5199                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
5200         }
5201
5202         GOTO(unlock, rc = -ENODATA);
5203
5204 unlock:
5205         dt_read_unlock(env, obj);
5206         lfsck_object_put(env, obj);
5207
5208         return rc;
5209 }
5210
5211 /*
5212  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
5213  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
5214  * layout EA from MDT to OST. On one hand, the OST no need to understand
5215  * the layout EA structure; on the other hand, it may cause trouble when
5216  * transfer large layout EA from MDT to OST via normal OUT RPC.
5217  *
5218  * \ret > 0: unrecognized stripe
5219  * \ret = 0: recognized stripe
5220  * \ret < 0: other failures
5221  */
5222 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
5223                                           struct lfsck_component *com,
5224                                           struct lu_fid *cfid,
5225                                           struct lu_fid *pfid, __u32 comp_id)
5226 {
5227         struct lfsck_instance *lfsck = com->lc_lfsck;
5228         struct obd_device *obd = lfsck->li_obd;
5229         struct seq_server_site *ss = lfsck_dev_site(lfsck);
5230         struct obd_export *exp = NULL;
5231         struct ptlrpc_request *req = NULL;
5232         struct lfsck_request *lr;
5233         struct lu_seq_range *range  = &lfsck_env_info(env)->lti_range;
5234         int rc = 0;
5235
5236         ENTRY;
5237         if (unlikely(fid_is_idif(pfid)))
5238                 RETURN(1);
5239
5240         fld_range_set_any(range);
5241         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
5242         if (rc != 0)
5243                 RETURN(rc == -ENOENT ? 1 : rc);
5244
5245         if (unlikely(!fld_range_is_mdt(range)))
5246                 RETURN(1);
5247
5248         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
5249         if (unlikely(exp == NULL))
5250                 RETURN(1);
5251
5252         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
5253                 GOTO(out, rc = -EOPNOTSUPP);
5254
5255         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
5256         if (req == NULL)
5257                 GOTO(out, rc = -ENOMEM);
5258
5259         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
5260         if (rc != 0) {
5261                 ptlrpc_request_free(req);
5262
5263                 GOTO(out, rc);
5264         }
5265
5266         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
5267         memset(lr, 0, sizeof(*lr));
5268         lr->lr_event = LE_PAIRS_VERIFY;
5269         lr->lr_active = LFSCK_TYPE_LAYOUT;
5270         lr->lr_fid = *cfid; /* OST-object itself FID. */
5271         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
5272         lr->lr_comp_id = comp_id;
5273
5274         ptlrpc_request_set_replen(req);
5275         rc = ptlrpc_queue_wait(req);
5276         ptlrpc_req_finished(req);
5277
5278         if (rc == -ENOENT || rc == -ENODATA)
5279                 rc = 1;
5280
5281         GOTO(out, rc);
5282
5283 out:
5284         if (exp != NULL)
5285                 class_export_put(exp);
5286
5287         return rc;
5288 }
5289
5290 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
5291                                           struct lfsck_component *com,
5292                                           struct lfsck_req_local *lrl)
5293 {
5294         struct dt_object *obj;
5295         int rc = 0;
5296
5297         ENTRY;
5298         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
5299         if (IS_ERR(obj))
5300                 GOTO(log, rc = PTR_ERR(obj));
5301
5302         rc = __lfsck_layout_update_pfid(env, com, obj,
5303                                         &lrl->lrl_ff_client.ff_parent,
5304                                         &lrl->lrl_ff_client.ff_layout,
5305                                         lrl->lrl_ff_client.ff_layout_version,
5306                                         lrl->lrl_ff_client.ff_range,
5307                                         lrl->lrl_ff_client.ff_parent.f_ver);
5308
5309         lfsck_object_put(env, obj);
5310
5311 log:
5312         CDEBUG(D_LFSCK,
5313                "%s: layout LFSCK slave repaired pfid for "DFID", parent "DFID": rc = %d\n",
5314                lfsck_lfsck2name(com->lc_lfsck), PFID(&lrl->lrl_fid),
5315                PFID(&lrl->lrl_ff_client.ff_parent), rc);
5316
5317         return rc;
5318 }
5319
5320 /* layout APIs */
5321
5322 static void lfsck_layout_slave_quit(const struct lu_env *env,
5323                                     struct lfsck_component *com);
5324
5325 static int lfsck_layout_reset(const struct lu_env *env,
5326                               struct lfsck_component *com, bool init)
5327 {
5328         struct lfsck_layout *lo = com->lc_file_ram;
5329         int rc;
5330
5331         down_write(&com->lc_sem);
5332         if (init) {
5333                 memset(lo, 0, com->lc_file_size);
5334         } else {
5335                 __u32 count = lo->ll_success_count;
5336                 time64_t last_time = lo->ll_time_last_complete;
5337
5338                 memset(lo, 0, com->lc_file_size);
5339                 lo->ll_success_count = count;
5340                 lo->ll_time_last_complete = last_time;
5341         }
5342
5343         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
5344         lo->ll_status = LS_INIT;
5345
5346         if (com->lc_lfsck->li_master) {
5347                 struct lfsck_assistant_data *lad = com->lc_data;
5348
5349                 clear_bit(LAD_INCOMPLETE, &lad->lad_flags);
5350                 bitmap_zero(lad->lad_bitmap, lad->lad_bitmap_count);
5351         }
5352
5353         rc = lfsck_layout_store(env, com);
5354         if (rc == 0 && com->lc_lfsck->li_master)
5355                 rc = lfsck_load_sub_trace_files(env, com,
5356                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
5357         up_write(&com->lc_sem);
5358
5359         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
5360                lfsck_lfsck2name(com->lc_lfsck), rc);
5361
5362         return rc;
5363 }
5364
5365 static void lfsck_layout_fail(const struct lu_env *env,
5366                               struct lfsck_component *com, bool new_checked)
5367 {
5368         struct lfsck_layout *lo = com->lc_file_ram;
5369
5370         down_write(&com->lc_sem);
5371         if (new_checked)
5372                 com->lc_new_checked++;
5373         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
5374         up_write(&com->lc_sem);
5375 }
5376
5377 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
5378                                           struct lfsck_component *com, bool init)
5379 {
5380         struct lfsck_instance *lfsck = com->lc_lfsck;
5381         struct lfsck_layout *lo = com->lc_file_ram;
5382         int rc;
5383
5384         if (!init) {
5385                 rc = lfsck_checkpoint_generic(env, com);
5386                 if (rc != 0)
5387                         return rc > 0 ? 0 : rc;
5388         }
5389
5390         down_write(&com->lc_sem);
5391         if (init) {
5392                 lo->ll_pos_latest_start =
5393                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5394         } else {
5395                 lo->ll_pos_last_checkpoint =
5396                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5397                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5398                                           lfsck->li_time_last_checkpoint;
5399                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5400                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5401                 com->lc_new_checked = 0;
5402         }
5403
5404         rc = lfsck_layout_store(env, com);
5405         up_write(&com->lc_sem);
5406
5407         CDEBUG(D_LFSCK,
5408                "%s: layout LFSCK master checkpoint at the pos [%llu], status = %d: rc = %d\n",
5409                lfsck_lfsck2name(lfsck), lfsck->li_pos_current.lp_oit_cookie,
5410                lo->ll_status, rc);
5411
5412         return rc;
5413 }
5414
5415 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5416                                          struct lfsck_component *com, bool init)
5417 {
5418         struct lfsck_instance *lfsck = com->lc_lfsck;
5419         struct lfsck_layout *lo = com->lc_file_ram;
5420         int rc;
5421
5422         if (com->lc_new_checked == 0 && !init)
5423                 return 0;
5424
5425         down_write(&com->lc_sem);
5426         if (init) {
5427                 lo->ll_pos_latest_start =
5428                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5429         } else {
5430                 lo->ll_pos_last_checkpoint =
5431                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5432                 lo->ll_run_time_phase1 += ktime_get_seconds() -
5433                                           lfsck->li_time_last_checkpoint;
5434                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
5435                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5436                 com->lc_new_checked = 0;
5437         }
5438
5439         rc = lfsck_layout_store(env, com);
5440         up_write(&com->lc_sem);
5441
5442         CDEBUG(D_LFSCK,
5443                "%s: layout LFSCK slave checkpoint at the pos [%llu], status = %d: rc = %d\n",
5444                lfsck_lfsck2name(lfsck), lfsck->li_pos_current.lp_oit_cookie,
5445                lo->ll_status, rc);
5446
5447         return rc;
5448 }
5449
5450 static int lfsck_layout_prep(const struct lu_env *env,
5451                              struct lfsck_component *com,
5452                              struct lfsck_start *start)
5453 {
5454         struct lfsck_instance *lfsck = com->lc_lfsck;
5455         struct lfsck_layout *lo = com->lc_file_ram;
5456         struct lfsck_position *pos = &com->lc_pos_start;
5457
5458         fid_zero(&pos->lp_dir_parent);
5459         pos->lp_dir_cookie = 0;
5460         if (lo->ll_status == LS_COMPLETED ||
5461             lo->ll_status == LS_PARTIAL ||
5462             /* To handle orphan, must scan from the beginning. */
5463             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5464                 int rc;
5465
5466                 rc = lfsck_layout_reset(env, com, false);
5467                 if (rc == 0)
5468                         rc = lfsck_set_param(env, lfsck, start, true);
5469
5470                 if (rc != 0) {
5471                         CDEBUG(D_LFSCK,
5472                                "%s: layout LFSCK prep failed: rc = %d\n",
5473                                lfsck_lfsck2name(lfsck), rc);
5474
5475                         return rc;
5476                 }
5477         }
5478
5479         down_write(&com->lc_sem);
5480         lo->ll_time_latest_start = ktime_get_real_seconds();
5481         spin_lock(&lfsck->li_lock);
5482         if (lo->ll_flags & LF_SCANNED_ONCE) {
5483                 if (!lfsck->li_drop_dryrun ||
5484                     lo->ll_pos_first_inconsistent == 0) {
5485                         lo->ll_status = LS_SCANNING_PHASE2;
5486                         list_move_tail(&com->lc_link,
5487                                        &lfsck->li_list_double_scan);
5488                         pos->lp_oit_cookie = 0;
5489                 } else {
5490                         int i;
5491
5492                         lo->ll_status = LS_SCANNING_PHASE1;
5493                         lo->ll_run_time_phase1 = 0;
5494                         lo->ll_run_time_phase2 = 0;
5495                         lo->ll_objs_checked_phase1 = 0;
5496                         lo->ll_objs_checked_phase2 = 0;
5497                         lo->ll_objs_failed_phase1 = 0;
5498                         lo->ll_objs_failed_phase2 = 0;
5499                         for (i = 0; i < LLIT_MAX; i++)
5500                                 lo->ll_objs_repaired[i] = 0;
5501
5502                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5503                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5504                 }
5505         } else {
5506                 lo->ll_status = LS_SCANNING_PHASE1;
5507                 if (!lfsck->li_drop_dryrun ||
5508                     lo->ll_pos_first_inconsistent == 0)
5509                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5510                 else
5511                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5512         }
5513         spin_unlock(&lfsck->li_lock);
5514         up_write(&com->lc_sem);
5515
5516         return 0;
5517 }
5518
5519 static int lfsck_layout_slave_prep(const struct lu_env *env,
5520                                    struct lfsck_component *com,
5521                                    struct lfsck_start_param *lsp)
5522 {
5523         struct lfsck_layout_slave_data *llsd = com->lc_data;
5524         struct lfsck_instance *lfsck = com->lc_lfsck;
5525         struct lfsck_layout *lo = com->lc_file_ram;
5526         struct lfsck_start *start = lsp->lsp_start;
5527         int rc;
5528
5529         rc = lfsck_layout_prep(env, com, start);
5530         if (rc != 0)
5531                 return rc;
5532
5533         if (lo->ll_flags & LF_CRASHED_LASTID &&
5534             list_empty(&llsd->llsd_master_list)) {
5535                 LASSERT(lfsck->li_out_notify != NULL);
5536
5537                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5538                                      LE_LASTID_REBUILDING);
5539         }
5540
5541         if (!lsp->lsp_index_valid)
5542                 return 0;
5543
5544         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5545         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5546                 LASSERT(!llsd->llsd_rbtree_valid);
5547
5548                 down_write(&llsd->llsd_rb_rwsem);
5549                 rc = lfsck_rbtree_setup(env, com);
5550                 up_write(&llsd->llsd_rb_rwsem);
5551         }
5552
5553         CDEBUG(D_LFSCK,
5554                "%s: layout LFSCK slave prep done, start pos [%llu]\n",
5555                lfsck_lfsck2name(lfsck), com->lc_pos_start.lp_oit_cookie);
5556
5557         return rc;
5558 }
5559
5560 static int lfsck_layout_master_prep(const struct lu_env *env,
5561                                     struct lfsck_component *com,
5562                                     struct lfsck_start_param *lsp)
5563 {
5564         int rc;
5565
5566         ENTRY;
5567         rc = lfsck_layout_load_bitmap(env, com);
5568         if (rc != 0) {
5569                 rc = lfsck_layout_reset(env, com, false);
5570                 if (rc == 0)
5571                         rc = lfsck_set_param(env, com->lc_lfsck,
5572                                              lsp->lsp_start, true);
5573
5574                 if (rc != 0)
5575                         GOTO(log, rc);
5576         }
5577
5578         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5579         if (rc != 0)
5580                 RETURN(rc);
5581
5582         rc = lfsck_start_assistant(env, com, lsp);
5583
5584         GOTO(log, rc);
5585
5586 log:
5587         CDEBUG(D_LFSCK,
5588                "%s: layout LFSCK master prep done, start pos [%llu]\n",
5589                lfsck_lfsck2name(com->lc_lfsck),
5590                com->lc_pos_start.lp_oit_cookie);
5591
5592         return 0;
5593 }
5594
5595 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5596 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5597                                      struct lfsck_component *com,
5598                                      struct dt_object *parent,
5599                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5600 {
5601         struct lfsck_thread_info *info = lfsck_env_info(env);
5602         struct lfsck_instance *lfsck = com->lc_lfsck;
5603         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
5604         struct lfsck_layout *lo = com->lc_file_ram;
5605         struct lfsck_assistant_data *lad = com->lc_data;
5606         struct lfsck_assistant_object *lso = NULL;
5607         struct lov_ost_data_v1 *objs;
5608         struct lfsck_tgt_descs *ltds = &lfsck->li_ost_descs;
5609         struct ptlrpc_thread *mthread = &lfsck->li_thread;
5610         struct ptlrpc_thread *athread = &lad->lad_thread;
5611         struct lu_buf buf;
5612         int rc = 0;
5613         int i;
5614         __u32 magic;
5615         __u16 count;
5616
5617         ENTRY;
5618         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5619         magic = le32_to_cpu(lmm->lmm_magic);
5620         if (magic == LOV_MAGIC_V1) {
5621                 objs = &lmm->lmm_objects[0];
5622         } else {
5623                 LASSERT(magic == LOV_MAGIC_V3);
5624                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5625         }
5626
5627         count = le16_to_cpu(lmm->lmm_stripe_count);
5628         for (i = 0; i < count; i++, objs++) {
5629                 struct lu_fid           *fid    = &info->lti_fid;
5630                 struct ost_id           *oi     = &info->lti_oi;
5631                 struct lfsck_layout_req *llr;
5632                 struct lfsck_tgt_desc   *tgt    = NULL;
5633                 struct dt_object        *cobj   = NULL;
5634                 __u32                    index;
5635                 bool                     wakeup = false;
5636
5637                 if (unlikely(lovea_slot_is_dummy(objs)))
5638                         continue;
5639
5640                 wait_event_idle(mthread->t_ctl_waitq,
5641                                 lad->lad_prefetched < bk->lb_async_windows ||
5642                                 !thread_is_running(mthread) ||
5643                                 thread_is_stopped(athread));
5644
5645                 if (unlikely(!thread_is_running(mthread)) ||
5646                              thread_is_stopped(athread))
5647                         GOTO(out, rc = 0);
5648
5649                 if (unlikely(lfsck_is_dead_obj(parent)))
5650                         GOTO(out, rc = 0);
5651
5652                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5653                 index = le32_to_cpu(objs->l_ost_idx);
5654                 rc = ostid_to_fid(fid, oi, index);
5655                 if (rc != 0) {
5656                         CDEBUG(D_LFSCK,
5657                                "%s: get invalid layout EA for "DFID": "DOSTID", idx %u, comp_id %u\n",
5658                                lfsck_lfsck2name(lfsck),
5659                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5660                                index, comp_id);
5661                         goto next;
5662                 }
5663
5664                 tgt = lfsck_tgt_get(ltds, index);
5665                 if (unlikely(tgt == NULL)) {
5666                         CDEBUG(D_LFSCK,
5667                                "%s: cannot talk with OST %x which did not join the layout LFSCK, comp_id %u\n",
5668                                lfsck_lfsck2name(lfsck), index, comp_id);
5669                         lfsck_lad_set_bitmap(env, com, index);
5670                         goto next;
5671                 }
5672
5673                 /* There is potential deadlock race condition between object
5674                  * destroy and layout LFSCK. Consider the following scenario:
5675                  *
5676                  * 1) The LFSCK thread obtained the parent object firstly, at
5677                  *    that time, the parent object has not been destroyed yet.
5678                  *
5679                  * 2) One RPC service thread destroyed the parent and all its
5680                  *    children objects. Because the LFSCK is referencing the
5681                  *    parent object, then the parent object will be marked as
5682                  *    dying in RAM. On the other hand, the parent object is
5683                  *    referencing all its children objects, then all children
5684                  *    objects will be marked as dying in RAM also.
5685                  *
5686                  * 3) The LFSCK thread tries to find some child object with
5687                  *    the parent object referenced. Then it will find that the
5688                  *    child object is dying. According to the object visibility
5689                  *    rules: the object with dying flag cannot be returned to
5690                  *    others. So the LFSCK thread has to wait until the dying
5691                  *    object has been purged from RAM, then it can allocate a
5692                  *    new object (with the same FID) in RAM. Unfortunately, the
5693                  *    LFSCK thread itself is referencing the parent object, and
5694                  *    cause the parent object cannot be purged, then cause the
5695                  *    child object cannot be purged also. So the LFSCK thread
5696                  *    will fall into deadlock.
5697                  */
5698                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5699                 if (IS_ERR(cobj)) {
5700                         if (lfsck_is_dead_obj(parent)) {
5701                                 lfsck_tgt_put(tgt);
5702
5703                                 GOTO(out, rc = 0);
5704                         }
5705
5706                         rc = PTR_ERR(cobj);
5707                         goto next;
5708                 }
5709
5710                 rc = dt_declare_attr_get(env, cobj);
5711                 if (rc)
5712                         goto next;
5713
5714                 rc = dt_declare_xattr_get(env, cobj, &buf, XATTR_NAME_FID);
5715                 if (rc)
5716                         goto next;
5717
5718                 if (lso == NULL) {
5719                         struct lu_attr *attr = &info->lti_la;
5720
5721                         rc = dt_attr_get(env, parent, attr);
5722                         if (rc != 0)
5723                                 goto next;
5724
5725                         lso = lfsck_assistant_object_init(env,
5726                                 lfsck_dto2fid(parent), attr,
5727                                 lfsck->li_pos_current.lp_oit_cookie, false);
5728                         if (IS_ERR(lso)) {
5729                                 rc = PTR_ERR(lso);
5730                                 lso = NULL;
5731
5732                                 goto next;
5733                         }
5734                 }
5735
5736                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5737                                                       index, i);
5738                 if (IS_ERR(llr)) {
5739                         rc = PTR_ERR(llr);
5740                         goto next;
5741                 }
5742
5743                 cobj = NULL;
5744                 spin_lock(&lad->lad_lock);
5745                 if (lad->lad_assistant_status < 0) {
5746                         spin_unlock(&lad->lad_lock);
5747                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5748                         lfsck_tgt_put(tgt);
5749                         RETURN(lad->lad_assistant_status);
5750                 }
5751
5752                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5753                 if (lad->lad_prefetched == 0)
5754                         wakeup = true;
5755
5756                 lad->lad_prefetched++;
5757                 spin_unlock(&lad->lad_lock);
5758                 if (wakeup)
5759                         wake_up(&athread->t_ctl_waitq);
5760
5761 next:
5762                 down_write(&com->lc_sem);
5763                 com->lc_new_checked++;
5764                 if (rc < 0)
5765                         lfsck_layout_record_failure(env, lfsck, lo);
5766                 up_write(&com->lc_sem);
5767
5768                 if (cobj != NULL && !IS_ERR(cobj))
5769                         lfsck_object_put(env, cobj);
5770
5771                 if (likely(tgt != NULL))
5772                         lfsck_tgt_put(tgt);
5773
5774                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5775                         GOTO(out, rc);
5776         }
5777
5778         GOTO(out, rc = 0);
5779
5780 out:
5781         if (lso != NULL)
5782                 kref_put(&lso->lso_ref, lfsck_assistant_object_put);
5783
5784         return rc;
5785 }
5786
5787 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5788  * the OST-object's attribute and generate an structure lfsck_layout_req on the
5789  * list ::lad_req_list.
5790  *
5791  * For each request on above list, the lfsck_layout_assistant thread compares
5792  * the OST side attribute with local attribute, if inconsistent, then repair it.
5793  *
5794  * All above processing is async mode with pipeline.
5795  */
5796 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5797                                         struct lfsck_component *com,
5798                                         struct dt_object *obj)
5799 {
5800         struct lfsck_thread_info *info = lfsck_env_info(env);
5801         struct ost_id *oi = &info->lti_oi;
5802         struct lfsck_layout *lo = com->lc_file_ram;
5803         struct lfsck_assistant_data *lad = com->lc_data;
5804         struct lfsck_instance *lfsck = com->lc_lfsck;
5805         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
5806         struct thandle *handle = NULL;
5807         struct lu_buf *buf = &info->lti_big_buf;
5808         struct lov_mds_md_v1 *lmm = NULL;
5809         struct dt_device *dev = lfsck_obj2dev(obj);
5810         struct lustre_handle lh = { 0 };
5811         struct lu_buf ea_buf = { NULL };
5812         struct lov_comp_md_v1 *lcm = NULL;
5813         struct lov_comp_md_entry_v1 *lcme = NULL;
5814         int rc = 0;
5815         int size = 0;
5816         __u32 magic = 0;
5817         __u16 count = 0;
5818         bool locked = false;
5819         bool stripe = false;
5820         bool bad_oi = false;
5821
5822         ENTRY;
5823         if (!S_ISREG(lfsck_object_type(obj)))
5824                 GOTO(out, rc = 0);
5825
5826         if (lad->lad_assistant_status < 0)
5827                 GOTO(out, rc = -ESRCH);
5828
5829         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5830         lmm_oi_cpu_to_le(oi, oi);
5831         dt_read_lock(env, obj, 0);
5832         locked = true;
5833
5834 again:
5835         bad_oi = false;
5836         if (dt_object_exists(obj) == 0 ||
5837             lfsck_is_dead_obj(obj))
5838                 GOTO(out, rc = 0);
5839
5840         rc = lfsck_layout_get_lovea(env, obj, buf);
5841         if (rc == -EINVAL || rc == -ENODATA || rc == -EOPNOTSUPP)
5842                 /* Skip bad lov EA during the 1st cycle scanning, and
5843                  * try to recover it via orphan in the 2nd scanning.
5844                  */
5845                 rc = 0;
5846         if (rc <= 0)
5847                 GOTO(out, rc);
5848
5849         size = rc;
5850         lmm = buf->lb_buf;
5851         magic = le32_to_cpu(lmm->lmm_magic);
5852         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5853                 struct lov_mds_md_v1 *v1;
5854                 int i;
5855
5856                 lcm = buf->lb_buf;
5857                 count = le16_to_cpu(lcm->lcm_entry_count);
5858                 for (i = 0; i < count; i++) {
5859                         lcme = &lcm->lcm_entries[i];
5860                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5861                         if (memcmp(oi, &v1->lmm_oi, sizeof(*oi)) != 0)
5862                                 goto fix;
5863                 }
5864
5865                 GOTO(out, stripe = true);
5866         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5867                 GOTO(out, stripe = true);
5868         }
5869
5870 fix:
5871         /* Inconsistent lmm_oi, should be repaired. */
5872         bad_oi = true;
5873
5874         if (bk->lb_param & LPF_DRYRUN) {
5875                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5876
5877                 GOTO(out, stripe = true);
5878         }
5879
5880         if (!lustre_handle_is_used(&lh)) {
5881                 dt_read_unlock(env, obj);
5882                 locked = false;
5883                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5884                                       MDS_INODELOCK_LAYOUT |
5885                                       MDS_INODELOCK_XATTR, LCK_EX);
5886                 if (rc != 0)
5887                         GOTO(out, rc);
5888
5889                 handle = lfsck_trans_create(env, dev, lfsck);
5890                 if (IS_ERR(handle))
5891                         GOTO(out, rc = PTR_ERR(handle));
5892
5893                 lfsck_buf_init(&ea_buf, buf->lb_buf, size);
5894                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5895                                           LU_XATTR_REPLACE, handle);
5896                 if (rc != 0)
5897                         GOTO(out, rc);
5898
5899                 rc = dt_trans_start_local(env, dev, handle);
5900                 if (rc != 0)
5901                         GOTO(out, rc);
5902
5903                 dt_write_lock(env, obj, 0);
5904                 locked = true;
5905
5906                 goto again;
5907         }
5908
5909         if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5910                 struct lov_mds_md_v1 *v1;
5911                 int i;
5912
5913                 for (i = 0; i < count; i++) {
5914                         lcme = &lcm->lcm_entries[i];
5915                         v1 = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5916                         v1->lmm_oi = *oi;
5917                 }
5918         } else {
5919                 lmm->lmm_oi = *oi;
5920         }
5921
5922         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5923                           LU_XATTR_REPLACE, handle);
5924         if (rc != 0)
5925                 GOTO(out, rc);
5926
5927         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5928
5929         GOTO(out, stripe = true);
5930
5931 out:
5932         if (locked) {
5933                 if (lustre_handle_is_used(&lh))
5934                         dt_write_unlock(env, obj);
5935                 else
5936                         dt_read_unlock(env, obj);
5937         }
5938
5939         if (handle != NULL && !IS_ERR(handle))
5940                 dt_trans_stop(env, dev, handle);
5941
5942         lfsck_ibits_unlock(&lh, LCK_EX);
5943
5944         if (bad_oi)
5945                 CDEBUG(D_LFSCK,
5946                        "%s: layout LFSCK master %s bad lmm_oi for "DFID": rc = %d\n",
5947                        lfsck_lfsck2name(lfsck),
5948                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5949                        PFID(lfsck_dto2fid(obj)), rc);
5950
5951         if (stripe) {
5952                 if (magic == LOV_MAGIC_COMP_V1 || magic == LOV_MAGIC_SEL) {
5953                         int i;
5954
5955                         for (i = 0; i < count; i++) {
5956                                 lcme = &lcm->lcm_entries[i];
5957                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5958                                       LCME_FL_INIT))
5959                                         continue;
5960
5961                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5962                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5963                                         le32_to_cpu(lcme->lcme_offset)),
5964                                         le32_to_cpu(lcme->lcme_id));
5965                         }
5966                 } else {
5967                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5968                 }
5969         } else {
5970                 down_write(&com->lc_sem);
5971                 com->lc_new_checked++;
5972                 if (rc < 0)
5973                         lfsck_layout_record_failure(env, lfsck, lo);
5974                 up_write(&com->lc_sem);
5975         }
5976
5977         return rc;
5978 }
5979
5980 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5981                                        struct lfsck_component *com,
5982                                        struct dt_object *obj)
5983 {
5984         struct lfsck_instance *lfsck = com->lc_lfsck;
5985         struct lfsck_layout *lo = com->lc_file_ram;
5986         const struct lu_fid *fid = lfsck_dto2fid(obj);
5987         struct lfsck_layout_slave_data *llsd = com->lc_data;
5988         struct lfsck_layout_seq *lls;
5989         __u64 seq;
5990         __u64 oid;
5991         int rc;
5992
5993         ENTRY;
5994         LASSERT(llsd != NULL);
5995
5996         if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5997             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5998                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5999
6000                 wait_event_idle_timeout(thread->t_ctl_waitq,
6001                                         !thread_is_running(thread),
6002                                         cfs_time_seconds(1));
6003         }
6004
6005         lfsck_rbtree_update_bitmap(env, com, fid, false);
6006
6007         down_write(&com->lc_sem);
6008         if (fid_is_idif(fid))
6009                 seq = 0;
6010         else if (!fid_is_norm(fid) ||
6011                  !fid_is_for_ostobj(env, lfsck, obj, fid))
6012                 GOTO(unlock, rc = 0);
6013         else
6014                 seq = fid_seq(fid);
6015         com->lc_new_checked++;
6016
6017         lls = lfsck_layout_seq_lookup(llsd, seq);
6018         if (lls == NULL) {
6019                 OBD_ALLOC_PTR(lls);
6020                 if (unlikely(lls == NULL))
6021                         GOTO(unlock, rc = -ENOMEM);
6022
6023                 INIT_LIST_HEAD(&lls->lls_list);
6024                 lls->lls_seq = seq;
6025                 rc = lfsck_layout_lastid_load(env, com, lls);
6026                 if (rc != 0) {
6027                         CDEBUG(D_LFSCK,
6028                                "%s: layout LFSCK failed to load LAST_ID for %#llx: rc = %d\n",
6029                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
6030                         lo->ll_objs_failed_phase1++;
6031                         OBD_FREE_PTR(lls);
6032                         GOTO(unlock, rc);
6033                 }
6034
6035                 lfsck_layout_seq_insert(llsd, lls);
6036         }
6037
6038         if (unlikely(fid_is_last_id(fid)))
6039                 GOTO(unlock, rc = 0);
6040
6041         if (fid_is_idif(fid))
6042                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
6043         else
6044                 oid = fid_oid(fid);
6045
6046         if (oid > lls->lls_lastid_known)
6047                 lls->lls_lastid_known = oid;
6048
6049         if (oid > lls->lls_lastid) {
6050                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
6051                         /* OFD may create new objects during LFSCK scanning. */
6052                         rc = lfsck_layout_lastid_reload(env, com, lls);
6053                         if (unlikely(rc != 0)) {
6054                                 CDEBUG(D_LFSCK,
6055                                        "%s: layout LFSCK failed to reload LAST_ID for %#llx: rc = %d\n",
6056                                       lfsck_lfsck2name(com->lc_lfsck),
6057                                       lls->lls_seq, rc);
6058
6059                                 GOTO(unlock, rc);
6060                         }
6061
6062                         if (oid <= lls->lls_lastid ||
6063                             lo->ll_flags & LF_CRASHED_LASTID)
6064                                 GOTO(unlock, rc = 0);
6065
6066                         LASSERT(lfsck->li_out_notify != NULL);
6067
6068                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6069                                              LE_LASTID_REBUILDING);
6070                         lo->ll_flags |= LF_CRASHED_LASTID;
6071
6072                         CDEBUG(D_LFSCK,
6073                                "%s: layout LFSCK finds crashed LAST_ID file (2) for the sequence %#llx, old value %llu, known value %llu\n",
6074                                lfsck_lfsck2name(lfsck), lls->lls_seq,
6075                                lls->lls_lastid, oid);
6076                 }
6077
6078                 lls->lls_lastid = oid;
6079                 lls->lls_dirty = 1;
6080         }
6081
6082         GOTO(unlock, rc = 0);
6083
6084 unlock:
6085         up_write(&com->lc_sem);
6086
6087         return rc;
6088 }
6089
6090 static int lfsck_layout_exec_dir(const struct lu_env *env,
6091                                  struct lfsck_component *com,
6092                                  struct lfsck_assistant_object *lso,
6093                                  struct lu_dirent *ent, __u16 type)
6094 {
6095         return 0;
6096 }
6097
6098 static int lfsck_layout_master_post(const struct lu_env *env,
6099                                     struct lfsck_component *com,
6100                                     int result, bool init)
6101 {
6102         struct lfsck_instance *lfsck = com->lc_lfsck;
6103         struct lfsck_layout *lo = com->lc_file_ram;
6104         int rc;
6105
6106         ENTRY;
6107         lfsck_post_generic(env, com, &result);
6108
6109         down_write(&com->lc_sem);
6110         spin_lock(&lfsck->li_lock);
6111         if (!init)
6112                 lo->ll_pos_last_checkpoint =
6113                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6114
6115         if (result > 0) {
6116                 if (lo->ll_flags & LF_INCOMPLETE)
6117                         lo->ll_status = LS_PARTIAL;
6118                 else
6119                         lo->ll_status = LS_SCANNING_PHASE2;
6120                 lo->ll_flags |= LF_SCANNED_ONCE;
6121                 lo->ll_flags &= ~LF_UPGRADE;
6122                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6123         } else if (result == 0) {
6124                 if (lfsck->li_status != 0)
6125                         lo->ll_status = lfsck->li_status;
6126                 else
6127                         lo->ll_status = LS_STOPPED;
6128                 if (lo->ll_status != LS_PAUSED)
6129                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6130         } else {
6131                 lo->ll_status = LS_FAILED;
6132                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6133         }
6134         spin_unlock(&lfsck->li_lock);
6135
6136         if (!init) {
6137                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6138                                           lfsck->li_time_last_checkpoint;
6139                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6140                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6141                 com->lc_new_checked = 0;
6142         }
6143
6144         rc = lfsck_layout_store(env, com);
6145         up_write(&com->lc_sem);
6146
6147         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
6148                lfsck_lfsck2name(lfsck), rc);
6149
6150         RETURN(rc);
6151 }
6152
6153 static int lfsck_layout_slave_post(const struct lu_env *env,
6154                                    struct lfsck_component *com,
6155                                    int result, bool init)
6156 {
6157         struct lfsck_instance *lfsck = com->lc_lfsck;
6158         struct lfsck_layout *lo = com->lc_file_ram;
6159         int rc;
6160         bool done = false;
6161
6162         down_write(&com->lc_sem);
6163         rc = lfsck_layout_lastid_store(env, com);
6164         if (rc != 0)
6165                 result = rc;
6166
6167         LASSERT(lfsck->li_out_notify != NULL);
6168
6169         spin_lock(&lfsck->li_lock);
6170         if (!init)
6171                 lo->ll_pos_last_checkpoint =
6172                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
6173
6174         if (result > 0) {
6175                 lo->ll_status = LS_SCANNING_PHASE2;
6176                 lo->ll_flags |= LF_SCANNED_ONCE;
6177                 if (lo->ll_flags & LF_CRASHED_LASTID) {
6178                         done = true;
6179                         lo->ll_flags &= ~LF_CRASHED_LASTID;
6180
6181                         CDEBUG(D_LFSCK,
6182                                "%s: layout LFSCK has rebuilt crashed LAST_ID files successfully\n",
6183                                lfsck_lfsck2name(lfsck));
6184                 }
6185                 lo->ll_flags &= ~LF_UPGRADE;
6186                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
6187         } else if (result == 0) {
6188                 if (lfsck->li_status != 0)
6189                         lo->ll_status = lfsck->li_status;
6190                 else
6191                         lo->ll_status = LS_STOPPED;
6192                 if (lo->ll_status != LS_PAUSED)
6193                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6194         } else {
6195                 lo->ll_status = LS_FAILED;
6196                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
6197         }
6198         spin_unlock(&lfsck->li_lock);
6199
6200         if (done)
6201                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6202                                      LE_LASTID_REBUILT);
6203
6204         if (!init) {
6205                 lo->ll_run_time_phase1 += ktime_get_seconds() -
6206                                           lfsck->li_time_last_checkpoint;
6207                 lo->ll_time_last_checkpoint = ktime_get_real_seconds();
6208                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
6209                 com->lc_new_checked = 0;
6210         }
6211
6212         rc = lfsck_layout_store(env, com);
6213         up_write(&com->lc_sem);
6214
6215         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
6216
6217         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
6218                lfsck_lfsck2name(lfsck), rc);
6219
6220         return rc;
6221 }
6222
6223 static void lfsck_layout_dump(const struct lu_env *env,
6224                               struct lfsck_component *com, struct seq_file *m)
6225 {
6226         struct lfsck_instance *lfsck = com->lc_lfsck;
6227         struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
6228         struct lfsck_layout *lo = com->lc_file_ram;
6229         const char *prefix;
6230
6231         down_read(&com->lc_sem);
6232         if (bk->lb_param & LPF_DRYRUN)
6233                 prefix = "inconsistent";
6234         else
6235                 prefix = "repaired";
6236
6237         seq_printf(m, "name: lfsck_layout\n"
6238                    "magic: %#x\n"
6239                    "version: %d\n"
6240                    "status: %s\n",
6241                    lo->ll_magic,
6242                    bk->lb_version,
6243                    lfsck_status2name(lo->ll_status));
6244
6245         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
6246
6247         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
6248
6249         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
6250
6251         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
6252
6253         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
6254
6255         seq_printf(m, "latest_start_position: %llu\n"
6256                    "last_checkpoint_position: %llu\n"
6257                    "first_failure_position: %llu\n",
6258                    lo->ll_pos_latest_start,
6259                    lo->ll_pos_last_checkpoint,
6260                    lo->ll_pos_first_inconsistent);
6261
6262         seq_printf(m, "success_count: %u\n"
6263                    "%s_dangling: %llu\n"
6264                    "%s_unmatched_pair: %llu\n"
6265                    "%s_multiple_referenced: %llu\n"
6266                    "%s_orphan: %llu\n"
6267                    "%s_inconsistent_owner: %llu\n"
6268                    "%s_others: %llu\n"
6269                    "skipped: %llu\n"
6270                    "failed_phase1: %llu\n"
6271                    "failed_phase2: %llu\n",
6272                    lo->ll_success_count,
6273                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
6274                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
6275                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
6276                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
6277                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
6278                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
6279                    lo->ll_objs_skipped,
6280                    lo->ll_objs_failed_phase1,
6281                    lo->ll_objs_failed_phase2);
6282
6283         if (lo->ll_status == LS_SCANNING_PHASE1) {
6284                 time64_t duration = ktime_get_seconds() -
6285                                     lfsck->li_time_last_checkpoint;
6286                 u64 checked = lo->ll_objs_checked_phase1 +
6287                               com->lc_new_checked;
6288                 u64 speed = checked;
6289                 u64 new_checked = com->lc_new_checked;
6290                 time64_t rtime = lo->ll_run_time_phase1 + duration;
6291                 u64 pos;
6292
6293                 if (duration != 0)
6294                         new_checked = div64_s64(new_checked, duration);
6295                 if (rtime != 0)
6296                         speed = div64_s64(speed, rtime);
6297                 seq_printf(m, "checked_phase1: %llu\n"
6298                            "checked_phase2: %llu\n"
6299                            "run_time_phase1: %lld seconds\n"
6300                            "run_time_phase2: %lld seconds\n"
6301                            "average_speed_phase1: %llu items/sec\n"
6302                            "average_speed_phase2: N/A\n"
6303                            "real_time_speed_phase1: %llu items/sec\n"
6304                            "real_time_speed_phase2: N/A\n",
6305                            checked,
6306                            lo->ll_objs_checked_phase2,
6307                            rtime,
6308                            lo->ll_run_time_phase2,
6309                            speed,
6310                            new_checked);
6311
6312                 if (likely(lfsck->li_di_oit)) {
6313                         const struct dt_it_ops *iops =
6314                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
6315
6316                         /* The low layer otable-based iteration position may NOT
6317                          * exactly match the layout-based directory traversal
6318                          * cookie. Generally, it is not a serious issue. But the
6319                          * caller should NOT make assumption on that.
6320                          */
6321                         pos = iops->store(env, lfsck->li_di_oit);
6322                         if (!lfsck->li_current_oit_processed)
6323                                 pos--;
6324                 } else {
6325                         pos = lo->ll_pos_last_checkpoint;
6326                 }
6327
6328                 seq_printf(m, "current_position: %llu\n", pos);
6329         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
6330                 time64_t duration = ktime_get_seconds() -
6331                                     com->lc_time_last_checkpoint;
6332                 u64 checked = lo->ll_objs_checked_phase2 +
6333                               com->lc_new_checked;
6334                 u64 speed1 = lo->ll_objs_checked_phase1;
6335                 u64 speed2 = checked;
6336                 u64 new_checked = com->lc_new_checked;
6337                 time64_t rtime = lo->ll_run_time_phase2 + duration;
6338
6339                 if (duration != 0)
6340                         new_checked = div64_s64(new_checked, duration);
6341                 if (lo->ll_run_time_phase1 != 0)
6342                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6343                 if (rtime != 0)
6344                         speed2 = div64_s64(speed2, rtime);
6345                 seq_printf(m, "checked_phase1: %llu\n"
6346                            "checked_phase2: %llu\n"
6347                            "run_time_phase1: %lld seconds\n"
6348                            "run_time_phase2: %lld seconds\n"
6349                            "average_speed_phase1: %llu items/sec\n"
6350                            "average_speed_phase2: %llu items/sec\n"
6351                            "real_time_speed_phase1: N/A\n"
6352                            "real_time_speed_phase2: %llu items/sec\n"
6353                            "current_position: "DFID"\n",
6354                            lo->ll_objs_checked_phase1,
6355                            checked,
6356                            lo->ll_run_time_phase1,
6357                            rtime,
6358                            speed1,
6359                            speed2,
6360                            new_checked,
6361                            PFID(&com->lc_fid_latest_scanned_phase2));
6362         } else {
6363                 __u64 speed1 = lo->ll_objs_checked_phase1;
6364                 __u64 speed2 = lo->ll_objs_checked_phase2;
6365
6366                 if (lo->ll_run_time_phase1 != 0)
6367                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
6368                 if (lo->ll_run_time_phase2 != 0)
6369                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
6370                 seq_printf(m, "checked_phase1: %llu\n"
6371                            "checked_phase2: %llu\n"
6372                            "run_time_phase1: %lld seconds\n"
6373                            "run_time_phase2: %lld seconds\n"
6374                            "average_speed_phase1: %llu items/sec\n"
6375                            "average_speed_phase2: %llu objs/sec\n"
6376                            "real_time_speed_phase1: N/A\n"
6377                            "real_time_speed_phase2: N/A\n"
6378                            "current_position: N/A\n",
6379                            lo->ll_objs_checked_phase1,
6380                            lo->ll_objs_checked_phase2,
6381                            lo->ll_run_time_phase1,
6382                            lo->ll_run_time_phase2,
6383                            speed1,
6384                            speed2);
6385         }
6386
6387         up_read(&com->lc_sem);
6388 }
6389
6390 static int lfsck_layout_master_double_scan(const struct lu_env *env,
6391                                            struct lfsck_component *com)
6392 {
6393         struct lfsck_layout *lo = com->lc_file_ram;
6394         struct lfsck_assistant_data *lad = com->lc_data;
6395         struct lfsck_instance *lfsck = com->lc_lfsck;
6396         struct lfsck_tgt_descs *ltds;
6397         struct lfsck_tgt_desc *ltd;
6398         struct lfsck_tgt_desc *next;
6399         int rc;
6400
6401         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6402
6403         if (thread_is_stopped(&lad->lad_thread)) {
6404                 LASSERT(list_empty(&lad->lad_req_list));
6405                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6406                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6407
6408                 ltds = &lfsck->li_ost_descs;
6409                 spin_lock(&ltds->ltd_lock);
6410                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6411                                          ltd_layout_phase_list) {
6412                         list_del_init(&ltd->ltd_layout_phase_list);
6413                 }
6414                 spin_unlock(&ltds->ltd_lock);
6415
6416                 ltds = &lfsck->li_mdt_descs;
6417                 spin_lock(&ltds->ltd_lock);
6418                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6419                                          ltd_layout_phase_list) {
6420                         list_del_init(&ltd->ltd_layout_phase_list);
6421                 }
6422                 spin_unlock(&ltds->ltd_lock);
6423         }
6424
6425         return rc;
6426 }
6427
6428 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6429                                           struct lfsck_component *com)
6430 {
6431         struct lfsck_instance *lfsck = com->lc_lfsck;
6432         struct lfsck_layout_slave_data *llsd = com->lc_data;
6433         struct lfsck_layout *lo = com->lc_file_ram;
6434         struct ptlrpc_thread *thread = &lfsck->li_thread;
6435         int rc;
6436
6437         ENTRY;
6438         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6439                lfsck_lfsck2name(lfsck));
6440
6441         atomic_inc(&lfsck->li_double_scan_count);
6442
6443         if (lo->ll_flags & LF_INCOMPLETE)
6444                 GOTO(done, rc = 1);
6445
6446         com->lc_new_checked = 0;
6447         com->lc_new_scanned = 0;
6448         com->lc_time_last_checkpoint = ktime_get_seconds();
6449         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6450                                        LFSCK_CHECKPOINT_INTERVAL;
6451
6452         while (1) {
6453                 rc = lfsck_layout_slave_query_master(env, com);
6454                 if (list_empty(&llsd->llsd_master_list)) {
6455                         if (unlikely(!thread_is_running(thread)))
6456                                 rc = 0;
6457                         else
6458                                 rc = 1;
6459
6460                         GOTO(done, rc);
6461                 }
6462
6463                 if (rc < 0)
6464                         GOTO(done, rc);
6465
6466                 rc = wait_event_idle_timeout(
6467                         thread->t_ctl_waitq,
6468                         !thread_is_running(thread) ||
6469                         lo->ll_flags & LF_INCOMPLETE ||
6470                         list_empty(&llsd->llsd_master_list),
6471                         cfs_time_seconds(30));
6472                 if (unlikely(!thread_is_running(thread)))
6473                         GOTO(done, rc = 0);
6474
6475                 if (lo->ll_flags & LF_INCOMPLETE)
6476                         GOTO(done, rc = 1);
6477
6478                 if (rc == 0)
6479                         continue;
6480
6481                 GOTO(done, rc = 1);
6482         }
6483
6484 done:
6485         rc = lfsck_layout_double_scan_result(env, com, rc);
6486         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6487                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6488         lfsck_layout_slave_quit(env, com);
6489         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6490                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6491
6492         CDEBUG(D_LFSCK,
6493                "%s: layout LFSCK slave phase2 scan finished, status %d: rc = %d\n",
6494                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6495
6496         return rc;
6497 }
6498
6499 static void lfsck_layout_master_data_release(const struct lu_env *env,
6500                                              struct lfsck_component *com)
6501 {
6502         struct lfsck_assistant_data *lad = com->lc_data;
6503         struct lfsck_instance *lfsck = com->lc_lfsck;
6504         struct lfsck_tgt_descs *ltds;
6505         struct lfsck_tgt_desc *ltd;
6506         struct lfsck_tgt_desc *next;
6507
6508         LASSERT(lad != NULL);
6509         LASSERT(thread_is_init(&lad->lad_thread) ||
6510                 thread_is_stopped(&lad->lad_thread));
6511         LASSERT(list_empty(&lad->lad_req_list));
6512
6513         com->lc_data = NULL;
6514
6515         ltds = &lfsck->li_ost_descs;
6516         spin_lock(&ltds->ltd_lock);
6517         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6518                                  ltd_layout_phase_list) {
6519                 list_del_init(&ltd->ltd_layout_phase_list);
6520         }
6521         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6522                                  ltd_layout_phase_list) {
6523                 list_del_init(&ltd->ltd_layout_phase_list);
6524         }
6525         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6526                                  ltd_layout_list) {
6527                 list_del_init(&ltd->ltd_layout_list);
6528         }
6529         spin_unlock(&ltds->ltd_lock);
6530
6531         ltds = &lfsck->li_mdt_descs;
6532         spin_lock(&ltds->ltd_lock);
6533         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6534                                  ltd_layout_phase_list) {
6535                 list_del_init(&ltd->ltd_layout_phase_list);
6536         }
6537         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6538                                  ltd_layout_phase_list) {
6539                 list_del_init(&ltd->ltd_layout_phase_list);
6540         }
6541         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6542                                  ltd_layout_list) {
6543                 list_del_init(&ltd->ltd_layout_list);
6544         }
6545         spin_unlock(&ltds->ltd_lock);
6546
6547         bitmap_free(lad->lad_bitmap);
6548
6549         OBD_FREE_PTR(lad);
6550 }
6551
6552 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6553                                             struct lfsck_component *com)
6554 {
6555         struct lfsck_layout_slave_data *llsd = com->lc_data;
6556
6557         lfsck_layout_slave_quit(env, com);
6558         com->lc_data = NULL;
6559         OBD_FREE_PTR(llsd);
6560 }
6561
6562 static void lfsck_layout_master_quit(const struct lu_env *env,
6563                                      struct lfsck_component *com)
6564 {
6565         struct lfsck_assistant_data     *lad    = com->lc_data;
6566         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6567         struct lfsck_tgt_descs          *ltds;
6568         struct lfsck_tgt_desc           *ltd;
6569         struct lfsck_tgt_desc           *next;
6570
6571         LASSERT(lad != NULL);
6572
6573         lfsck_quit_generic(env, com);
6574
6575         LASSERT(thread_is_init(&lad->lad_thread) ||
6576                 thread_is_stopped(&lad->lad_thread));
6577         LASSERT(list_empty(&lad->lad_req_list));
6578
6579         ltds = &lfsck->li_ost_descs;
6580         spin_lock(&ltds->ltd_lock);
6581         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6582                                  ltd_layout_phase_list) {
6583                 list_del_init(&ltd->ltd_layout_phase_list);
6584         }
6585         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6586                                  ltd_layout_phase_list) {
6587                 list_del_init(&ltd->ltd_layout_phase_list);
6588         }
6589         spin_unlock(&ltds->ltd_lock);
6590
6591         ltds = &lfsck->li_mdt_descs;
6592         spin_lock(&ltds->ltd_lock);
6593         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6594                                  ltd_layout_phase_list) {
6595                 list_del_init(&ltd->ltd_layout_phase_list);
6596         }
6597         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6598                                  ltd_layout_phase_list) {
6599                 list_del_init(&ltd->ltd_layout_phase_list);
6600         }
6601         spin_unlock(&ltds->ltd_lock);
6602 }
6603
6604 static void lfsck_layout_slave_quit(const struct lu_env *env,
6605                                     struct lfsck_component *com)
6606 {
6607         struct lfsck_layout_slave_data *llsd = com->lc_data;
6608         struct lfsck_layout_seq *lls;
6609         struct lfsck_layout_seq *next;
6610         struct lfsck_layout_slave_target *llst;
6611
6612         LASSERT(llsd != NULL);
6613
6614         down_write(&com->lc_sem);
6615         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6616                                  lls_list) {
6617                 list_del_init(&lls->lls_list);
6618                 lfsck_object_put(env, lls->lls_lastid_obj);
6619                 OBD_FREE_PTR(lls);
6620         }
6621         up_write(&com->lc_sem);
6622
6623         spin_lock(&llsd->llsd_lock);
6624         while (!list_empty(&llsd->llsd_master_list)) {
6625                 llst = list_first_entry(&llsd->llsd_master_list,
6626                                         struct lfsck_layout_slave_target,
6627                                         llst_list);
6628                 list_del_init(&llst->llst_list);
6629                 spin_unlock(&llsd->llsd_lock);
6630                 kref_put(&llst->llst_ref, lfsck_layout_llst_put);
6631                 spin_lock(&llsd->llsd_lock);
6632         }
6633         spin_unlock(&llsd->llsd_lock);
6634
6635         lfsck_rbtree_cleanup(env, com);
6636 }
6637
6638 static int lfsck_layout_master_in_notify(const struct lu_env *env,
6639                                          struct lfsck_component *com,
6640                                          struct lfsck_request *lr)
6641 {
6642         struct lfsck_instance *lfsck = com->lc_lfsck;
6643         struct lfsck_layout *lo = com->lc_file_ram;
6644         struct lfsck_assistant_data *lad = com->lc_data;
6645         struct lfsck_tgt_descs *ltds;
6646         struct lfsck_tgt_desc *ltd;
6647         bool fail = false;
6648
6649         ENTRY;
6650         if (lr->lr_event == LE_PAIRS_VERIFY) {
6651                 int rc;
6652
6653                 rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
6654                                                      &lr->lr_fid2,
6655                                                      lr->lr_comp_id);
6656
6657                 RETURN(rc);
6658         }
6659
6660         CDEBUG(D_LFSCK,
6661                "%s: layout LFSCK master handles notify %u from %s %x, status %d, flags %x, flags2 %x\n",
6662                lfsck_lfsck2name(lfsck), lr->lr_event,
6663                (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
6664                lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
6665
6666         if (lr->lr_event != LE_PHASE1_DONE &&
6667             lr->lr_event != LE_PHASE2_DONE &&
6668             lr->lr_event != LE_PEER_EXIT)
6669                 RETURN(-EINVAL);
6670
6671         if (lr->lr_flags & LEF_FROM_OST)
6672                 ltds = &lfsck->li_ost_descs;
6673         else
6674                 ltds = &lfsck->li_mdt_descs;
6675         spin_lock(&ltds->ltd_lock);
6676         ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
6677         if (ltd == NULL) {
6678                 spin_unlock(&ltds->ltd_lock);
6679
6680                 RETURN(-ENXIO);
6681         }
6682
6683         list_del_init(&ltd->ltd_layout_phase_list);
6684         switch (lr->lr_event) {
6685         case LE_PHASE1_DONE:
6686                 if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
6687                         if (lr->lr_flags2 & LF_INCOMPLETE) {
6688                                 if (lr->lr_flags & LEF_FROM_OST)
6689                                         lfsck_lad_set_bitmap(env, com,
6690                                                              ltd->ltd_index);
6691                                 else
6692                                         lo->ll_flags |= LF_INCOMPLETE;
6693                         }
6694                         ltd->ltd_layout_done = 1;
6695                         list_del_init(&ltd->ltd_layout_list);
6696                         fail = true;
6697                         break;
6698                 }
6699
6700                 if (lr->lr_flags & LEF_FROM_OST) {
6701                         if (list_empty(&ltd->ltd_layout_list))
6702                                 list_add_tail(&ltd->ltd_layout_list,
6703                                               &lad->lad_ost_list);
6704                         list_add_tail(&ltd->ltd_layout_phase_list,
6705                                       &lad->lad_ost_phase2_list);
6706                 } else {
6707                         if (list_empty(&ltd->ltd_layout_list))
6708                                 list_add_tail(&ltd->ltd_layout_list,
6709                                               &lad->lad_mdt_list);
6710                         list_add_tail(&ltd->ltd_layout_phase_list,
6711                                       &lad->lad_mdt_phase2_list);
6712                 }
6713                 break;
6714         case LE_PHASE2_DONE:
6715                 ltd->ltd_layout_done = 1;
6716                 if (!list_empty(&ltd->ltd_layout_list))
6717                         list_del_init(&ltd->ltd_layout_list);
6718
6719                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6720                         lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
6721                         fail = true;
6722                 }
6723
6724                 break;
6725         case LE_PEER_EXIT:
6726                 fail = true;
6727                 ltd->ltd_layout_done = 1;
6728                 list_del_init(&ltd->ltd_layout_list);
6729                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
6730                     !(lr->lr_flags & LEF_FROM_OST))
6731                         lo->ll_flags |= LF_INCOMPLETE;
6732                 break;
6733         default:
6734                 break;
6735         }
6736         spin_unlock(&ltds->ltd_lock);
6737
6738         if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
6739                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6740
6741                 memset(stop, 0, sizeof(*stop));
6742                 stop->ls_status = lr->lr_status;
6743                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6744                 lfsck_stop(env, lfsck->li_bottom, stop);
6745         } else if (lfsck_phase2_next_ready(lad)) {
6746                 wake_up(&lad->lad_thread.t_ctl_waitq);
6747         }
6748
6749         RETURN(0);
6750 }
6751
6752 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6753                                               struct lfsck_component *com,
6754                                               struct lfsck_req_local *lrl,
6755                                               struct thandle *th)
6756 {
6757         ENTRY;
6758
6759         switch (lrl->lrl_event) {
6760         case LEL_FID_ACCESSED:
6761                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6762                 RETURN(0);
6763         case LEL_PAIRS_VERIFY_LOCAL: {
6764                 int rc;
6765
6766                 lrl->lrl_status = LPVS_INIT;
6767                 /* Firstly, if the MDT-object which is claimed via OST-object
6768                  * local stored PFID xattr recognizes the OST-object, then it
6769                  * must be that the client given PFID is wrong.
6770                  */
6771                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6772                                 &lrl->lrl_ff_local.ff_parent,
6773                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6774                 if (rc <= 0)
6775                         RETURN(0);
6776
6777                 lrl->lrl_status = LPVS_INCONSISTENT;
6778                 /* The OST-object local stored PFID xattr is stale. We need to
6779                  * check whether the MDT-object that is claimed via the client
6780                  * given PFID information recognizes the OST-object or not. If
6781                  * matches, then need to update the OST-object's PFID xattr.
6782                  */
6783                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6784                                 &lrl->lrl_ff_client.ff_parent,
6785                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6786                 /* For rc < 0 case:
6787                  * We are not sure whether the client given PFID information
6788                  * is correct or not, do nothing to avoid improper fixing.
6789                  *
6790                  * For rc > 0 case:
6791                  * The client given PFID information is also invalid, we can
6792                  * NOT fix the OST-object inconsistency.
6793                  */
6794                 if (!rc) {
6795                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6796                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6797                 }
6798
6799                 RETURN(rc);
6800         }
6801         default:
6802                 break;
6803         }
6804
6805         RETURN(-EOPNOTSUPP);
6806 }
6807
/**
 * Handle a notification that arrived in an lfsck_request on the layout
 * LFSCK slave.
 *
 * LE_CONDITIONAL_DESTROY is handed to
 * lfsck_layout_slave_conditional_destroy(). LE_PHASE1_DONE only acts when
 * the sender flagged LF_INCOMPLETE. LE_PHASE2_DONE and LE_PEER_EXIT drop
 * the entry for lr->lr_index looked up via lfsck_layout_llst_find_and_del()
 * and may stop the local LFSCK (see the condition at the end).
 *
 * \retval              0 for the PHASE1_DONE/PHASE2_DONE/PEER_EXIT events
 * \retval              -EINVAL for an event not listed above
 * \retval              the result of the conditional destroy otherwise
 */
6808 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6809                                         struct lfsck_component *com,
6810                                         struct lfsck_request *lr)
6811 {
6812         struct lfsck_instance *lfsck = com->lc_lfsck;
6813         struct lfsck_layout_slave_data *llsd = com->lc_data;
6814         struct lfsck_layout_slave_target *llst;
6815         int rc;
6816
6817         ENTRY;
6818         switch (lr->lr_event) {
6819         case LE_CONDITIONAL_DESTROY:
6820                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6821                 RETURN(rc);
6822         case LE_PHASE1_DONE: {
             /* The sender reports an incomplete scan: inherit the flag,
              * release the entry of that sender and wake the LFSCK thread.
              */
6823                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6824                         struct lfsck_layout *lo = com->lc_file_ram;
6825
6826                         lo->ll_flags |= LF_INCOMPLETE;
6827                         llst = lfsck_layout_llst_find_and_del(llsd,
6828                                                               lr->lr_index,
6829                                                               true);
6830                         if (llst != NULL) {
6831                                 kref_put(&llst->llst_ref,
6832                                          lfsck_layout_llst_put);
6833                                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6834                         }
6835                 }
6836
6837                 RETURN(0);
6838         }
6839         case LE_PHASE2_DONE:
6840         case LE_PEER_EXIT:
6841                 CDEBUG(D_LFSCK,
6842                        "%s: layout LFSCK slave handle notify %u from MDT %x, status %d\n",
6843                        lfsck_lfsck2name(lfsck), lr->lr_event, lr->lr_index,
6844                        lr->lr_status);
6845                 break;
6846         default:
6847                 RETURN(-EINVAL);
6848         }
6849
             /* Only LE_PHASE2_DONE and LE_PEER_EXIT reach this point. */
6850         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6851         if (llst == NULL)
6852                 RETURN(0);
6853
6854         kref_put(&llst->llst_ref, lfsck_layout_llst_put);
             /* Wake the LFSCK thread once the master list has run empty. */
6855         if (list_empty(&llsd->llsd_master_list))
6856                 wake_up(&lfsck->li_thread.t_ctl_waitq);
6857
             /* A peer exit stops the local LFSCK when LPF_FAILOUT is set, or
              * when the master list is empty and the peer reported
              * LS_STOPPED or LS_CO_STOPPED.
              */
6858         if (lr->lr_event == LE_PEER_EXIT &&
6859             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6860              (list_empty(&llsd->llsd_master_list) &&
6861               (lr->lr_status == LS_STOPPED ||
6862                lr->lr_status == LS_CO_STOPPED)))) {
6863                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6864
6865                 memset(stop, 0, sizeof(*stop));
6866                 stop->ls_status = lr->lr_status;
                     /* Pass on the sender's parameters without LPF_BROADCAST. */
6867                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6868                 lfsck_stop(env, lfsck->li_bottom, stop);
6869         }
6870
6871         RETURN(0);
6872 }
6873
6874 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6875 {
6876         int i;
6877
6878         for (i = 0; i < LLIT_MAX; i++)
6879                 *count += lo->ll_objs_repaired[i];
6880 }
6881
6882 static int lfsck_layout_query_all(const struct lu_env *env,
6883                                   struct lfsck_component *com,
6884                                   __u32 *mdts_count, __u32 *osts_count,
6885                                   __u64 *repaired)
6886 {
6887         struct lfsck_layout *lo = com->lc_file_ram;
6888         struct lfsck_tgt_descs *ltds;
6889         struct lfsck_tgt_desc *ltd;
6890         int idx;
6891         int rc;
6892
6893         ENTRY;
6894         rc = lfsck_query_all(env, com);
6895         if (rc != 0)
6896                 RETURN(rc);
6897
6898         ltds = &com->lc_lfsck->li_mdt_descs;
6899         down_read(&ltds->ltd_rw_sem);
6900         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6901                 ltd = lfsck_ltd2tgt(ltds, idx);
6902                 LASSERT(ltd != NULL);
6903
6904                 mdts_count[ltd->ltd_layout_status]++;
6905                 *repaired += ltd->ltd_layout_repaired;
6906         }
6907         up_read(&ltds->ltd_rw_sem);
6908
6909         ltds = &com->lc_lfsck->li_ost_descs;
6910         down_read(&ltds->ltd_rw_sem);
6911         for_each_set_bit(idx, ltds->ltd_tgts_bitmap, ltds->ltd_tgts_mask_len) {
6912                 ltd = lfsck_ltd2tgt(ltds, idx);
6913                 LASSERT(ltd != NULL);
6914
6915                 osts_count[ltd->ltd_layout_status]++;
6916                 *repaired += ltd->ltd_layout_repaired;
6917         }
6918         up_read(&ltds->ltd_rw_sem);
6919
6920         down_read(&com->lc_sem);
6921         mdts_count[lo->ll_status]++;
6922         lfsck_layout_repaired(lo, repaired);
6923         up_read(&com->lc_sem);
6924
6925         RETURN(0);
6926 }
6927
6928 static int lfsck_layout_query(const struct lu_env *env,
6929                               struct lfsck_component *com,
6930                               struct lfsck_request *req,
6931                               struct lfsck_reply *rep,
6932                               struct lfsck_query *que, int idx)
6933 {
6934         struct lfsck_layout *lo = com->lc_file_ram;
6935         int rc = 0;
6936
6937         if (que != NULL) {
6938                 LASSERT(com->lc_lfsck->li_master);
6939
6940                 rc = lfsck_layout_query_all(env, com,
6941                                             que->lu_mdts_count[idx],
6942                                             que->lu_osts_count[idx],
6943                                             &que->lu_repaired[idx]);
6944         } else {
6945                 down_read(&com->lc_sem);
6946                 rep->lr_status = lo->ll_status;
6947                 if (req->lr_flags & LEF_QUERY_ALL)
6948                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6949                 up_read(&com->lc_sem);
6950         }
6951
6952         return rc;
6953 }
6954
6955 /* with lfsck::li_lock held */
/*
 * Let one more starter, identified by lsp->lsp_index, join the layout
 * LFSCK slave for orphan handling. li_lock is dropped and re-taken inside,
 * so it is held again when the function returns.
 *
 * \retval              0 if nothing needs to be done (no start request or
 *                      LPF_OST_ORPHAN not set) or the index was added
 * \retval              -EINVAL if lsp->lsp_index_valid is not set
 * \retval              -EBUSY if the accessing rbtree is not valid
 * \retval              -EAGAIN if the LFSCK thread is no longer running
 * \retval              other non-zero result of lfsck_layout_llst_add()
 */
6956 static int lfsck_layout_slave_join(const struct lu_env *env,
6957                                    struct lfsck_component *com,
6958                                    struct lfsck_start_param *lsp)
6959 {
6960         struct lfsck_instance *lfsck = com->lc_lfsck;
6961         struct lfsck_layout_slave_data *llsd  = com->lc_data;
6962         struct lfsck_layout_slave_target *llst;
6963         struct lfsck_start *start = lsp->lsp_start;
6964         int rc = 0;
6965
6966         ENTRY;
6967         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6968                 RETURN(0);
6969
6970         if (!lsp->lsp_index_valid)
6971                 RETURN(-EINVAL);
6972
6973         /* If someone is running the LFSCK without orphan handling,
6974          * it will not maintain the object accessing rbtree. So we
6975          * cannot join it for orphan handling.
6976          */
6977         if (!llsd->llsd_rbtree_valid)
6978                 RETURN(-EBUSY);
6979
             /* li_lock is released around lfsck_layout_llst_add(). */
6980         spin_unlock(&lfsck->li_lock);
6981         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6982         spin_lock(&lfsck->li_lock);
             /* The thread may have stopped while the lock was dropped; if so
              * undo the add (again without li_lock) and ask for a retry.
              */
6983         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6984                 spin_unlock(&lfsck->li_lock);
6985                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6986                                                       true);
6987                 if (llst != NULL)
6988                         kref_put(&llst->llst_ref, lfsck_layout_llst_put);
6989                 spin_lock(&lfsck->li_lock);
6990                 rc = -EAGAIN;
6991         }
6992
6993         RETURN(rc);
6994 }
6995
/*
 * Component operations installed by lfsck_layout_setup() when the instance
 * is a master (lfsck->li_master set). Unlike the slave table it provides
 * neither .lfsck_in_notify_local nor .lfsck_join.
 */
6996 static const struct lfsck_operations lfsck_layout_master_ops = {
6997         .lfsck_reset            = lfsck_layout_reset,
6998         .lfsck_fail             = lfsck_layout_fail,
6999         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
7000         .lfsck_prep             = lfsck_layout_master_prep,
7001         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
7002         .lfsck_exec_dir         = lfsck_layout_exec_dir,
7003         .lfsck_post             = lfsck_layout_master_post,
7004         .lfsck_dump             = lfsck_layout_dump,
7005         .lfsck_double_scan      = lfsck_layout_master_double_scan,
7006         .lfsck_data_release     = lfsck_layout_master_data_release,
7007         .lfsck_quit             = lfsck_layout_master_quit,
7008         .lfsck_in_notify        = lfsck_layout_master_in_notify,
7009         .lfsck_query            = lfsck_layout_query,
7010 };
7011
/*
 * Component operations installed by lfsck_layout_setup() when the instance
 * is not a master. reset, fail, exec_dir, dump and query are shared with
 * the master table; the remaining handlers are slave specific.
 */
7012 static const struct lfsck_operations lfsck_layout_slave_ops = {
7013         .lfsck_reset            = lfsck_layout_reset,
7014         .lfsck_fail             = lfsck_layout_fail,
7015         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
7016         .lfsck_prep             = lfsck_layout_slave_prep,
7017         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
7018         .lfsck_exec_dir         = lfsck_layout_exec_dir,
7019         .lfsck_post             = lfsck_layout_slave_post,
7020         .lfsck_dump             = lfsck_layout_dump,
7021         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
7022         .lfsck_data_release     = lfsck_layout_slave_data_release,
7023         .lfsck_quit             = lfsck_layout_slave_quit,
7024         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
7025         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
7026         .lfsck_query            = lfsck_layout_query,
7027         .lfsck_join             = lfsck_layout_slave_join,
7028 };
7029
7030 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
7031                                             struct lfsck_component *com,
7032                                             struct lfsck_position *pos)
7033 {
7034         struct lfsck_assistant_data *lad = com->lc_data;
7035         struct lfsck_layout_req *llr;
7036
7037         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
7038             LS_SCANNING_PHASE1)
7039                 return;
7040
7041         if (list_empty(&lad->lad_req_list))
7042                 return;
7043
7044         llr = list_first_entry(&lad->lad_req_list,
7045                                struct lfsck_layout_req,
7046                                llr_lar.lar_list);
7047         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
7048 }
7049
/*
 * Assistant callbacks for the layout LFSCK. lfsck_layout_setup() passes
 * this table to lfsck_assistant_data_init() on a master instance. The
 * table has external linkage (not static).
 */
7050 const struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
7051         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
7052         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
7053         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
7054         .la_double_scan_result  = lfsck_layout_double_scan_result,
7055         .la_req_fini            = lfsck_layout_assistant_req_fini,
7056         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
7057 };
7058
7059 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
7060 {
7061         struct lfsck_component *com;
7062         struct lfsck_layout *lo;
7063         struct dt_object *root = NULL;
7064         struct dt_object *obj;
7065         int i;
7066         int rc;
7067
7068         ENTRY;
7069         OBD_ALLOC_PTR(com);
7070         if (com == NULL)
7071                 RETURN(-ENOMEM);
7072
7073         INIT_LIST_HEAD(&com->lc_link);
7074         INIT_LIST_HEAD(&com->lc_link_dir);
7075         init_rwsem(&com->lc_sem);
7076         atomic_set(&com->lc_ref, 1);
7077         com->lc_lfsck = lfsck;
7078         com->lc_type = LFSCK_TYPE_LAYOUT;
7079         if (lfsck->li_master) {
7080                 com->lc_ops = &lfsck_layout_master_ops;
7081                 com->lc_data = lfsck_assistant_data_init(
7082                                 &lfsck_layout_assistant_ops,
7083                                 LFSCK_LAYOUT);
7084                 if (com->lc_data == NULL)
7085                         GOTO(out, rc = -ENOMEM);
7086
7087                 for (i = 0; i < LFSCK_STF_COUNT; i++)
7088                         mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
7089         } else {
7090                 struct lfsck_layout_slave_data *llsd;
7091
7092                 com->lc_ops = &lfsck_layout_slave_ops;
7093                 OBD_ALLOC_PTR(llsd);
7094                 if (llsd == NULL)
7095                         GOTO(out, rc = -ENOMEM);
7096
7097                 INIT_LIST_HEAD(&llsd->llsd_seq_list);
7098                 INIT_LIST_HEAD(&llsd->llsd_master_list);
7099                 spin_lock_init(&llsd->llsd_lock);
7100                 llsd->llsd_rb_root = RB_ROOT;
7101                 init_rwsem(&llsd->llsd_rb_rwsem);
7102                 com->lc_data = llsd;
7103         }
7104         com->lc_file_size = sizeof(*lo);
7105         OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
7106         if (com->lc_file_ram == NULL)
7107                 GOTO(out, rc = -ENOMEM);
7108
7109         OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
7110         if (com->lc_file_disk == NULL)
7111                 GOTO(out, rc = -ENOMEM);
7112
7113         root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
7114         if (IS_ERR(root))
7115                 GOTO(out, rc = PTR_ERR(root));
7116
7117         if (unlikely(!dt_try_as_dir(env, root, true)))
7118                 GOTO(out, rc = -ENOTDIR);
7119
7120         obj = local_file_find_or_create(env, lfsck->li_los, root,
7121                                         LFSCK_LAYOUT,
7122                                         S_IFREG | S_IRUGO | S_IWUSR);
7123         if (IS_ERR(obj))
7124                 GOTO(out, rc = PTR_ERR(obj));
7125
7126         com->lc_obj = obj;
7127         rc = lfsck_layout_load(env, com);
7128         if (rc > 0) {
7129                 rc = lfsck_layout_reset(env, com, true);
7130         } else if (rc == -ENOENT) {
7131                 rc = lfsck_layout_init(env, com);
7132         } else if (lfsck->li_master) {
7133                 rc = lfsck_load_sub_trace_files(env, com,
7134                                 &dt_lfsck_layout_dangling_features,
7135                                 LFSCK_LAYOUT, false);
7136                 if (rc)
7137                         rc = lfsck_layout_reset(env, com, true);
7138         }
7139
7140         if (rc != 0)
7141                 GOTO(out, rc);
7142
7143         lo = com->lc_file_ram;
7144         switch (lo->ll_status) {
7145         case LS_INIT:
7146         case LS_COMPLETED:
7147         case LS_FAILED:
7148         case LS_STOPPED:
7149         case LS_PARTIAL:
7150                 spin_lock(&lfsck->li_lock);
7151                 list_add_tail(&com->lc_link, &lfsck->li_list_idle);
7152                 spin_unlock(&lfsck->li_lock);
7153                 break;
7154         default:
7155                 CERROR("%s: unknown lfsck_layout status %d\n",
7156                        lfsck_lfsck2name(lfsck), lo->ll_status);
7157                 fallthrough;
7158         case LS_SCANNING_PHASE1:
7159         case LS_SCANNING_PHASE2:
7160                 /* No need to store the status to disk right now.
7161                  * If the system crashed before the status stored,
7162                  * it will be loaded back when next time.
7163                  */
7164                 lo->ll_status = LS_CRASHED;
7165                 if (!lfsck->li_master)
7166                         lo->ll_flags |= LF_INCOMPLETE;
7167                 fallthrough;
7168         case LS_PAUSED:
7169         case LS_CRASHED:
7170         case LS_CO_FAILED:
7171         case LS_CO_STOPPED:
7172         case LS_CO_PAUSED:
7173                 spin_lock(&lfsck->li_lock);
7174                 list_add_tail(&com->lc_link, &lfsck->li_list_scan);
7175                 spin_unlock(&lfsck->li_lock);
7176                 break;
7177         }
7178
7179         if (lo->ll_flags & LF_CRASHED_LASTID) {
7180                 LASSERT(lfsck->li_out_notify != NULL);
7181
7182                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
7183                                      LE_LASTID_REBUILDING);
7184         }
7185
7186         GOTO(out, rc = 0);
7187
7188 out:
7189         if (root != NULL && !IS_ERR(root))
7190                 lfsck_object_put(env, root);
7191
7192         if (rc != 0) {
7193                 lfsck_component_cleanup(env, com);
7194                 CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
7195                        lfsck_lfsck2name(lfsck), rc);
7196         }
7197
7198         return rc;
7199 }
7200
/* Private state of one iteration over the orphan index. */
7201 struct lfsck_orphan_it {
          /* component reference, owned once lfsck_orphan_it_init() succeeds */
7202         struct lfsck_component           *loi_com;
          /* rbtree node the iteration currently stands on, NULL to search */
7203         struct lfsck_rbtree_node         *loi_lrn;
          /* slave target looked up from the attr given at init time */
7204         struct lfsck_layout_slave_target *loi_llst;
          /* current key; saved into llst_fid when the iteration is finished */
7205         struct lu_fid                     loi_key;
7206         struct lu_orphan_rec_v3           loi_rec;
          /* saved into llst_hash when the iteration is finished */
7207         __u64                             loi_hash;
          /* set once lfsck_orphan_it_next() has run out of entries */
7208         unsigned int                      loi_over:1;
7209 };
7210
7211 static int lfsck_fid_match_idx(const struct lu_env *env,
7212                                struct lfsck_instance *lfsck,
7213                                const struct lu_fid *fid, int idx)
7214 {
7215         struct seq_server_site *ss;
7216         struct lu_server_fld *sf;
7217         struct lu_seq_range *range = &lfsck_env_info(env)->lti_range;
7218         int rc;
7219
7220         /* All abnormal cases will be returned to MDT0. */
7221         if (!fid_is_norm(fid)) {
7222                 if (idx == 0)
7223                         return 1;
7224
7225                 return 0;
7226         }
7227
7228         ss = lfsck_dev_site(lfsck);
7229         if (unlikely(ss == NULL))
7230                 return -ENOTCONN;
7231
7232         sf = ss->ss_server_fld;
7233         LASSERT(sf != NULL);
7234
7235         fld_range_set_any(range);
7236         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
7237         if (rc != 0)
7238                 return rc;
7239
7240         if (!fld_range_is_mdt(range))
7241                 return -EINVAL;
7242
7243         if (range->lsr_index == idx)
7244                 return 1;
7245
7246         return 0;
7247 }
7248
7249 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
7250                                         struct lfsck_instance *lfsck,
7251                                         struct dt_object *obj)
7252 {
7253         struct dt_device *dev = lfsck_obj2dev(obj);
7254         struct thandle *handle;
7255         int rc;
7256
7257         ENTRY;
7258         if (lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN)
7259                 GOTO(log, rc = 0);
7260
7261         handle = lfsck_trans_create(env, dev, lfsck);
7262         if (IS_ERR(handle))
7263                 RETURN_EXIT;
7264
7265         rc = dt_declare_ref_del(env, obj, handle);
7266         if (rc != 0)
7267                 GOTO(stop, rc);
7268
7269         rc = dt_declare_destroy(env, obj, handle);
7270         if (rc != 0)
7271                 GOTO(stop, rc);
7272
7273         rc = dt_trans_start_local(env, dev, handle);
7274         if (rc != 0)
7275                 GOTO(stop, rc);
7276
7277         dt_write_lock(env, obj, 0);
7278         rc = dt_ref_del(env, obj, handle);
7279         if (rc == 0)
7280                 rc = dt_destroy(env, obj, handle);
7281         dt_write_unlock(env, obj);
7282
7283         GOTO(stop, rc);
7284
7285 stop:
7286         dt_trans_stop(env, dev, handle);
7287
7288 log:
7289         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
7290                PFID(lfsck_dto2fid(obj)), rc);
7291
7292         RETURN_EXIT;
7293 }
7294
/* Lookup is not implemented for the orphan index: always -EOPNOTSUPP. */
7295 static int lfsck_orphan_index_lookup(const struct lu_env *env,
7296                                      struct dt_object *dt,
7297                                      struct dt_rec *rec,
7298                                      const struct dt_key *key)
7299 {
7300         return -EOPNOTSUPP;
7301 }
7302
/* Declaring an insert is not implemented for the orphan index: always
 * -EOPNOTSUPP.
 */
7303 static int lfsck_orphan_index_declare_insert(const struct lu_env *env,
7304                                              struct dt_object *dt,
7305                                              const struct dt_rec *rec,
7306                                              const struct dt_key *key,
7307                                              struct thandle *handle)
7308 {
7309         return -EOPNOTSUPP;
7310 }
7311
/* Insert is not implemented for the orphan index: always -EOPNOTSUPP. */
7312 static int lfsck_orphan_index_insert(const struct lu_env *env,
7313                                      struct dt_object *dt,
7314                                      const struct dt_rec *rec,
7315                                      const struct dt_key *key,
7316                                      struct thandle *handle)
7317 {
7318         return -EOPNOTSUPP;
7319 }
7320
/* Declaring a delete is not implemented for the orphan index: always
 * -EOPNOTSUPP.
 */
7321 static int lfsck_orphan_index_declare_delete(const struct lu_env *env,
7322                                              struct dt_object *dt,
7323                                              const struct dt_key *key,
7324                                              struct thandle *handle)
7325 {
7326         return -EOPNOTSUPP;
7327 }
7328
/* Delete is not implemented for the orphan index: always -EOPNOTSUPP. */
7329 static int lfsck_orphan_index_delete(const struct lu_env *env,
7330                                      struct dt_object *dt,
7331                                      const struct dt_key *key,
7332                                      struct thandle *handle)
7333 {
7334         return -EOPNOTSUPP;
7335 }
7336
/**
 * Start an iteration over the orphan index.
 *
 * \a attr is used as the index passed to lfsck_layout_llst_find_and_del()
 * to find the slave target the iteration is run for. On success the
 * returned iterator owns the component reference and the rbtree read lock
 * (llsd_rb_rwsem) is held; both are released by lfsck_orphan_it_fini().
 *
 * \retval              the iterator on success
 * \retval              ERR_PTR(-ENXIO) if no lfsck instance or no matching
 *                      slave target is found
 * \retval              ERR_PTR(-ENOENT) if there is no layout component
 * \retval              ERR_PTR(-ESRCH) if the layout LFSCK is flagged
 *                      LF_INCOMPLETE or the rbtree is not valid
 * \retval              ERR_PTR(-ENOMEM) if the iterator cannot be allocated
 */
7337 static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
7338                                           struct dt_object *dt,
7339                                           __u32 attr)
7340 {
7341         struct dt_device *dev = lu2dt_dev(dt->do_lu.lo_dev);
7342         struct lfsck_instance *lfsck;
7343         struct lfsck_component *com = NULL;
7344         struct lfsck_layout_slave_data *llsd;
7345         struct lfsck_orphan_it *it = NULL;
7346         struct lfsck_layout *lo;
7347         int rc = 0;
7348
7349         ENTRY;
7350         lfsck = lfsck_instance_find(dev, true, false);
7351         if (unlikely(lfsck == NULL))
7352                 RETURN(ERR_PTR(-ENXIO));
7353
7354         com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
7355         if (unlikely(com == NULL))
7356                 GOTO(out, rc = -ENOENT);
7357
7358         lo = com->lc_file_ram;
7359         if (lo->ll_flags & LF_INCOMPLETE)
7360                 GOTO(out, rc = -ESRCH);
7361
7362         llsd = com->lc_data;
7363         if (!llsd->llsd_rbtree_valid)
7364                 GOTO(out, rc = -ESRCH);
7365
7366         OBD_ALLOC_PTR(it);
7367         if (it == NULL)
7368                 GOTO(out, rc = -ENOMEM);
7369
7370         it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
7371         if (it->loi_llst == NULL)
7372                 GOTO(out, rc = -ENXIO);
7373
             /* The flag is tested once without the lock and again under the
              * write lock, so the pruning below is only done while it is
              * still set.
              */
7374         if (dev->dd_record_fid_accessed) {
7375                 /* The first iteration against the rbtree, scan the whole rbtree
7376                  * to remove the nodes which do NOT need to be handled.
7377                  */
7378                 down_write(&llsd->llsd_rb_rwsem);
7379                 if (dev->dd_record_fid_accessed) {
7380                         struct rb_node                  *node;
7381                         struct rb_node                  *next;
7382                         struct lfsck_rbtree_node        *lrn;
7383
7384                         /* No need to record the fid accessing anymore. */
7385                         dev->dd_record_fid_accessed = 0;
7386
7387                         node = rb_first(&llsd->llsd_rb_root);
7388                         while (node != NULL) {
                                     /* Fetch the successor before the node may be erased. */
7389                                 next = rb_next(node);
7390                                 lrn = rb_entry(node, struct lfsck_rbtree_node,
7391                                                lrn_node);
                                     /* Drop nodes whose known count does not exceed
                                      * their accessed count.
                                      */
7392                                 if (atomic_read(&lrn->lrn_known_count) <=
7393                                     atomic_read(&lrn->lrn_accessed_count)) {
7394                                         rb_erase(node, &llsd->llsd_rb_root);
7395                                         lfsck_rbtree_free(lrn);
7396                                 }
7397                                 node = next;
7398                         }
7399                 }
7400                 up_write(&llsd->llsd_rb_rwsem);
7401         }
7402
7403         /* read lock the rbtree when init, and unlock when fini */
7404         down_read(&llsd->llsd_rb_rwsem);
             /* Hand the component reference over to the iterator so that the
              * common exit path below does not drop it.
              */
7405         it->loi_com = com;
7406         com = NULL;
7407
7408         GOTO(out, rc = 0);
7409
7410 out:
7411         if (com != NULL)
7412                 lfsck_component_put(env, com);
7413
7414         CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
7415                lfsck_lfsck2name(lfsck), rc);
7416
7417         lfsck_instance_put(env, lfsck);
7418         if (rc != 0) {
7419                 if (it != NULL)
7420                         OBD_FREE_PTR(it);
7421
7422                 it = (struct lfsck_orphan_it *)ERR_PTR(rc);
7423         }
7424
7425         return (struct dt_it *)it;
7426 }
7427
7428 static void lfsck_orphan_it_fini(const struct lu_env *env,
7429                                  struct dt_it *di)
7430 {
7431         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7432         struct lfsck_component *com = it->loi_com;
7433         struct lfsck_layout_slave_data *llsd;
7434         struct lfsck_layout_slave_target *llst;
7435
7436         if (com != NULL) {
7437                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7438                        lfsck_lfsck2name(com->lc_lfsck));
7439
7440                 llsd = com->lc_data;
7441                 up_read(&llsd->llsd_rb_rwsem);
7442                 llst = it->loi_llst;
7443                 LASSERT(llst != NULL);
7444
7445                 /* Save the key and hash for iterate next. */
7446                 llst->llst_fid = it->loi_key;
7447                 llst->llst_hash = it->loi_hash;
7448                 kref_put(&llst->llst_ref, lfsck_layout_llst_put);
7449                 lfsck_component_put(env, com);
7450         }
7451         OBD_FREE_PTR(it);
7452 }
7453
7454 /**
7455  * \retval       +1: the iteration finished
7456  * \retval        0: on success, not finished
7457  * \retval      -ve: on error
7458  */
7459 static int lfsck_orphan_it_next(const struct lu_env *env,
7460                                 struct dt_it *di)
7461 {
7462         struct lfsck_thread_info *info = lfsck_env_info(env);
7463         struct filter_fid *ff = &info->lti_ff;
7464         struct lu_attr *la = &info->lti_la;
7465         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7466         struct lu_fid *key = &it->loi_key;
7467         struct lu_orphan_rec_v3 *rec = &it->loi_rec;
7468         struct ost_layout *ol = &rec->lor_layout;
7469         struct lfsck_component *com = it->loi_com;
7470         struct lfsck_instance *lfsck = com->lc_lfsck;
7471         struct lfsck_layout_slave_data *llsd = com->lc_data;
7472         struct dt_object *obj;
7473         struct lfsck_rbtree_node *lrn;
7474         int pos;
7475         int rc;
7476         __u32 save;
7477         __u32 idx = it->loi_llst->llst_index;
7478         bool exact = false;
7479
7480         ENTRY;
7481         if (it->loi_over)
7482                 RETURN(1);
7483
7484 again0:
7485         lrn = it->loi_lrn;
7486         if (lrn == NULL) {
7487                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7488                 if (lrn == NULL) {
7489                         it->loi_over = 1;
7490                         RETURN(1);
7491                 }
7492
7493                 it->loi_lrn = lrn;
7494                 if (!exact) {
7495                         key->f_seq = lrn->lrn_seq;
7496                         key->f_oid = lrn->lrn_first_oid;
7497                         key->f_ver = 0;
7498                 }
7499         } else {
7500                 key->f_oid++;
7501                 if (unlikely(key->f_oid == 0)) {
7502                         key->f_seq++;
7503                         it->loi_lrn = NULL;
7504                         goto again0;
7505                 }
7506
7507                 if (key->f_oid >=
7508                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7509                         it->loi_lrn = NULL;
7510                         goto again0;
7511                 }
7512         }
7513
7514         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7515                      atomic_read(&lrn->lrn_accessed_count))) {
7516                 struct rb_node *next = rb_next(&lrn->lrn_node);
7517
7518                 while (next != NULL) {
7519                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7520                                        lrn_node);
7521                         if (atomic_read(&lrn->lrn_known_count) >
7522                             atomic_read(&lrn->lrn_accessed_count))
7523                                 break;
7524                         next = rb_next(next);
7525                 }
7526
7527                 if (next == NULL) {
7528                         it->loi_over = 1;
7529                         RETURN(1);
7530                 }
7531
7532                 it->loi_lrn = lrn;
7533                 key->f_seq = lrn->lrn_seq;
7534                 key->f_oid = lrn->lrn_first_oid;
7535                 key->f_ver = 0;
7536         }
7537
7538         pos = key->f_oid - lrn->lrn_first_oid;
7539
7540 again1:
7541         pos = find_next_bit(lrn->lrn_known_bitmap,
7542                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7543         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7544                 key->f_oid = lrn->lrn_first_oid + pos;
7545                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7546                         key->f_seq++;
7547                         key->f_oid = 0;
7548                 }
7549                 it->loi_lrn = NULL;
7550                 goto again0;
7551         }
7552
7553         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7554                 pos++;
7555                 goto again1;
7556         }
7557
7558         key->f_oid = lrn->lrn_first_oid + pos;
7559         obj = lfsck_object_find_bottom(env, lfsck, key);
7560         if (IS_ERR(obj)) {
7561                 rc = PTR_ERR(obj);
7562                 if (rc == -ENOENT) {
7563                         pos++;
7564                         goto again1;
7565                 }
7566                 RETURN(rc);
7567         }
7568
7569         dt_read_lock(env, obj, 0);
7570         if (dt_object_exists(obj) == 0 ||
7571             lfsck_is_dead_obj(obj)) {
7572                 dt_read_unlock(env, obj);
7573                 lfsck_object_put(env, obj);
7574                 pos++;
7575                 goto again1;
7576         }
7577
7578         rc = dt_attr_get(env, obj, la);
7579         if (rc != 0)
7580                 GOTO(out, rc);
7581
7582         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7583                           XATTR_NAME_FID);
7584         if (rc == -ENODATA) {
7585                 /* For the pre-created OST-object, update the bitmap to avoid
7586                  * others LFSCK (second phase) iteration to touch it again.
7587                  */
7588                 if (la->la_ctime == 0) {
7589                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7590                                 atomic_inc(&lrn->lrn_accessed_count);
7591
7592                         /* For the race between repairing dangling referenced
7593                          * MDT-object and unlink the file, it may left orphan
7594                          * OST-object there. Destroy it now!
7595                          */
7596                         if (unlikely(!(la->la_mode & S_ISUID))) {
7597                                 dt_read_unlock(env, obj);
7598                                 lfsck_layout_destroy_orphan(env, lfsck, obj);
7599                                 lfsck_object_put(env, obj);
7600                                 pos++;
7601                                 goto again1;
7602                         }
7603                 } else if (idx == 0) {
7604                         /* If the orphan OST-object has no parent information,
7605                          * regard it as referenced by the MDT-object on MDT0.
7606                          */
7607                         fid_zero(&rec->lor_rec.lor_fid);
7608                         rec->lor_rec.lor_uid = la->la_uid;
7609                         rec->lor_rec.lor_gid = la->la_gid;
7610                         memset(ol, 0, sizeof(*ol));
7611                         rec->lor_layout_version = 0;
7612                         rec->lor_range = 0;
7613
7614                         GOTO(out, rc = 0);
7615                 }
7616
7617                 dt_read_unlock(env, obj);
7618                 lfsck_object_put(env, obj);
7619                 pos++;
7620                 goto again1;
7621         }
7622
7623         if (rc < sizeof(struct lu_fid))
7624                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7625
7626         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7627         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7628          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7629          * parent MDT-object's layout EA.
7630          */
7631         save = rec->lor_rec.lor_fid.f_stripe_idx;
7632         rec->lor_rec.lor_fid.f_ver = 0;
7633         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7634         /* If the orphan OST-object does not claim the MDT, then next.
7635          *
7636          * If we do not know whether it matches or not, then return it
7637          * to the MDT for further check.
7638          */
7639         if (rc == 0) {
7640                 dt_read_unlock(env, obj);
7641                 lfsck_object_put(env, obj);
7642                 pos++;
7643                 goto again1;
7644         }
7645
7646         rec->lor_rec.lor_fid.f_stripe_idx = save;
7647         rec->lor_rec.lor_uid = la->la_uid;
7648         rec->lor_rec.lor_gid = la->la_gid;
7649         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7650         rec->lor_layout_version =
7651                 le32_to_cpu(ff->ff_layout_version & ~LU_LAYOUT_RESYNC);
7652         rec->lor_range = le32_to_cpu(ff->ff_range);
7653
7654         CDEBUG(D_LFSCK,
7655                "%s: return orphan "DFID", PFID "DFID", owner %u:%u, stripe size %u, stripe count %u, COMP id %u, COMP start %llu, COMP end %llu, layout version %u, range %u\n",
7656                lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7657                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7658                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7659                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
7660                rec->lor_layout_version, rec->lor_range);
7661
7662         GOTO(out, rc = 0);
7663
7664 out:
7665         dt_read_unlock(env, obj);
7666         lfsck_object_put(env, obj);
7667         if (rc == 0)
7668                 it->loi_hash++;
7669
7670         return rc;
7671 }
7672
7673 /**
7674  * \retval       +1: locate to the exactly position
7675  * \retval        0: cannot locate to the exactly position,
7676  *                   call next() to move to a valid position.
7677  * \retval      -ve: on error
7678  */
7679 static int lfsck_orphan_it_get(const struct lu_env *env,
7680                                struct dt_it *di,
7681                                const struct dt_key *key)
7682 {
7683         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7684         int rc;
7685
7686         it->loi_key = *(struct lu_fid *)key;
7687         rc = lfsck_orphan_it_next(env, di);
7688         if (rc == 1)
7689                 return 0;
7690
7691         if (rc == 0)
7692                 return 1;
7693
7694         return rc;
7695 }
7696
/**
 * Iterator put() method for the orphan index.
 *
 * Intentionally empty: this implementation performs no action.
 *
 * \param[in] env	execution environment (unused)
 * \param[in] di	the iterator (unused)
 */
static void lfsck_orphan_it_put(const struct lu_env *env, struct dt_it *di)
{
	/* nothing to do */
}
7701
7702 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7703                                           const struct dt_it *di)
7704 {
7705         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7706
7707         return (struct dt_key *)&it->loi_key;
7708 }
7709
7710 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7711                                     const struct dt_it *di)
7712 {
7713         return sizeof(struct lu_fid);
7714 }
7715
7716 static int lfsck_orphan_it_rec(const struct lu_env *env,
7717                                const struct dt_it *di,
7718                                struct dt_rec *rec,
7719                                __u32 attr)
7720 {
7721         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7722
7723         *(struct lu_orphan_rec_v3 *)rec = it->loi_rec;
7724
7725         return 0;
7726 }
7727
7728 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7729                                    const struct dt_it *di)
7730 {
7731         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7732
7733         return it->loi_hash;
7734 }
7735
7736 /**
7737  * \retval       +1: locate to the exactly position
7738  * \retval        0: cannot locate to the exactly position,
7739  *                   call next() to move to a valid position.
7740  * \retval      -ve: on error
7741  */
7742 static int lfsck_orphan_it_load(const struct lu_env *env,
7743                                 const struct dt_it *di,
7744                                 __u64 hash)
7745 {
7746         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7747         struct lfsck_layout_slave_target *llst = it->loi_llst;
7748         int rc;
7749
7750         LASSERT(llst != NULL);
7751
7752         if (hash != llst->llst_hash) {
7753                 CDEBUG(D_LFSCK,
7754                        "%s: the given hash %llu for orphan iteration does not match the one when fini %llu, to be reset.\n",
7755                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7756                        llst->llst_hash);
7757                 fid_zero(&llst->llst_fid);
7758                 llst->llst_hash = 0;
7759         }
7760
7761         it->loi_key = llst->llst_fid;
7762         it->loi_hash = llst->llst_hash;
7763         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7764         if (rc == 1)
7765                 return 0;
7766
7767         if (rc == 0)
7768                 return 1;
7769
7770         return rc;
7771 }
7772
/**
 * Iterator key_rec() method for the orphan index.
 *
 * This implementation does not touch \a key_rec and reports success.
 *
 * \param[in] env	execution environment (unused)
 * \param[in] di	the iterator (unused)
 * \param[in] key_rec	caller buffer (unused, left unmodified)
 *
 * \retval	0 always
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
				   const struct dt_it *di, void *key_rec)
{
	const int ret = 0;

	return ret;
}
7779
7780 static const struct dt_index_operations lfsck_orphan_index_ops = {
7781         .dio_lookup             = lfsck_orphan_index_lookup,
7782         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7783         .dio_insert             = lfsck_orphan_index_insert,
7784         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7785         .dio_delete             = lfsck_orphan_index_delete,
7786         .dio_it = {
7787                 .init           = lfsck_orphan_it_init,
7788                 .fini           = lfsck_orphan_it_fini,
7789                 .get            = lfsck_orphan_it_get,
7790                 .put            = lfsck_orphan_it_put,
7791                 .next           = lfsck_orphan_it_next,
7792                 .key            = lfsck_orphan_it_key,
7793                 .key_size       = lfsck_orphan_it_key_size,
7794                 .rec            = lfsck_orphan_it_rec,
7795                 .store          = lfsck_orphan_it_store,
7796                 .load           = lfsck_orphan_it_load,
7797                 .key_rec        = lfsck_orphan_it_key_rec,
7798         }
7799 };