Whamcloud - gitweb
031bc261f73c76f297faaae3c7407e7c32a645f9
[fs/lustre-release.git] / lustre / lfsck / lfsck_layout.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2016, Intel Corporation.
24  */
25 /*
26  * lustre/lfsck/lfsck_layout.c
27  *
28  * Author: Fan, Yong <fan.yong@intel.com>
29  */
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34 #define DEBUG_SUBSYSTEM S_LFSCK
35
36 #include <linux/bitops.h>
37 #include <linux/rbtree.h>
38
39 #include <lustre/lustre_idl.h>
40 #include <lu_object.h>
41 #include <dt_object.h>
42 #include <lustre_fid.h>
43 #include <lustre_lib.h>
44 #include <lustre_net.h>
45 #include <lustre/lustre_user.h>
46 #include <md_object.h>
47 #include <obd_class.h>
48
49 #include "lfsck_internal.h"
50
/* Magic values for the successive versions of the lfsck_layout record.
 * NOTE(review): presumably stored in lfsck_layout::ll_magic -- confirm
 * against the load/store code. */
#define LFSCK_LAYOUT_MAGIC_V1           0xB173AE14
#define LFSCK_LAYOUT_MAGIC_V2           0xB1734D76
#define LFSCK_LAYOUT_MAGIC_V3           0xB17371B9

/* The magic used by the current code is the V3 one. */
#define LFSCK_LAYOUT_MAGIC              LFSCK_LAYOUT_MAGIC_V3
56
/* Per-sequence record kept on lfsck_layout_slave_data::llsd_seq_list.
 * NOTE(review): the users of these fields are outside this part of the
 * file; the per-field notes are inferred from the names -- confirm. */
struct lfsck_layout_seq {
        /* link into lfsck_layout_slave_data::llsd_seq_list. */
        struct list_head         lls_list;
        /* presumably the FID sequence this record describes. */
        __u64                    lls_seq;
        /* presumably the LAST_ID value recorded for the sequence. */
        __u64                    lls_lastid;
        /* presumably the largest object ID seen so far -- TODO confirm. */
        __u64                    lls_lastid_known;
        /* presumably the object that stores LAST_ID -- TODO confirm. */
        struct dt_object        *lls_lastid_obj;
        /* presumably set when lls_lastid needs to be written back. */
        unsigned int             lls_dirty:1;
};
65
/* One entry of lfsck_layout_slave_data::llsd_master_list.
 *
 * Entries are created by lfsck_layout_llst_add() with a single reference
 * and are freed by lfsck_layout_llst_put() once llst_ref drops to zero. */
struct lfsck_layout_slave_target {
        /* link into lfsck_layout_slave_data::llsd_master_list. */
        struct list_head        llst_list;
        /* The position for next record in the rbtree for iteration. */
        struct lu_fid           llst_fid;
        /* Dummy hash for iteration against the rbtree. */
        __u64                   llst_hash;
        /* Initialised to 0 by lfsck_layout_llst_add().
         * NOTE(review): presumably compared with llsd_touch_gen -- confirm. */
        __u64                   llst_gen;
        /* Reference count; the entry is freed when it reaches zero. */
        atomic_t                llst_ref;
        /* Index used as the lookup key on llsd_master_list. */
        __u32                   llst_index;
        /* How many times we have failed to get the master status. */
        int                     llst_failures;
};
79
/* Private data of the layout LFSCK component on the slave side
 * (reached through lfsck_component::lc_data). */
struct lfsck_layout_slave_data {
        /* list for lfsck_layout_seq */
        struct list_head         llsd_seq_list;

        /* list for the masters involve layout verification. */
        struct list_head         llsd_master_list;
        /* Taken around every walk or change of llsd_master_list. */
        spinlock_t               llsd_lock;
        /* NOTE(review): not used in this part of the file; presumably a
         * generation counter matched against llst_gen -- confirm. */
        __u64                    llsd_touch_gen;
        /* In-RAM object standing for the rbtree, see lfsck_rbtree_setup(). */
        struct dt_object        *llsd_rb_obj;
        /* Root of the tree of lfsck_rbtree_node. */
        struct rb_root           llsd_rb_root;
        /* Guards llsd_rb_root and llsd_rbtree_valid. */
        rwlock_t                 llsd_rb_lock;
        /* Set by lfsck_rbtree_setup(), cleared by lfsck_rbtree_cleanup();
         * the tree must not be used while this is zero. */
        unsigned int             llsd_rbtree_valid:1;
};
93
/* Arguments bundle for an asynchronous request issued by the slave.
 * NOTE(review): the users are outside this part of the file; presumably
 * this is handed to an RPC interpret callback -- confirm. */
struct lfsck_layout_slave_async_args {
        /* export involved in the request. */
        struct obd_export                *llsaa_exp;
        /* LFSCK component involved in the request. */
        struct lfsck_component           *llsaa_com;
        /* slave target entry involved in the request. */
        struct lfsck_layout_slave_target *llsaa_llst;
};
99
100 static inline bool lfsck_comp_extent_aligned(__u64 size)
101 {
102          return (size & (LOV_MIN_STRIPE_SIZE - 1)) == 0;
103 }
104
105 static inline void
106 lfsck_layout_llst_put(struct lfsck_layout_slave_target *llst)
107 {
108         if (atomic_dec_and_test(&llst->llst_ref)) {
109                 LASSERT(list_empty(&llst->llst_list));
110
111                 OBD_FREE_PTR(llst);
112         }
113 }
114
115 static inline int
116 lfsck_layout_llst_add(struct lfsck_layout_slave_data *llsd, __u32 index)
117 {
118         struct lfsck_layout_slave_target *llst;
119         struct lfsck_layout_slave_target *tmp;
120         int                               rc   = 0;
121
122         OBD_ALLOC_PTR(llst);
123         if (llst == NULL)
124                 return -ENOMEM;
125
126         INIT_LIST_HEAD(&llst->llst_list);
127         llst->llst_gen = 0;
128         llst->llst_index = index;
129         atomic_set(&llst->llst_ref, 1);
130
131         spin_lock(&llsd->llsd_lock);
132         list_for_each_entry(tmp, &llsd->llsd_master_list, llst_list) {
133                 if (tmp->llst_index == index) {
134                         rc = -EALREADY;
135                         break;
136                 }
137         }
138         if (rc == 0)
139                 list_add_tail(&llst->llst_list, &llsd->llsd_master_list);
140         spin_unlock(&llsd->llsd_lock);
141
142         if (rc != 0)
143                 OBD_FREE_PTR(llst);
144
145         return rc;
146 }
147
148 static inline void
149 lfsck_layout_llst_del(struct lfsck_layout_slave_data *llsd,
150                       struct lfsck_layout_slave_target *llst)
151 {
152         bool del = false;
153
154         spin_lock(&llsd->llsd_lock);
155         if (!list_empty(&llst->llst_list)) {
156                 list_del_init(&llst->llst_list);
157                 del = true;
158         }
159         spin_unlock(&llsd->llsd_lock);
160
161         if (del)
162                 lfsck_layout_llst_put(llst);
163 }
164
165 static inline struct lfsck_layout_slave_target *
166 lfsck_layout_llst_find_and_del(struct lfsck_layout_slave_data *llsd,
167                                __u32 index, bool unlink)
168 {
169         struct lfsck_layout_slave_target *llst;
170
171         spin_lock(&llsd->llsd_lock);
172         list_for_each_entry(llst, &llsd->llsd_master_list, llst_list) {
173                 if (llst->llst_index == index) {
174                         if (unlink)
175                                 list_del_init(&llst->llst_list);
176                         else
177                                 atomic_inc(&llst->llst_ref);
178                         spin_unlock(&llsd->llsd_lock);
179
180                         return llst;
181                 }
182         }
183         spin_unlock(&llsd->llsd_lock);
184
185         return NULL;
186 }
187
188 static struct lfsck_layout_req *
189 lfsck_layout_assistant_req_init(struct lfsck_assistant_object *lso,
190                                 struct dt_object *child, __u32 comp_id,
191                                 __u32 ost_idx, __u32 lov_idx)
192 {
193         struct lfsck_layout_req *llr;
194
195         OBD_ALLOC_PTR(llr);
196         if (llr == NULL)
197                 return ERR_PTR(-ENOMEM);
198
199         INIT_LIST_HEAD(&llr->llr_lar.lar_list);
200         llr->llr_lar.lar_parent = lfsck_assistant_object_get(lso);
201         llr->llr_child = child;
202         llr->llr_comp_id = comp_id;
203         llr->llr_ost_idx = ost_idx;
204         llr->llr_lov_idx = lov_idx;
205
206         return llr;
207 }
208
209 static void lfsck_layout_assistant_req_fini(const struct lu_env *env,
210                                             struct lfsck_assistant_req *lar)
211 {
212         struct lfsck_layout_req *llr =
213                         container_of0(lar, struct lfsck_layout_req, llr_lar);
214
215         lfsck_object_put(env, llr->llr_child);
216         lfsck_assistant_object_put(env, lar->lar_parent);
217         OBD_FREE_PTR(llr);
218 }
219
220 static int
221 lfsck_layout_assistant_sync_failures_interpret(const struct lu_env *env,
222                                                struct ptlrpc_request *req,
223                                                void *args, int rc)
224 {
225         if (rc == 0) {
226                 struct lfsck_async_interpret_args *laia = args;
227                 struct lfsck_tgt_desc             *ltd  = laia->laia_ltd;
228
229                 ltd->ltd_synced_failures = 1;
230                 atomic_dec(laia->laia_count);
231         }
232
233         return 0;
234 }
235
/**
 * Notify remote LFSCK instances about former failures.
 *
 * The local LFSCK instance has recorded which OSTs have ever failed to
 * respond to some LFSCK verification requests (maybe because of network
 * issues or trouble on the OST itself). During that gap the OST may have
 * missed the verification of some OST-objects, so it cannot know whether
 * those OST-objects are referenced by MDT-objects or not. In the
 * second-stage scanning such OST-objects would be regarded as orphans, and
 * if an OST-object contains a bad parent FID for back reference, it would
 * misguide the LFSCK into making a wrong fix for the fake orphan.
 *
 * To avoid the above trouble, when the layout LFSCK finishes the first-stage
 * scanning, it scans the bitmap of the OSTs that have ever failed, and
 * notifies them that they have missed some OST-object verification and
 * should skip the handling of orphan OST-objects on all MDTs that are in
 * the layout LFSCK.
 *
 * Nothing is done unless lad_incomplete is set. If any notification cannot
 * be sent, waited for, or is not acknowledged successfully, LF_INCOMPLETE
 * is set in the in-RAM lfsck_layout flags. On return lr->lr_flags2 holds
 * those flags.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the lfsck component
 * \param[in] lr        pointer to the lfsck request
 */
static void lfsck_layout_assistant_sync_failures(const struct lu_env *env,
                                                 struct lfsck_component *com,
                                                 struct lfsck_request *lr)
{
        struct lfsck_async_interpret_args *laia  =
                                &lfsck_env_info(env)->lti_laia2;
        struct lfsck_assistant_data       *lad   = com->lc_data;
        struct lfsck_layout               *lo    = com->lc_file_ram;
        struct lfsck_instance             *lfsck = com->lc_lfsck;
        struct lfsck_tgt_descs            *ltds  = &lfsck->li_ost_descs;
        struct lfsck_tgt_desc             *ltd;
        struct ptlrpc_request_set         *set;
        /* Number of notifications queued and not yet acknowledged. */
        atomic_t                           count;
        __u32                              idx;
        int                                rc    = 0;
        ENTRY;

        if (!lad->lad_incomplete)
                RETURN_EXIT;

        /* If the MDT has ever failed to verify some OST-objects,
         * then sync the failures with them first. */
        lr->lr_flags2 = lo->ll_flags | LF_INCOMPLETE;

        atomic_set(&count, 0);
        memset(laia, 0, sizeof(*laia));
        laia->laia_count = &count;
        set = ptlrpc_prep_set();
        if (set == NULL)
                GOTO(out, rc = -ENOMEM);

        /* Queue one LFSCK_NOTIFY request per bit set in lad_bitmap,
         * stopping at the first target the request cannot be queued for. */
        down_read(&ltds->ltd_rw_sem);
        cfs_foreach_bit(lad->lad_bitmap, idx) {
                ltd = lfsck_ltd2tgt(ltds, idx);
                if (unlikely(!ltd))
                        continue;

                laia->laia_ltd = ltd;
                rc = lfsck_async_request(env, ltd->ltd_exp, lr, set,
                                lfsck_layout_assistant_sync_failures_interpret,
                                laia, LFSCK_NOTIFY);
                if (rc != 0) {
                        CDEBUG(D_LFSCK, "%s: LFSCK assistant fail to "
                               "notify target %x for %s phase1 done: "
                               "rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
                               ltd->ltd_index, lad->lad_name, rc);

                        break;
                }

                atomic_inc(&count);
        }
        up_read(&ltds->ltd_rw_sem);

        if (rc == 0 && atomic_read(&count) > 0)
                rc = ptlrpc_set_wait(set);

        ptlrpc_set_destroy(set);

        /* The interpret callback decrements the counter only for replies
         * with rc == 0, so a non-zero counter here means that at least one
         * target did not acknowledge the notification. */
        if (rc == 0 && atomic_read(&count) > 0)
                rc = -EINVAL;

        GOTO(out, rc);

out:
        if (rc != 0)
                /* If failed to sync failures with the OSTs, then have to
                 * mark the whole LFSCK as LF_INCOMPLETE to skip the whole
                 * subsequent orphan OST-object handling. */
                lo->ll_flags |= LF_INCOMPLETE;

        /* Replace the temporary flags set above with the final ones. */
        lr->lr_flags2 = lo->ll_flags;
}
330
331 static int lfsck_layout_verify_header_v1v3(struct dt_object *obj,
332                                            struct lov_mds_md_v1 *lmm)
333 {
334         __u32 magic;
335         __u32 pattern;
336
337         magic = le32_to_cpu(lmm->lmm_magic);
338         /* If magic crashed, keep it there. Sometime later, during OST-object
339          * orphan handling, if some OST-object(s) back-point to it, it can be
340          * verified and repaired. */
341         if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3) {
342                 int rc;
343
344                 if ((magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC)
345                         rc = -EOPNOTSUPP;
346                 else
347                         rc = -EINVAL;
348
349                 CDEBUG(D_LFSCK, "%s LOV EA magic %u for the file "DFID"\n",
350                        rc == -EINVAL ? "Unknown" : "Unsupported",
351                        magic, PFID(lfsck_dto2fid(obj)));
352
353                 return rc;
354         }
355
356         pattern = le32_to_cpu(lmm->lmm_pattern);
357         /* XXX: currently, we only support LOV_PATTERN_RAID0. */
358         if (lov_pattern(pattern) != LOV_PATTERN_RAID0) {
359                 CDEBUG(D_LFSCK, "Unsupported LOV EA pattern %u for the file "
360                        DFID"\n", pattern, PFID(lfsck_dto2fid(obj)));
361
362                 return -EOPNOTSUPP;
363         }
364
365         return 0;
366 }
367
368 static int lfsck_layout_verify_header(struct dt_object *obj,
369                                       struct lov_mds_md_v1 *lmm)
370 {
371         int rc = 0;
372
373         if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_COMP_V1) {
374                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
375                 int i;
376                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
377
378                 if (unlikely(count == 0)) {
379                         CDEBUG(D_LFSCK, "the PFL file "DFID" contains invalid "
380                                "components count 0\n",
381                                PFID(lfsck_dto2fid(obj)));
382
383                         return -EINVAL;
384                 }
385
386                 for (i = 0; i < count; i++) {
387                         struct lov_comp_md_entry_v1 *lcme =
388                                                 &lcm->lcm_entries[i];
389                         __u64 start = le64_to_cpu(lcme->lcme_extent.e_start);
390                         __u64 end = le64_to_cpu(lcme->lcme_extent.e_end);
391                         __u32 comp_id = le32_to_cpu(lcme->lcme_id);
392
393                         if (unlikely(comp_id == LCME_ID_INVAL ||
394                                      comp_id > LCME_ID_MAX)) {
395                                 CDEBUG(D_LFSCK, "found invalid FPL ID %u "
396                                        "for the file "DFID" at idx %d\n",
397                                        comp_id, PFID(lfsck_dto2fid(obj)), i);
398
399                                 return -EINVAL;
400                         }
401
402                         if (unlikely(start >= end ||
403                                      !lfsck_comp_extent_aligned(start) ||
404                                      (!lfsck_comp_extent_aligned(end) &&
405                                       end != LUSTRE_EOF))) {
406                                 CDEBUG(D_LFSCK, "found invalid FPL extent "
407                                        "range [%llu - %llu) for the file "
408                                        DFID" at idx %d\n",
409                                        start, end, PFID(lfsck_dto2fid(obj)), i);
410
411                                 return -EINVAL;
412                         }
413
414                         rc = lfsck_layout_verify_header_v1v3(obj,
415                                 (struct lov_mds_md_v1 *)((char *)lmm +
416                                 le32_to_cpu(lcme->lcme_offset)));
417                         if (rc)
418                                 return rc;
419                 }
420         } else {
421                 rc = lfsck_layout_verify_header_v1v3(obj, lmm);
422         }
423
424         return rc;
425 }
426
427 static int lfsck_layout_get_lovea(const struct lu_env *env,
428                                   struct dt_object *obj, struct lu_buf *buf)
429 {
430         int rc;
431         int rc1;
432
433 again:
434         rc = dt_xattr_get(env, obj, buf, XATTR_NAME_LOV);
435         if (rc == -ERANGE) {
436                 rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_LOV);
437                 if (rc <= 0)
438                         return rc;
439
440                 lu_buf_realloc(buf, rc);
441                 if (buf->lb_buf == NULL)
442                         return -ENOMEM;
443
444                 goto again;
445         }
446
447         if (rc == -ENODATA)
448                 rc = 0;
449
450         if (rc <= 0)
451                 return rc;
452
453         if (unlikely(buf->lb_buf == NULL)) {
454                 lu_buf_alloc(buf, rc);
455                 if (buf->lb_buf == NULL)
456                         return -ENOMEM;
457
458                 goto again;
459         }
460
461         rc1 = lfsck_layout_verify_header(obj, buf->lb_buf);
462
463         return rc1 ? rc1 : rc;
464 }
465
/* Geometry of the bitmaps attached to every lfsck_rbtree_node:
 * SIZE is the allocation size of one bitmap in bytes, WIDTH the number of
 * bits it holds (SIZE << 3), and MASK selects the bit index within one
 * bitmap from an OID (WIDTH - 1, which relies on WIDTH being a power of
 * two). */
#define LFSCK_RBTREE_BITMAP_SIZE        PAGE_SIZE
#define LFSCK_RBTREE_BITMAP_WIDTH       (LFSCK_RBTREE_BITMAP_SIZE << 3)
#define LFSCK_RBTREE_BITMAP_MASK        (LFSCK_RBTREE_BITMAP_WIDTH - 1)
469
/* One node of the tree rooted at lfsck_layout_slave_data::llsd_rb_root.
 * A node covers the LFSCK_RBTREE_BITMAP_WIDTH consecutive OIDs starting at
 * lrn_first_oid within the sequence lrn_seq, one bit per OID in each of
 * its two bitmaps. */
struct lfsck_rbtree_node {
        /* linkage into llsd_rb_root. */
        struct rb_node   lrn_node;
        /* FID sequence covered by this node. */
        __u64            lrn_seq;
        /* First OID covered; set with the LFSCK_RBTREE_BITMAP_MASK bits
         * cleared by lfsck_rbtree_new(). */
        __u32            lrn_first_oid;
        /* Number of bits set in lrn_known_bitmap. */
        atomic_t         lrn_known_count;
        /* Number of bits set in lrn_accessed_bitmap. */
        atomic_t         lrn_accessed_count;
        /* LFSCK_RBTREE_BITMAP_SIZE bytes: objects known to exist. */
        void            *lrn_known_bitmap;
        /* LFSCK_RBTREE_BITMAP_SIZE bytes: objects that have been accessed;
         * any accessed object is also marked as known. */
        void            *lrn_accessed_bitmap;
};
479
480 static inline int lfsck_rbtree_cmp(struct lfsck_rbtree_node *lrn,
481                                    __u64 seq, __u32 oid)
482 {
483         if (seq < lrn->lrn_seq)
484                 return -1;
485
486         if (seq > lrn->lrn_seq)
487                 return 1;
488
489         if (oid < lrn->lrn_first_oid)
490                 return -1;
491
492         if (oid - lrn->lrn_first_oid >= LFSCK_RBTREE_BITMAP_WIDTH)
493                 return 1;
494
495         return 0;
496 }
497
498 /* The caller should hold llsd->llsd_rb_lock. */
499 static struct lfsck_rbtree_node *
500 lfsck_rbtree_search(struct lfsck_layout_slave_data *llsd,
501                     const struct lu_fid *fid, bool *exact)
502 {
503         struct rb_node           *node  = llsd->llsd_rb_root.rb_node;
504         struct rb_node           *prev  = NULL;
505         struct lfsck_rbtree_node *lrn   = NULL;
506         int                       rc    = 0;
507
508         if (exact != NULL)
509                 *exact = true;
510
511         while (node != NULL) {
512                 prev = node;
513                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
514                 rc = lfsck_rbtree_cmp(lrn, fid_seq(fid), fid_oid(fid));
515                 if (rc < 0)
516                         node = node->rb_left;
517                 else if (rc > 0)
518                         node = node->rb_right;
519                 else
520                         return lrn;
521         }
522
523         if (exact == NULL)
524                 return NULL;
525
526         /* If there is no exactly matched one, then to the next valid one. */
527         *exact = false;
528
529         /* The rbtree is empty. */
530         if (rc == 0)
531                 return NULL;
532
533         if (rc < 0)
534                 return lrn;
535
536         node = rb_next(prev);
537
538         /* The end of the rbtree. */
539         if (node == NULL)
540                 return NULL;
541
542         lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
543
544         return lrn;
545 }
546
547 static struct lfsck_rbtree_node *lfsck_rbtree_new(const struct lu_env *env,
548                                                   const struct lu_fid *fid)
549 {
550         struct lfsck_rbtree_node *lrn;
551
552         OBD_ALLOC_PTR(lrn);
553         if (lrn == NULL)
554                 return ERR_PTR(-ENOMEM);
555
556         OBD_ALLOC(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
557         if (lrn->lrn_known_bitmap == NULL) {
558                 OBD_FREE_PTR(lrn);
559
560                 return ERR_PTR(-ENOMEM);
561         }
562
563         OBD_ALLOC(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
564         if (lrn->lrn_accessed_bitmap == NULL) {
565                 OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
566                 OBD_FREE_PTR(lrn);
567
568                 return ERR_PTR(-ENOMEM);
569         }
570
571         RB_CLEAR_NODE(&lrn->lrn_node);
572         lrn->lrn_seq = fid_seq(fid);
573         lrn->lrn_first_oid = fid_oid(fid) & ~LFSCK_RBTREE_BITMAP_MASK;
574         atomic_set(&lrn->lrn_known_count, 0);
575         atomic_set(&lrn->lrn_accessed_count, 0);
576
577         return lrn;
578 }
579
580 static void lfsck_rbtree_free(struct lfsck_rbtree_node *lrn)
581 {
582         OBD_FREE(lrn->lrn_accessed_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
583         OBD_FREE(lrn->lrn_known_bitmap, LFSCK_RBTREE_BITMAP_SIZE);
584         OBD_FREE_PTR(lrn);
585 }
586
587 /* The caller should hold lock. */
588 static struct lfsck_rbtree_node *
589 lfsck_rbtree_insert(struct lfsck_layout_slave_data *llsd,
590                     struct lfsck_rbtree_node *lrn)
591 {
592         struct rb_node           **pos    = &llsd->llsd_rb_root.rb_node;
593         struct rb_node            *parent = NULL;
594         struct lfsck_rbtree_node  *tmp;
595         int                        rc;
596
597         while (*pos != NULL) {
598                 parent = *pos;
599                 tmp = rb_entry(parent, struct lfsck_rbtree_node, lrn_node);
600                 rc = lfsck_rbtree_cmp(tmp, lrn->lrn_seq, lrn->lrn_first_oid);
601                 if (rc < 0)
602                         pos = &(*pos)->rb_left;
603                 else if (rc > 0)
604                         pos = &(*pos)->rb_right;
605                 else
606                         return tmp;
607         }
608
609         rb_link_node(&lrn->lrn_node, parent, pos);
610         rb_insert_color(&lrn->lrn_node, &llsd->llsd_rb_root);
611
612         return lrn;
613 }
614
615 extern const struct dt_index_operations lfsck_orphan_index_ops;
616
617 static int lfsck_rbtree_setup(const struct lu_env *env,
618                               struct lfsck_component *com)
619 {
620         struct lu_fid                   *fid    = &lfsck_env_info(env)->lti_fid;
621         struct lfsck_instance           *lfsck  = com->lc_lfsck;
622         struct dt_device                *dev    = lfsck->li_bottom;
623         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
624         struct dt_object                *obj;
625
626         fid->f_seq = FID_SEQ_LAYOUT_RBTREE;
627         fid->f_oid = lfsck_dev_idx(lfsck);
628         fid->f_ver = 0;
629         obj = dt_locate(env, dev, fid);
630         if (IS_ERR(obj))
631                 RETURN(PTR_ERR(obj));
632
633         /* Generate an in-RAM object to stand for the layout rbtree.
634          * Scanning the layout rbtree will be via the iteration over
635          * the object. In the future, the rbtree may be written onto
636          * disk with the object.
637          *
638          * Mark the object to be as exist. */
639         obj->do_lu.lo_header->loh_attr |= LOHA_EXISTS;
640         obj->do_index_ops = &lfsck_orphan_index_ops;
641         llsd->llsd_rb_obj = obj;
642         llsd->llsd_rbtree_valid = 1;
643         dev->dd_record_fid_accessed = 1;
644
645         CDEBUG(D_LFSCK, "%s: layout LFSCK init OST-objects accessing bitmap\n",
646                lfsck_lfsck2name(lfsck));
647
648         return 0;
649 }
650
651 static void lfsck_rbtree_cleanup(const struct lu_env *env,
652                                  struct lfsck_component *com)
653 {
654         struct lfsck_instance           *lfsck = com->lc_lfsck;
655         struct lfsck_layout_slave_data  *llsd  = com->lc_data;
656         struct rb_node                  *node  = rb_first(&llsd->llsd_rb_root);
657         struct rb_node                  *next;
658         struct lfsck_rbtree_node        *lrn;
659
660         lfsck->li_bottom->dd_record_fid_accessed = 0;
661         /* Invalid the rbtree, then no others will use it. */
662         write_lock(&llsd->llsd_rb_lock);
663         llsd->llsd_rbtree_valid = 0;
664         write_unlock(&llsd->llsd_rb_lock);
665
666         while (node != NULL) {
667                 next = rb_next(node);
668                 lrn = rb_entry(node, struct lfsck_rbtree_node, lrn_node);
669                 rb_erase(node, &llsd->llsd_rb_root);
670                 lfsck_rbtree_free(lrn);
671                 node = next;
672         }
673
674         if (llsd->llsd_rb_obj != NULL) {
675                 lfsck_object_put(env, llsd->llsd_rb_obj);
676                 llsd->llsd_rb_obj = NULL;
677         }
678
679         CDEBUG(D_LFSCK, "%s: layout LFSCK fini OST-objects accessing bitmap\n",
680                lfsck_lfsck2name(lfsck));
681 }
682
/* Record in the layout rbtree that the object @fid is known and, when
 * @accessed is set, that it has been accessed.
 *
 * FIDs that are not sane, that are LAST_ID FIDs, or that are neither IDIF
 * nor normal FIDs are ignored. The node covering @fid is created on demand.
 * If that allocation fails while recording an access, LF_INCOMPLETE is set
 * in the in-RAM lfsck_layout flags and the whole rbtree is torn down.
 *
 * \param[in] env       pointer to the thread context
 * \param[in] com       pointer to the lfsck component
 * \param[in] fid       the FID of the object to be recorded
 * \param[in] accessed  true to also mark the object as accessed
 */
static void lfsck_rbtree_update_bitmap(const struct lu_env *env,
                                       struct lfsck_component *com,
                                       const struct lu_fid *fid,
                                       bool accessed)
{
        struct lfsck_layout_slave_data  *llsd   = com->lc_data;
        struct lfsck_rbtree_node        *lrn;
        /* Also tells the unlock path which side of the rwlock is held:
         * the write side once set, the read side otherwise. */
        bool                             insert = false;
        int                              idx;
        int                              rc     = 0;
        ENTRY;

        if (unlikely(!fid_is_sane(fid) || fid_is_last_id(fid)))
                RETURN_EXIT;

        if (!fid_is_idif(fid) && !fid_is_norm(fid))
                RETURN_EXIT;

        read_lock(&llsd->llsd_rb_lock);
        if (!llsd->llsd_rbtree_valid)
                GOTO(unlock, rc = 0);

        lrn = lfsck_rbtree_search(llsd, fid, NULL);
        if (lrn == NULL) {
                struct lfsck_rbtree_node *tmp;

                LASSERT(!insert);

                /* No node covers this FID yet. Drop the read lock for the
                 * allocation, then take the write lock for the insertion. */
                read_unlock(&llsd->llsd_rb_lock);
                tmp = lfsck_rbtree_new(env, fid);
                if (IS_ERR(tmp))
                        GOTO(out, rc = PTR_ERR(tmp));

                insert = true;
                write_lock(&llsd->llsd_rb_lock);
                /* The rbtree may have been invalidated while no lock was
                 * held. */
                if (!llsd->llsd_rbtree_valid) {
                        lfsck_rbtree_free(tmp);
                        GOTO(unlock, rc = 0);
                }

                /* Somebody else may have inserted a node for the same
                 * range in the meantime; then use that one and drop ours. */
                lrn = lfsck_rbtree_insert(llsd, tmp);
                if (lrn != tmp)
                        lfsck_rbtree_free(tmp);
        }

        idx = fid_oid(fid) & LFSCK_RBTREE_BITMAP_MASK;
        /* Any accessed object must be a known object. */
        if (!test_and_set_bit(idx, lrn->lrn_known_bitmap))
                atomic_inc(&lrn->lrn_known_count);
        if (accessed && !test_and_set_bit(idx, lrn->lrn_accessed_bitmap))
                atomic_inc(&lrn->lrn_accessed_count);

        GOTO(unlock, rc = 0);

unlock:
        if (insert)
                write_unlock(&llsd->llsd_rb_lock);
        else
                read_unlock(&llsd->llsd_rb_lock);
out:
        /* Reached with rc != 0 only when the node allocation failed; no
         * lock is held on that path. */
        if (rc != 0 && accessed) {
                struct lfsck_layout *lo = com->lc_file_ram;

                CDEBUG(D_LFSCK, "%s: fail to update OST-objects accessing "
                       "bitmap, and will cause incorrect LFSCK OST-object "
                       "handling, so disable it to cancel orphan handling "
                       "for related device. rc = %d\n",
                       lfsck_lfsck2name(com->lc_lfsck), rc);

                lo->ll_flags |= LF_INCOMPLETE;
                lfsck_rbtree_cleanup(env, com);
        }
}
756
757 static inline void lldk_le_to_cpu(struct lfsck_layout_dangling_key *des,
758                                   const struct lfsck_layout_dangling_key *src)
759 {
760         fid_le_to_cpu(&des->lldk_fid, &src->lldk_fid);
761         des->lldk_comp_id = le32_to_cpu(src->lldk_comp_id);
762         des->lldk_ea_off = le32_to_cpu(src->lldk_ea_off);
763 }
764
765 static inline void lldk_cpu_to_le(struct lfsck_layout_dangling_key *des,
766                                   const struct lfsck_layout_dangling_key *src)
767 {
768         fid_cpu_to_le(&des->lldk_fid, &src->lldk_fid);
769         des->lldk_comp_id = cpu_to_le32(src->lldk_comp_id);
770         des->lldk_ea_off = cpu_to_le32(src->lldk_ea_off);
771 }
772
773 static inline void lldk_be_to_cpu(struct lfsck_layout_dangling_key *des,
774                                   const struct lfsck_layout_dangling_key *src)
775 {
776         fid_be_to_cpu(&des->lldk_fid, &src->lldk_fid);
777         des->lldk_comp_id = be32_to_cpu(src->lldk_comp_id);
778         des->lldk_ea_off = be32_to_cpu(src->lldk_ea_off);
779 }
780
781 static inline void lldk_cpu_to_be(struct lfsck_layout_dangling_key *des,
782                                   const struct lfsck_layout_dangling_key *src)
783 {
784         fid_cpu_to_be(&des->lldk_fid, &src->lldk_fid);
785         des->lldk_comp_id = cpu_to_be32(src->lldk_comp_id);
786         des->lldk_ea_off = cpu_to_be32(src->lldk_ea_off);
787 }
788
789 static void lfsck_layout_le_to_cpu(struct lfsck_layout *des,
790                                    const struct lfsck_layout *src)
791 {
792         int i;
793
794         des->ll_magic = le32_to_cpu(src->ll_magic);
795         des->ll_status = le32_to_cpu(src->ll_status);
796         des->ll_flags = le32_to_cpu(src->ll_flags);
797         des->ll_success_count = le32_to_cpu(src->ll_success_count);
798         des->ll_run_time_phase1 = le32_to_cpu(src->ll_run_time_phase1);
799         des->ll_run_time_phase2 = le32_to_cpu(src->ll_run_time_phase2);
800         des->ll_time_last_complete = le64_to_cpu(src->ll_time_last_complete);
801         des->ll_time_latest_start = le64_to_cpu(src->ll_time_latest_start);
802         des->ll_time_last_checkpoint =
803                                 le64_to_cpu(src->ll_time_last_checkpoint);
804         des->ll_pos_latest_start = le64_to_cpu(src->ll_pos_latest_start);
805         des->ll_pos_last_checkpoint = le64_to_cpu(src->ll_pos_last_checkpoint);
806         des->ll_pos_first_inconsistent =
807                         le64_to_cpu(src->ll_pos_first_inconsistent);
808         des->ll_objs_checked_phase1 = le64_to_cpu(src->ll_objs_checked_phase1);
809         des->ll_objs_failed_phase1 = le64_to_cpu(src->ll_objs_failed_phase1);
810         des->ll_objs_checked_phase2 = le64_to_cpu(src->ll_objs_checked_phase2);
811         des->ll_objs_failed_phase2 = le64_to_cpu(src->ll_objs_failed_phase2);
812         for (i = 0; i < LLIT_MAX; i++)
813                 des->ll_objs_repaired[i] =
814                                 le64_to_cpu(src->ll_objs_repaired[i]);
815         des->ll_objs_skipped = le64_to_cpu(src->ll_objs_skipped);
816         des->ll_bitmap_size = le32_to_cpu(src->ll_bitmap_size);
817         lldk_le_to_cpu(&des->ll_lldk_latest_scanned_phase2,
818                        &src->ll_lldk_latest_scanned_phase2);
819 }
820
821 static void lfsck_layout_cpu_to_le(struct lfsck_layout *des,
822                                    const struct lfsck_layout *src)
823 {
824         int i;
825
826         des->ll_magic = cpu_to_le32(src->ll_magic);
827         des->ll_status = cpu_to_le32(src->ll_status);
828         des->ll_flags = cpu_to_le32(src->ll_flags);
829         des->ll_success_count = cpu_to_le32(src->ll_success_count);
830         des->ll_run_time_phase1 = cpu_to_le32(src->ll_run_time_phase1);
831         des->ll_run_time_phase2 = cpu_to_le32(src->ll_run_time_phase2);
832         des->ll_time_last_complete = cpu_to_le64(src->ll_time_last_complete);
833         des->ll_time_latest_start = cpu_to_le64(src->ll_time_latest_start);
834         des->ll_time_last_checkpoint =
835                                 cpu_to_le64(src->ll_time_last_checkpoint);
836         des->ll_pos_latest_start = cpu_to_le64(src->ll_pos_latest_start);
837         des->ll_pos_last_checkpoint = cpu_to_le64(src->ll_pos_last_checkpoint);
838         des->ll_pos_first_inconsistent =
839                         cpu_to_le64(src->ll_pos_first_inconsistent);
840         des->ll_objs_checked_phase1 = cpu_to_le64(src->ll_objs_checked_phase1);
841         des->ll_objs_failed_phase1 = cpu_to_le64(src->ll_objs_failed_phase1);
842         des->ll_objs_checked_phase2 = cpu_to_le64(src->ll_objs_checked_phase2);
843         des->ll_objs_failed_phase2 = cpu_to_le64(src->ll_objs_failed_phase2);
844         for (i = 0; i < LLIT_MAX; i++)
845                 des->ll_objs_repaired[i] =
846                                 cpu_to_le64(src->ll_objs_repaired[i]);
847         des->ll_objs_skipped = cpu_to_le64(src->ll_objs_skipped);
848         des->ll_bitmap_size = cpu_to_le32(src->ll_bitmap_size);
849         lldk_cpu_to_le(&des->ll_lldk_latest_scanned_phase2,
850                        &src->ll_lldk_latest_scanned_phase2);
851 }
852
853 /**
854  * Load the OST bitmap from the lfsck_layout trace file.
855  *
856  * \param[in] env       pointer to the thread context
857  * \param[in] com       pointer to the lfsck component
858  *
859  * \retval              0 for success
860  * \retval              negative error number on failure or data corruption
861  */
862 static int lfsck_layout_load_bitmap(const struct lu_env *env,
863                                     struct lfsck_component *com)
864 {
865         struct dt_object                *obj    = com->lc_obj;
866         struct lfsck_assistant_data     *lad    = com->lc_data;
867         struct lfsck_layout             *lo     = com->lc_file_ram;
868         struct cfs_bitmap                       *bitmap = lad->lad_bitmap;
869         loff_t                           pos    = com->lc_file_size;
870         ssize_t                          size;
871         __u32                            nbits;
872         int                              rc;
873         ENTRY;
874
875         if (com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size >
876             lo->ll_bitmap_size)
877                 nbits = com->lc_lfsck->li_ost_descs.ltd_tgts_bitmap->size;
878         else
879                 nbits = lo->ll_bitmap_size;
880
881         if (unlikely(nbits < BITS_PER_LONG))
882                 nbits = BITS_PER_LONG;
883
884         if (nbits > bitmap->size) {
885                 __u32 new_bits = bitmap->size;
886                 struct cfs_bitmap *new_bitmap;
887
888                 while (new_bits < nbits)
889                         new_bits <<= 1;
890
891                 new_bitmap = CFS_ALLOCATE_BITMAP(new_bits);
892                 if (new_bitmap == NULL)
893                         RETURN(-ENOMEM);
894
895                 lad->lad_bitmap = new_bitmap;
896                 CFS_FREE_BITMAP(bitmap);
897                 bitmap = new_bitmap;
898         }
899
900         if (lo->ll_bitmap_size == 0) {
901                 lad->lad_incomplete = 0;
902                 CFS_RESET_BITMAP(bitmap);
903
904                 RETURN(0);
905         }
906
907         size = (lo->ll_bitmap_size + 7) >> 3;
908         rc = dt_read(env, obj, lfsck_buf_get(env, bitmap->data, size), &pos);
909         if (rc != size)
910                 RETURN(rc >= 0 ? -EINVAL : rc);
911
912         if (cfs_bitmap_check_empty(bitmap))
913                 lad->lad_incomplete = 0;
914         else
915                 lad->lad_incomplete = 1;
916
917         RETURN(0);
918 }
919
920 /**
921  * Load the layout LFSCK trace file from disk.
922  *
923  * The layout LFSCK trace file records the layout LFSCK status information
924  * and other statistics, such as how many objects have been scanned, and how
925  * many objects have been repaired, and etc. It also contains the bitmap for
926  * failed OSTs during the layout LFSCK. All these information will be loaded
927  * from disk to RAM when the layout LFSCK component setup.
928  *
929  * \param[in] env       pointer to the thread context
930  * \param[in] com       pointer to the lfsck component
931  *
932  * \retval              positive number for file data corruption, the caller
933  *                      should reset the layout LFSCK trace file
934  * \retval              0 for success
935  * \retval              negative error number on failure
936  */
937 static int lfsck_layout_load(const struct lu_env *env,
938                              struct lfsck_component *com)
939 {
940         struct lfsck_layout             *lo     = com->lc_file_ram;
941         ssize_t                          size   = com->lc_file_size;
942         loff_t                           pos    = 0;
943         int                              rc;
944
945         rc = dt_read(env, com->lc_obj,
946                      lfsck_buf_get(env, com->lc_file_disk, size), &pos);
947         if (rc == 0) {
948                 return -ENOENT;
949         } else if (rc < 0) {
950                 CDEBUG(D_LFSCK, "%s: failed to load lfsck_layout: rc = %d\n",
951                        lfsck_lfsck2name(com->lc_lfsck), rc);
952                 return rc;
953         } else if (rc != size) {
954                 CDEBUG(D_LFSCK, "%s: lfsck_layout size %u != %u; reset it\n",
955                        lfsck_lfsck2name(com->lc_lfsck), rc, (unsigned int)size);
956                 return 1;
957         }
958
959         lfsck_layout_le_to_cpu(lo, com->lc_file_disk);
960         if (lo->ll_magic != LFSCK_LAYOUT_MAGIC) {
961                 CDEBUG(D_LFSCK, "%s: invalid lfsck_layout magic %#x != %#x, "
962                        "to be reset\n", lfsck_lfsck2name(com->lc_lfsck),
963                        lo->ll_magic, LFSCK_LAYOUT_MAGIC);
964                 return 1;
965         }
966
967         return 0;
968 }
969
970 /**
971  * Store the layout LFSCK trace file on disk.
972  *
973  * The layout LFSCK trace file records the layout LFSCK status information
974  * and other statistics, such as how many objects have been scanned, and how
975  * many objects have been repaired, and etc. It also contains the bitmap for
976  * failed OSTs during the layout LFSCK. All these information will be synced
977  * from RAM to disk periodically.
978  *
979  * \param[in] env       pointer to the thread context
980  * \param[in] com       pointer to the lfsck component
981  *
982  * \retval              0 for success
983  * \retval              negative error number on failure
984  */
985 static int lfsck_layout_store(const struct lu_env *env,
986                               struct lfsck_component *com)
987 {
988         struct dt_object        *obj    = com->lc_obj;
989         struct lfsck_instance   *lfsck  = com->lc_lfsck;
990         struct lfsck_layout     *lo_ram = com->lc_file_ram;
991         struct lfsck_layout     *lo     = com->lc_file_disk;
992         struct thandle          *th;
993         struct dt_device        *dev    = lfsck_obj2dev(obj);
994         struct cfs_bitmap       *bitmap = NULL;
995         loff_t                   pos;
996         ssize_t                  size   = com->lc_file_size;
997         __u32                    nbits  = 0;
998         int                      rc;
999         ENTRY;
1000
1001         if (lfsck->li_master) {
1002                 struct lfsck_assistant_data *lad = com->lc_data;
1003
1004                 bitmap = lad->lad_bitmap;
1005                 nbits = bitmap->size;
1006
1007                 LASSERT(nbits > 0);
1008                 LASSERTF((nbits & 7) == 0, "Invalid nbits %u\n", nbits);
1009         }
1010
1011         lo_ram->ll_bitmap_size = nbits;
1012         lfsck_layout_cpu_to_le(lo, lo_ram);
1013         th = dt_trans_create(env, dev);
1014         if (IS_ERR(th))
1015                 GOTO(log, rc = PTR_ERR(th));
1016
1017         rc = dt_declare_record_write(env, obj, lfsck_buf_get(env, lo, size),
1018                                      (loff_t)0, th);
1019         if (rc != 0)
1020                 GOTO(out, rc);
1021
1022         if (bitmap != NULL) {
1023                 rc = dt_declare_record_write(env, obj,
1024                                 lfsck_buf_get(env, bitmap->data, nbits >> 3),
1025                                 (loff_t)size, th);
1026                 if (rc != 0)
1027                         GOTO(out, rc);
1028         }
1029
1030         rc = dt_trans_start_local(env, dev, th);
1031         if (rc != 0)
1032                 GOTO(out, rc);
1033
1034         pos = 0;
1035         rc = dt_record_write(env, obj, lfsck_buf_get(env, lo, size), &pos, th);
1036         if (rc != 0)
1037                 GOTO(out, rc);
1038
1039         if (bitmap != NULL) {
1040                 pos = size;
1041                 rc = dt_record_write(env, obj,
1042                                 lfsck_buf_get(env, bitmap->data, nbits >> 3),
1043                                 &pos, th);
1044         }
1045
1046         GOTO(out, rc);
1047
1048 out:
1049         dt_trans_stop(env, dev, th);
1050
1051 log:
1052         if (rc != 0)
1053                 CDEBUG(D_LFSCK, "%s: fail to store lfsck_layout: rc = %d\n",
1054                        lfsck_lfsck2name(lfsck), rc);
1055
1056         return rc;
1057 }
1058
1059 static int lfsck_layout_init(const struct lu_env *env,
1060                              struct lfsck_component *com)
1061 {
1062         struct lfsck_layout *lo = com->lc_file_ram;
1063         int rc;
1064
1065         memset(lo, 0, com->lc_file_size);
1066         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
1067         lo->ll_status = LS_INIT;
1068         down_write(&com->lc_sem);
1069         rc = lfsck_layout_store(env, com);
1070         if (rc == 0 && com->lc_lfsck->li_master)
1071                 rc = lfsck_load_sub_trace_files(env, com,
1072                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
1073         up_write(&com->lc_sem);
1074
1075         return rc;
1076 }
1077
1078 static int fid_is_for_ostobj(const struct lu_env *env,
1079                              struct lfsck_instance *lfsck,
1080                              struct dt_object *obj, const struct lu_fid *fid)
1081 {
1082         struct seq_server_site  *ss     = lfsck_dev_site(lfsck);
1083         struct lu_seq_range     *range  = &lfsck_env_info(env)->lti_range;
1084         struct lustre_ost_attrs *loa;
1085         int                      rc;
1086
1087         fld_range_set_any(range);
1088         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
1089         if (rc == 0) {
1090                 if (fld_range_is_ost(range))
1091                         return 1;
1092
1093                 return 0;
1094         }
1095
1096         loa = &lfsck_env_info(env)->lti_loa;
1097         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, loa, sizeof(*loa)),
1098                           XATTR_NAME_LMA);
1099         if (rc >= sizeof(struct lustre_mdt_attrs)) {
1100                 lustre_lma_swab(&loa->loa_lma);
1101
1102                 return loa->loa_lma.lma_compat & LMAC_FID_ON_OST ? 1 : 0;
1103         }
1104
1105         rc = dt_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_FID);
1106
1107         return rc > 0;
1108 }
1109
1110 static struct lfsck_layout_seq *
1111 lfsck_layout_seq_lookup(struct lfsck_layout_slave_data *llsd, __u64 seq)
1112 {
1113         struct lfsck_layout_seq *lls;
1114
1115         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1116                 if (lls->lls_seq == seq)
1117                         return lls;
1118
1119                 if (lls->lls_seq > seq)
1120                         return NULL;
1121         }
1122
1123         return NULL;
1124 }
1125
1126 static void
1127 lfsck_layout_seq_insert(struct lfsck_layout_slave_data *llsd,
1128                         struct lfsck_layout_seq *lls)
1129 {
1130         struct lfsck_layout_seq *tmp;
1131         struct list_head        *pos = &llsd->llsd_seq_list;
1132
1133         list_for_each_entry(tmp, &llsd->llsd_seq_list, lls_list) {
1134                 if (lls->lls_seq < tmp->lls_seq) {
1135                         pos = &tmp->lls_list;
1136                         break;
1137                 }
1138         }
1139         list_add_tail(&lls->lls_list, pos);
1140 }
1141
1142 static int
1143 lfsck_layout_lastid_create(const struct lu_env *env,
1144                            struct lfsck_instance *lfsck,
1145                            struct dt_object *obj)
1146 {
1147         struct lfsck_thread_info *info   = lfsck_env_info(env);
1148         struct lu_attr           *la     = &info->lti_la;
1149         struct dt_object_format  *dof    = &info->lti_dof;
1150         struct lfsck_bookmark    *bk     = &lfsck->li_bookmark_ram;
1151         struct dt_device         *dt     = lfsck_obj2dev(obj);
1152         struct thandle           *th;
1153         __u64                     lastid = 0;
1154         loff_t                    pos    = 0;
1155         int                       rc;
1156         ENTRY;
1157
1158         if (bk->lb_param & LPF_DRYRUN)
1159                 return 0;
1160
1161         memset(la, 0, sizeof(*la));
1162         la->la_mode = S_IFREG |  S_IRUGO | S_IWUSR;
1163         la->la_valid = LA_MODE | LA_UID | LA_GID;
1164         memset(dof, 0, sizeof(*dof));
1165         dof->dof_type = dt_mode_to_dft(S_IFREG);
1166
1167         th = dt_trans_create(env, dt);
1168         if (IS_ERR(th))
1169                 GOTO(log, rc = PTR_ERR(th));
1170
1171         rc = dt_declare_create(env, obj, la, NULL, dof, th);
1172         if (rc != 0)
1173                 GOTO(stop, rc);
1174
1175         rc = dt_declare_record_write(env, obj,
1176                                      lfsck_buf_get(env, &lastid,
1177                                                    sizeof(lastid)),
1178                                      pos, th);
1179         if (rc != 0)
1180                 GOTO(stop, rc);
1181
1182         rc = dt_trans_start_local(env, dt, th);
1183         if (rc != 0)
1184                 GOTO(stop, rc);
1185
1186         dt_write_lock(env, obj, 0);
1187         if (likely(dt_object_exists(obj) == 0)) {
1188                 rc = dt_create(env, obj, la, NULL, dof, th);
1189                 if (rc == 0)
1190                         rc = dt_record_write(env, obj,
1191                                 lfsck_buf_get(env, &lastid, sizeof(lastid)),
1192                                 &pos, th);
1193         }
1194         dt_write_unlock(env, obj);
1195
1196         GOTO(stop, rc);
1197
1198 stop:
1199         dt_trans_stop(env, dt, th);
1200
1201 log:
1202         CDEBUG(D_LFSCK, "%s: layout LFSCK will create LAST_ID for <seq> "
1203                "%#llx: rc = %d\n",
1204                lfsck_lfsck2name(lfsck), fid_seq(lfsck_dto2fid(obj)), rc);
1205
1206         return rc;
1207 }
1208
1209 static int
1210 lfsck_layout_lastid_reload(const struct lu_env *env,
1211                            struct lfsck_component *com,
1212                            struct lfsck_layout_seq *lls)
1213 {
1214         __u64   lastid;
1215         loff_t  pos     = 0;
1216         int     rc;
1217
1218         dt_read_lock(env, lls->lls_lastid_obj, 0);
1219         rc = dt_record_read(env, lls->lls_lastid_obj,
1220                             lfsck_buf_get(env, &lastid, sizeof(lastid)), &pos);
1221         dt_read_unlock(env, lls->lls_lastid_obj);
1222         if (unlikely(rc != 0))
1223                 return rc;
1224
1225         lastid = le64_to_cpu(lastid);
1226         if (lastid < lls->lls_lastid_known) {
1227                 struct lfsck_instance   *lfsck  = com->lc_lfsck;
1228                 struct lfsck_layout     *lo     = com->lc_file_ram;
1229
1230                 lls->lls_lastid = lls->lls_lastid_known;
1231                 lls->lls_dirty = 1;
1232                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1233                         LASSERT(lfsck->li_out_notify != NULL);
1234
1235                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1236                                              LE_LASTID_REBUILDING);
1237                         lo->ll_flags |= LF_CRASHED_LASTID;
1238
1239                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
1240                                "LAST_ID file (1) for the sequence %#llx"
1241                                ", old value %llu, known value %llu\n",
1242                                lfsck_lfsck2name(lfsck), lls->lls_seq,
1243                                lastid, lls->lls_lastid);
1244                 }
1245         } else if (lastid >= lls->lls_lastid) {
1246                 lls->lls_lastid = lastid;
1247                 lls->lls_dirty = 0;
1248         }
1249
1250         return 0;
1251 }
1252
1253 static int
1254 lfsck_layout_lastid_store(const struct lu_env *env,
1255                           struct lfsck_component *com)
1256 {
1257         struct lfsck_instance           *lfsck  = com->lc_lfsck;
1258         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
1259         struct dt_device                *dt     = lfsck->li_bottom;
1260         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
1261         struct lfsck_layout_seq         *lls;
1262         struct thandle                  *th;
1263         __u64                            lastid;
1264         int                              rc     = 0;
1265         int                              rc1    = 0;
1266
1267         list_for_each_entry(lls, &llsd->llsd_seq_list, lls_list) {
1268                 loff_t pos = 0;
1269
1270                 if (!lls->lls_dirty)
1271                         continue;
1272
1273                 CDEBUG(D_LFSCK, "%s: layout LFSCK will sync the LAST_ID for "
1274                        "<seq> %#llx as <oid> %llu\n",
1275                        lfsck_lfsck2name(lfsck), lls->lls_seq, lls->lls_lastid);
1276
1277                 if (bk->lb_param & LPF_DRYRUN) {
1278                         lls->lls_dirty = 0;
1279                         continue;
1280                 }
1281
1282                 th = dt_trans_create(env, dt);
1283                 if (IS_ERR(th)) {
1284                         rc1 = PTR_ERR(th);
1285                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1286                                "the LAST_ID for <seq> %#llx(1): rc = %d\n",
1287                                lfsck_lfsck2name(com->lc_lfsck),
1288                                lls->lls_seq, rc1);
1289                         continue;
1290                 }
1291
1292                 lastid = cpu_to_le64(lls->lls_lastid);
1293                 rc = dt_declare_record_write(env, lls->lls_lastid_obj,
1294                                              lfsck_buf_get(env, &lastid,
1295                                                            sizeof(lastid)),
1296                                              pos, th);
1297                 if (rc != 0)
1298                         goto stop;
1299
1300                 rc = dt_trans_start_local(env, dt, th);
1301                 if (rc != 0)
1302                         goto stop;
1303
1304                 dt_write_lock(env, lls->lls_lastid_obj, 0);
1305                 rc = dt_record_write(env, lls->lls_lastid_obj,
1306                                      lfsck_buf_get(env, &lastid,
1307                                      sizeof(lastid)), &pos, th);
1308                 dt_write_unlock(env, lls->lls_lastid_obj);
1309                 if (rc == 0)
1310                         lls->lls_dirty = 0;
1311
1312 stop:
1313                 dt_trans_stop(env, dt, th);
1314                 if (rc != 0) {
1315                         rc1 = rc;
1316                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to store "
1317                                "the LAST_ID for <seq> %#llx(2): rc = %d\n",
1318                                lfsck_lfsck2name(com->lc_lfsck),
1319                                lls->lls_seq, rc1);
1320                 }
1321         }
1322
1323         return rc1;
1324 }
1325
1326 static int
1327 lfsck_layout_lastid_load(const struct lu_env *env,
1328                          struct lfsck_component *com,
1329                          struct lfsck_layout_seq *lls)
1330 {
1331         struct lfsck_instance   *lfsck  = com->lc_lfsck;
1332         struct lfsck_layout     *lo     = com->lc_file_ram;
1333         struct lu_fid           *fid    = &lfsck_env_info(env)->lti_fid;
1334         struct dt_object        *obj;
1335         loff_t                   pos    = 0;
1336         int                      rc;
1337         ENTRY;
1338
1339         lu_last_id_fid(fid, lls->lls_seq, lfsck_dev_idx(lfsck));
1340         obj = dt_locate(env, lfsck->li_bottom, fid);
1341         if (IS_ERR(obj))
1342                 RETURN(PTR_ERR(obj));
1343
1344         /* LAST_ID crashed, to be rebuilt */
1345         if (dt_object_exists(obj) == 0) {
1346                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
1347                         LASSERT(lfsck->li_out_notify != NULL);
1348
1349                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1350                                              LE_LASTID_REBUILDING);
1351                         lo->ll_flags |= LF_CRASHED_LASTID;
1352
1353                         CDEBUG(D_LFSCK, "%s: layout LFSCK cannot find the "
1354                                "LAST_ID file for sequence %#llx\n",
1355                                lfsck_lfsck2name(lfsck), lls->lls_seq);
1356
1357                         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY4) &&
1358                             cfs_fail_val > 0) {
1359                                 struct l_wait_info lwi = LWI_TIMEOUT(
1360                                                 cfs_time_seconds(cfs_fail_val),
1361                                                 NULL, NULL);
1362
1363                                 /* Some others may changed the cfs_fail_val
1364                                  * as zero after above check, re-check it for
1365                                  * sure to avoid falling into wait for ever. */
1366                                 if (likely(lwi.lwi_timeout > 0)) {
1367                                         struct ptlrpc_thread *thread =
1368                                                 &lfsck->li_thread;
1369
1370                                         up_write(&com->lc_sem);
1371                                         l_wait_event(thread->t_ctl_waitq,
1372                                                      !thread_is_running(thread),
1373                                                      &lwi);
1374                                         down_write(&com->lc_sem);
1375                                 }
1376                         }
1377                 }
1378
1379                 rc = lfsck_layout_lastid_create(env, lfsck, obj);
1380         } else {
1381                 dt_read_lock(env, obj, 0);
1382                 rc = dt_read(env, obj,
1383                         lfsck_buf_get(env, &lls->lls_lastid, sizeof(__u64)),
1384                         &pos);
1385                 dt_read_unlock(env, obj);
1386                 if (rc != 0 && rc != sizeof(__u64))
1387                         GOTO(out, rc = (rc > 0 ? -EFAULT : rc));
1388
1389                 if (rc == 0 && !(lo->ll_flags & LF_CRASHED_LASTID)) {
1390                         LASSERT(lfsck->li_out_notify != NULL);
1391
1392                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
1393                                              LE_LASTID_REBUILDING);
1394                         lo->ll_flags |= LF_CRASHED_LASTID;
1395
1396                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds invalid "
1397                                "LAST_ID file for the sequence %#llx"
1398                                ": rc = %d\n",
1399                                lfsck_lfsck2name(lfsck), lls->lls_seq, rc);
1400                 }
1401
1402                 lls->lls_lastid = le64_to_cpu(lls->lls_lastid);
1403                 rc = 0;
1404         }
1405
1406         GOTO(out, rc);
1407
1408 out:
1409         if (rc != 0)
1410                 lfsck_object_put(env, obj);
1411         else
1412                 lls->lls_lastid_obj = obj;
1413
1414         return rc;
1415 }
1416
1417 static void lfsck_layout_record_failure(const struct lu_env *env,
1418                                         struct lfsck_instance *lfsck,
1419                                         struct lfsck_layout *lo)
1420 {
1421         __u64 cookie;
1422
1423         lo->ll_objs_failed_phase1++;
1424         cookie = lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
1425                                                         lfsck->li_di_oit);
1426         if (lo->ll_pos_first_inconsistent == 0 ||
1427             lo->ll_pos_first_inconsistent < cookie) {
1428                 lo->ll_pos_first_inconsistent = cookie;
1429
1430                 CDEBUG(D_LFSCK, "%s: layout LFSCK hit first non-repaired "
1431                        "inconsistency at the pos [%llu]\n",
1432                        lfsck_lfsck2name(lfsck),
1433                        lo->ll_pos_first_inconsistent);
1434         }
1435 }
1436
1437 static int lfsck_layout_double_scan_result(const struct lu_env *env,
1438                                            struct lfsck_component *com,
1439                                            int rc)
1440 {
1441         struct lfsck_instance   *lfsck = com->lc_lfsck;
1442         struct lfsck_layout     *lo    = com->lc_file_ram;
1443
1444         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan: rc = %d\n",
1445                lfsck_lfsck2name(lfsck), rc);
1446
1447         down_write(&com->lc_sem);
1448         lo->ll_run_time_phase2 += cfs_duration_sec(cfs_time_current() +
1449                                   HALF_SEC - com->lc_time_last_checkpoint);
1450         lo->ll_time_last_checkpoint = cfs_time_current_sec();
1451         lo->ll_objs_checked_phase2 += com->lc_new_checked;
1452
1453         if (rc > 0) {
1454                 if (lo->ll_flags & LF_INCOMPLETE) {
1455                         lo->ll_status = LS_PARTIAL;
1456                 } else {
1457                         if (lfsck->li_master) {
1458                                 struct lfsck_assistant_data *lad = com->lc_data;
1459
1460                                 if (lad->lad_incomplete)
1461                                         lo->ll_status = LS_PARTIAL;
1462                                 else
1463                                         lo->ll_status = LS_COMPLETED;
1464                         } else {
1465                                 lo->ll_status = LS_COMPLETED;
1466                         }
1467                 }
1468                 lo->ll_flags &= ~LF_SCANNED_ONCE;
1469                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_DRYRUN))
1470                         lo->ll_flags &= ~LF_INCONSISTENT;
1471                 lo->ll_time_last_complete = lo->ll_time_last_checkpoint;
1472                 lo->ll_success_count++;
1473         } else if (rc == 0) {
1474                 if (lfsck->li_status != 0)
1475                         lo->ll_status = lfsck->li_status;
1476                 else
1477                         lo->ll_status = LS_STOPPED;
1478         } else {
1479                 lo->ll_status = LS_FAILED;
1480         }
1481
1482         rc = lfsck_layout_store(env, com);
1483         up_write(&com->lc_sem);
1484
1485         CDEBUG(D_LFSCK, "%s: layout LFSCK double scan result %u: rc = %d\n",
1486                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
1487
1488         return rc;
1489 }
1490
1491 static int lfsck_layout_trans_stop(const struct lu_env *env,
1492                                    struct dt_device *dev,
1493                                    struct thandle *handle, int result)
1494 {
1495         int rc;
1496
1497         /* XXX: If there is something worng or it needs to repair nothing,
1498          *      then notify the lower to stop the modification. Currently,
1499          *      we use th_result for such purpose, that may be replaced by
1500          *      some rollback mechanism in the future. */
1501         handle->th_result = result;
1502         rc = dt_trans_stop(env, dev, handle);
1503         if (result != 0)
1504                 return result > 0 ? 0 : result;
1505
1506         return rc == 0 ? 1 : rc;
1507 }
1508
1509 static int lfsck_layout_ins_dangling_rec(const struct lu_env *env,
1510                                          struct lfsck_component *com,
1511                                          const struct lu_fid *pfid,
1512                                          const struct lu_fid *cfid,
1513                                          __u32 comp_id, __u32 ea_off,
1514                                          __u32 ost_idx)
1515 {
1516         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1517         struct lu_fid *rec = &lfsck_env_info(env)->lti_fid3;
1518         struct dt_device *dev;
1519         struct dt_object *obj;
1520         struct thandle *th = NULL;
1521         int idx;
1522         int rc = 0;
1523         ENTRY;
1524
1525         idx = lfsck_sub_trace_file_fid2idx(pfid);
1526         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1527         dev = lfsck_obj2dev(obj);
1528
1529         fid_cpu_to_be(&key->lldk_fid, pfid);
1530         key->lldk_comp_id = cpu_to_be32(comp_id);
1531         key->lldk_ea_off = cpu_to_be32(ea_off);
1532
1533         fid_cpu_to_be(rec, cfid);
1534         rec->f_ver = cpu_to_be32(ost_idx);
1535
1536         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1537
1538         th = dt_trans_create(env, dev);
1539         if (IS_ERR(th))
1540                 GOTO(unlock, rc = PTR_ERR(th));
1541
1542         rc = dt_declare_insert(env, obj,
1543                                (const struct dt_rec *)rec,
1544                                (const struct dt_key *)key, th);
1545         if (rc)
1546                 GOTO(unlock, rc);
1547
1548         rc = dt_trans_start_local(env, dev, th);
1549         if (rc)
1550                 GOTO(unlock, rc);
1551
1552         rc = dt_insert(env, obj, (const struct dt_rec *)rec,
1553                        (const struct dt_key *)key, th, 1);
1554
1555         GOTO(unlock, rc);
1556
1557 unlock:
1558         if (th && !IS_ERR(th))
1559                 dt_trans_stop(env, dev, th);
1560
1561         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1562
1563         CDEBUG(D_LFSCK, "%s: insert the paris "DFID" => "DFID", comp_id = %u, "
1564                "ea_off = %u, ost_idx = %u, into the trace file for further "
1565                "dangling check: rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
1566                PFID(pfid), PFID(cfid), comp_id, ea_off, ost_idx, rc);
1567
1568         return rc;
1569 }
1570
1571 static int lfsck_layout_del_dangling_rec(const struct lu_env *env,
1572                                          struct lfsck_component *com,
1573                                          const struct lu_fid *fid,
1574                                          __u32 comp_id, __u32 ea_off)
1575 {
1576         struct lfsck_layout_dangling_key *key = &lfsck_env_info(env)->lti_lldk;
1577         struct dt_device *dev;
1578         struct dt_object *obj;
1579         struct thandle *th = NULL;
1580         int idx;
1581         int rc = 0;
1582         ENTRY;
1583
1584         idx = lfsck_sub_trace_file_fid2idx(fid);
1585         obj = com->lc_sub_trace_objs[idx].lsto_obj;
1586         dev = lfsck_obj2dev(obj);
1587
1588         fid_cpu_to_be(&key->lldk_fid, fid);
1589         key->lldk_comp_id = cpu_to_be32(comp_id);
1590         key->lldk_ea_off = cpu_to_be32(ea_off);
1591
1592         mutex_lock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1593
1594         th = dt_trans_create(env, dev);
1595         if (IS_ERR(th))
1596                 GOTO(unlock, rc = PTR_ERR(th));
1597
1598         rc = dt_declare_delete(env, obj, (const struct dt_key *)key, th);
1599         if (rc)
1600                 GOTO(unlock, rc);
1601
1602         rc = dt_trans_start_local(env, dev, th);
1603         if (rc)
1604                 GOTO(unlock, rc);
1605
1606         rc = dt_delete(env, obj, (const struct dt_key *)key, th);
1607
1608         GOTO(unlock, rc);
1609
1610 unlock:
1611         if (th && !IS_ERR(th))
1612                 dt_trans_stop(env, dev, th);
1613
1614         mutex_unlock(&com->lc_sub_trace_objs[idx].lsto_mutex);
1615
1616         CDEBUG(D_LFSCK, "%s: delete the dangling record for "DFID
1617                ", comp_id = %u, ea_off = %u from the trace file: rc = %d\n",
1618                lfsck_lfsck2name(com->lc_lfsck), PFID(fid), comp_id, ea_off, rc);
1619
1620         return rc;
1621 }
1622
1623 /**
1624  * Get the system default stripe size.
1625  *
1626  * \param[in] env       pointer to the thread context
1627  * \param[in] lfsck     pointer to the lfsck instance
1628  * \param[out] size     pointer to the default stripe size
1629  *
1630  * \retval              0 for success
1631  * \retval              negative error number on failure
1632  */
1633 static int lfsck_layout_get_def_stripesize(const struct lu_env *env,
1634                                            struct lfsck_instance *lfsck,
1635                                            __u32 *size)
1636 {
1637         struct lov_user_md      *lum = &lfsck_env_info(env)->lti_lum;
1638         struct dt_object        *root;
1639         int                      rc;
1640
1641         root = dt_locate(env, lfsck->li_next, &lfsck->li_local_root_fid);
1642         if (IS_ERR(root))
1643                 return PTR_ERR(root);
1644
1645         /* Get the default stripe size via xattr_get on the backend root. */
1646         rc = dt_xattr_get(env, root, lfsck_buf_get(env, lum, sizeof(*lum)),
1647                           XATTR_NAME_LOV);
1648         if (rc > 0) {
1649                 /* The lum->lmm_stripe_size is LE mode. The *size also
1650                  * should be LE mode. So it is unnecessary to convert. */
1651                 *size = lum->lmm_stripe_size;
1652                 rc = 0;
1653         } else if (unlikely(rc == 0)) {
1654                 rc = -EINVAL;
1655         }
1656
1657         lfsck_object_put(env, root);
1658
1659         return rc;
1660 }
1661
1662 /**
1663  * \retval       +1: repaired
1664  * \retval        0: did nothing
1665  * \retval      -ve: on error
1666  */
1667 static int lfsck_layout_refill_lovea(const struct lu_env *env,
1668                                      struct lfsck_instance *lfsck,
1669                                      struct thandle *handle,
1670                                      struct dt_object *parent,
1671                                      const struct lu_fid *cfid,
1672                                      struct lu_buf *buf,
1673                                      struct lov_mds_md_v1 *lmm,
1674                                      struct lov_ost_data_v1 *slot,
1675                                      int fl, __u32 ost_idx, int size)
1676 {
1677         struct ost_id           *oi     = &lfsck_env_info(env)->lti_oi;
1678         struct lu_buf            ea_buf;
1679         int                      rc;
1680         __u32                    magic;
1681         __u32                    pattern;
1682         __u16                    count;
1683         ENTRY;
1684
1685         magic = le32_to_cpu(lmm->lmm_magic);
1686         pattern = le32_to_cpu(lmm->lmm_pattern);
1687         count = le16_to_cpu(lmm->lmm_stripe_count);
1688
1689         fid_to_ostid(cfid, oi);
1690         ostid_cpu_to_le(oi, &slot->l_ost_oi);
1691         slot->l_ost_gen = cpu_to_le32(0);
1692         slot->l_ost_idx = cpu_to_le32(ost_idx);
1693
1694         if (pattern & LOV_PATTERN_F_HOLE) {
1695                 struct lov_ost_data_v1 *objs;
1696                 int                     i;
1697
1698                 if (magic == LOV_MAGIC_V1)
1699                         objs = &lmm->lmm_objects[0];
1700                 else
1701                         objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
1702                 for (i = 0; i < count; i++, objs++) {
1703                         if (lovea_slot_is_dummy(objs))
1704                                 break;
1705                 }
1706
1707                 /* If the @slot is the last dummy slot to be refilled,
1708                  * then drop LOV_PATTERN_F_HOLE from lmm::lmm_pattern. */
1709                 if (i == count) {
1710                         lmm->lmm_pattern =
1711                                 cpu_to_le32(pattern & ~LOV_PATTERN_F_HOLE);
1712
1713                         CDEBUG(D_LFSCK, "%s: remove layout HOLE for "DFID
1714                                ": parent "DFID"\n", lfsck_lfsck2name(lfsck),
1715                                PFID(cfid), PFID(lfsck_dto2fid(parent)));
1716                 }
1717         }
1718
1719         lfsck_buf_init(&ea_buf, buf->lb_buf, size);
1720         rc = dt_xattr_set(env, parent, &ea_buf, XATTR_NAME_LOV, fl, handle);
1721         if (rc == 0)
1722                 rc = 1;
1723
1724         RETURN(rc);
1725 }
1726
1727 static struct lov_ost_data_v1 *
1728 __lfsck_layout_new_v1_lovea(struct lov_mds_md_v1 *lmm,
1729                             const struct lu_fid *pfid,
1730                             __u32 stripe_size, __u32 ea_off,
1731                             __u32 pattern, __u16 count)
1732 {
1733         lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
1734         lmm->lmm_pattern = cpu_to_le32(pattern);
1735         fid_to_lmm_oi(pfid, &lmm->lmm_oi);
1736         lmm_oi_cpu_to_le(&lmm->lmm_oi, &lmm->lmm_oi);
1737         lmm->lmm_stripe_size = cpu_to_le32(stripe_size);
1738         lmm->lmm_stripe_count = cpu_to_le16(count);
1739         lmm->lmm_layout_gen = cpu_to_le16(1);
1740         memset(&lmm->lmm_objects[0], 0,
1741                sizeof(struct lov_ost_data_v1) * count);
1742
1743         return &lmm->lmm_objects[ea_off];
1744 }
1745
1746 static int lfsck_layout_new_v1_lovea(const struct lu_env *env,
1747                                      struct lfsck_instance *lfsck,
1748                                      struct ost_layout *ol,
1749                                      struct dt_object *parent,
1750                                      struct lu_buf *buf, __u32 ea_off,
1751                                      struct lov_mds_md_v1 **lmm,
1752                                      struct lov_ost_data_v1 **objs)
1753 {
1754         int size;
1755         __u32 stripe_size = ol->ol_stripe_size;
1756         __u32 pattern = LOV_PATTERN_RAID0;
1757         __u16 count;
1758
1759         if (ol->ol_stripe_count != 0)
1760                 count = ol->ol_stripe_count;
1761         else
1762                 count = ea_off + 1;
1763
1764         size = lov_mds_md_size(count, LOV_MAGIC_V1);
1765         LASSERTF(buf->lb_len >= size,
1766                  "buffer len %d is less than real size %d\n",
1767                  (int)buf->lb_len, size);
1768
1769         if (stripe_size == 0) {
1770                 int rc;
1771
1772                 rc = lfsck_layout_get_def_stripesize(env, lfsck, &stripe_size);
1773                 if (rc)
1774                         return rc;
1775         }
1776
1777         *lmm = buf->lb_buf;
1778         if (ol->ol_stripe_count > 1 ||
1779             (ol->ol_stripe_count == 0 && ea_off != 0)) {
1780                 pattern |= LOV_PATTERN_F_HOLE;
1781                 memset(&(*lmm)->lmm_objects[0], 0,
1782                        count * sizeof(struct lov_ost_data_v1));
1783         }
1784
1785         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1786                                 stripe_size, ea_off, pattern, count);
1787
1788         return size;
1789 }
1790
1791 static int lfsck_layout_new_comp_lovea(const struct lu_env *env,
1792                                       struct ost_layout *ol,
1793                                       struct dt_object *parent,
1794                                       struct lu_buf *buf, __u32 ea_off,
1795                                       struct lov_mds_md_v1 **lmm,
1796                                       struct lov_ost_data_v1 **objs)
1797 {
1798         struct lov_comp_md_v1 *lcm;
1799         struct lov_comp_md_entry_v1 *lcme;
1800         __u32 pattern = LOV_PATTERN_RAID0;
1801         __u32 offset = sizeof(*lcm) + sizeof(*lcme);
1802         int lcme_size = lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1803         int size = offset + lcme_size;
1804
1805         LASSERTF(buf->lb_len >= size,
1806                  "buffer len %d is less than real size %d\n",
1807                  (int)buf->lb_len, size);
1808
1809         lcm = buf->lb_buf;
1810         lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
1811         lcm->lcm_size = cpu_to_le32(size);
1812         lcm->lcm_layout_gen = cpu_to_le32(1);
1813         lcm->lcm_flags = 0;
1814         lcm->lcm_entry_count = cpu_to_le16(1);
1815
1816         lcme = &lcm->lcm_entries[0];
1817         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1818         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1819         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1820         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1821         lcme->lcme_offset = cpu_to_le32(offset);
1822         lcme->lcme_size = cpu_to_le32(lcme_size);
1823         if (ol->ol_stripe_count > 1)
1824                 pattern |= LOV_PATTERN_F_HOLE;
1825
1826         *lmm = buf->lb_buf + offset;
1827         *objs = __lfsck_layout_new_v1_lovea(*lmm, lfsck_dto2fid(parent),
1828                                             ol->ol_stripe_size, ea_off,
1829                                             pattern, ol->ol_stripe_count);
1830
1831         return size;
1832 }
1833
1834 static int lfsck_layout_add_comp_comp(const struct lu_env *env,
1835                                      struct lfsck_instance *lfsck,
1836                                      struct thandle *handle,
1837                                      struct ost_layout *ol,
1838                                      struct dt_object *parent,
1839                                      const struct lu_fid *cfid,
1840                                      struct lu_buf *buf, __u32 ost_idx,
1841                                      __u32 ea_off, int pos)
1842 {
1843         struct lov_comp_md_v1 *lcm = buf->lb_buf;
1844         struct lov_comp_md_entry_v1 *lcme;
1845         struct lov_mds_md_v1 *lmm;
1846         struct lov_ost_data_v1 *objs;
1847         int added = sizeof(*lcme) +
1848                     lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
1849         int size = le32_to_cpu(lcm->lcm_size) + added;
1850         int rc;
1851         int i;
1852         __u32 offset;
1853         __u32 pattern = LOV_PATTERN_RAID0;
1854         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
1855         ENTRY;
1856
1857         lu_buf_check_and_grow(buf, size);
1858         /* set the lcm again because lu_buf_check_and_grow() may
1859          * have reallocated the buf. */
1860         lcm = buf->lb_buf;
1861         lcm->lcm_size = cpu_to_le32(size);
1862         le32_add_cpu(&lcm->lcm_layout_gen, 1);
1863         lcm->lcm_entry_count = cpu_to_le16(count + 1);
1864
1865         /* 1. Move the component bodies from [pos, count-1] to [pos+1, count]
1866          *    with distance of 'added'. */
1867         if (pos < count) {
1868                 size = 0;
1869                 for (i = pos; i < count; i++) {
1870                         lcme = &lcm->lcm_entries[i];
1871                         size += le32_to_cpu(lcme->lcme_size);
1872                 }
1873
1874                 offset = le32_to_cpu(lcm->lcm_entries[pos].lcme_offset);
1875                 memmove(buf->lb_buf + offset + added,
1876                         buf->lb_buf + offset, size);
1877         }
1878
1879         size = 0;
1880         /* 2. Move the component header [0, pos-1] to [0, pos-1] with distance
1881          *    of 'sizeof(struct lov_comp_md_entry_v1)' */
1882         if (pos > 0) {
1883                 for (i = 0; i < pos; i++) {
1884                         lcme = &lcm->lcm_entries[i];
1885                         size += le32_to_cpu(lcme->lcme_size);
1886                 }
1887
1888                 offset = le32_to_cpu(lcm->lcm_entries[0].lcme_offset);
1889                 memmove(buf->lb_buf + offset + sizeof(*lcme),
1890                         buf->lb_buf + offset, size);
1891         }
1892
1893         /* 3. Recalculate the enter offset for the component [pos, count-1] */
1894         for (i = count - 1; i >= pos; i--) {
1895                 lcm->lcm_entries[i + 1] = lcm->lcm_entries[i];
1896                 lcm->lcm_entries[i + 1].lcme_offset =
1897                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i + 1].
1898                                                 lcme_offset) + added);
1899         }
1900
1901         /* 4. Recalculate the enter offset for the component [0, pos) */
1902         for (i = 0; i < pos; i++) {
1903                 lcm->lcm_entries[i].lcme_offset =
1904                         cpu_to_le32(le32_to_cpu(lcm->lcm_entries[i].
1905                                                 lcme_offset) + sizeof(*lcme));
1906         }
1907
1908         offset = sizeof(*lcm) + sizeof(*lcme) * (count + 1) + size;
1909         /* 4. Insert the new component header (entry) at the slot 'pos'. */
1910         lcme = &lcm->lcm_entries[pos];
1911         lcme->lcme_id = cpu_to_le32(ol->ol_comp_id);
1912         lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
1913         lcme->lcme_extent.e_start = cpu_to_le64(ol->ol_comp_start);
1914         lcme->lcme_extent.e_end = cpu_to_le64(ol->ol_comp_end);
1915         lcme->lcme_offset = cpu_to_le32(offset);
1916         lcme->lcme_size = cpu_to_le32(lov_mds_md_size(ol->ol_stripe_count,
1917                                                       LOV_MAGIC_V1));
1918
1919         if (ol->ol_stripe_count > 1)
1920                 pattern |= LOV_PATTERN_F_HOLE;
1921
1922         lmm = buf->lb_buf + offset;
1923         /* 5. Insert teh new component body at the 'offset'. */
1924         objs = __lfsck_layout_new_v1_lovea(lmm, lfsck_dto2fid(parent),
1925                                            ol->ol_stripe_size, ea_off,
1926                                            pattern, ol->ol_stripe_count);
1927
1928         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
1929                                        lmm, objs, LU_XATTR_REPLACE, ost_idx,
1930                                        le32_to_cpu(lcm->lcm_size));
1931
1932         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant add new COMP for "
1933                DFID": parent "DFID", OST-index %u, stripe-index %u, "
1934                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
1935                "comp_end %llu, %s LOV EA hole: rc = %d\n",
1936                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
1937                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
1938                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
1939                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
1940                "with" : "without", rc);
1941
1942         RETURN(rc);
1943 }
1944
1945 static int lfsck_layout_extend_v1v3_lovea(const struct lu_env *env,
1946                                           struct lfsck_instance *lfsck,
1947                                           struct thandle *handle,
1948                                           struct ost_layout *ol,
1949                                           struct dt_object *parent,
1950                                           const struct lu_fid *cfid,
1951                                           struct lu_buf *buf, __u32 ost_idx,
1952                                           __u32 ea_off)
1953 {
1954         struct lov_mds_md_v1 *lmm = buf->lb_buf;
1955         struct lov_ost_data_v1 *objs;
1956         __u16 count = le16_to_cpu(lmm->lmm_stripe_count);
1957         __u32 magic = le32_to_cpu(lmm->lmm_magic);
1958         int size;
1959         int gap;
1960         int rc;
1961         ENTRY;
1962
1963         /* The original LOVEA maybe re-generated via old filter_fid, at
1964          * that time, we do not know the stripe count and stripe size. */
1965         if (ol->ol_stripe_count > count)
1966                 count = ol->ol_stripe_count;
1967         if (ol->ol_stripe_size != 0 &&
1968             ol->ol_stripe_size != le32_to_cpu(lmm->lmm_stripe_size))
1969                 lmm->lmm_stripe_size = cpu_to_le32(ol->ol_stripe_size);
1970
1971         if (magic == LOV_MAGIC_V1)
1972                 objs = &lmm->lmm_objects[count];
1973         else
1974                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[count];
1975
1976         gap = ea_off - count;
1977         if (gap >= 0)
1978                 count = ea_off + 1;
1979
1980         size = lov_mds_md_size(count, magic);
1981         LASSERTF(buf->lb_len >= size,
1982                  "buffer len %d is less than real size %d\n",
1983                  (int)buf->lb_len, size);
1984
1985         if (gap > 0) {
1986                 memset(objs, 0, gap * sizeof(*objs));
1987                 lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_HOLE);
1988         }
1989
1990         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
1991         lmm->lmm_stripe_count = cpu_to_le16(count);
1992         objs += gap;
1993
1994         rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid, buf,
1995                                 lmm, objs, LU_XATTR_REPLACE, ost_idx, size);
1996
1997         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant extend layout EA for "
1998                DFID": parent "DFID", OST-index %u, stripe-index %u, "
1999                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2000                "comp_end %llu, %s LOV EA hole: rc = %d\n",
2001                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2002                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2003                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end,
2004                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2005                "with" : "without", rc);
2006
2007         RETURN(rc);
2008 }
2009
2010 /**
2011  * \retval       +1: repaired
2012  * \retval        0: did nothing
2013  * \retval      -ve: on error
2014  */
2015 static int lfsck_layout_update_lovea(const struct lu_env *env,
2016                                      struct lfsck_instance *lfsck,
2017                                      struct thandle *handle,
2018                                      struct ost_layout *ol,
2019                                      struct dt_object *parent,
2020                                      const struct lu_fid *cfid,
2021                                      struct lu_buf *buf, int fl,
2022                                      __u32 ost_idx, __u32 ea_off)
2023 {
2024         struct lov_mds_md_v1 *lmm = NULL;
2025         struct lov_ost_data_v1 *objs = NULL;
2026         int rc = 0;
2027         ENTRY;
2028
2029         if (ol->ol_comp_id != 0)
2030                 rc = lfsck_layout_new_comp_lovea(env, ol, parent, buf, ea_off,
2031                                                 &lmm, &objs);
2032         else
2033                 rc = lfsck_layout_new_v1_lovea(env, lfsck, ol, parent, buf,
2034                                                ea_off, &lmm, &objs);
2035
2036         if (rc > 0)
2037                 rc = lfsck_layout_refill_lovea(env, lfsck, handle, parent, cfid,
2038                                                buf, lmm, objs, fl, ost_idx, rc);
2039
2040         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant created layout EA for "
2041                DFID": parent "DFID", OST-index %u, stripe-index %u, "
2042                "stripe_size %u, stripe_count %u, comp_id %u, comp_start %llu, "
2043                "comp_end %llu, fl %d, %s LOV EA hole: rc = %d\n",
2044                lfsck_lfsck2name(lfsck), PFID(cfid), PFID(lfsck_dto2fid(parent)),
2045                ost_idx, ea_off, ol->ol_stripe_size, ol->ol_stripe_count,
2046                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end, fl,
2047                le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_HOLE ?
2048                "with" : "without", rc);
2049
2050         RETURN(rc);
2051 }
2052
2053 static int __lfsck_layout_update_pfid(const struct lu_env *env,
2054                                       struct dt_object *child,
2055                                       const struct lu_fid *pfid,
2056                                       const struct ost_layout *ol, __u32 offset)
2057 {
2058         struct dt_device        *dev    = lfsck_obj2dev(child);
2059         struct filter_fid       *ff     = &lfsck_env_info(env)->lti_ff;
2060         struct thandle          *handle;
2061         struct lu_buf            buf    = { NULL };
2062         int                      rc;
2063
2064         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
2065         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
2066         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
2067          * MDT-object's FID::f_ver, instead it is the OST-object index in its
2068          * parent MDT-object's layout EA. */
2069         ff->ff_parent.f_stripe_idx = cpu_to_le32(offset);
2070         ost_layout_cpu_to_le(&ff->ff_layout, ol);
2071         lfsck_buf_init(&buf, ff, sizeof(*ff));
2072
2073         handle = dt_trans_create(env, dev);
2074         if (IS_ERR(handle))
2075                 RETURN(PTR_ERR(handle));
2076
2077         rc = dt_declare_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2078         if (rc != 0)
2079                 GOTO(stop, rc);
2080
2081         rc = dt_trans_start_local(env, dev, handle);
2082         if (rc != 0)
2083                 GOTO(stop, rc);
2084
2085         rc = dt_xattr_set(env, child, &buf, XATTR_NAME_FID, 0, handle);
2086
2087         GOTO(stop, rc);
2088
2089 stop:
2090         dt_trans_stop(env, dev, handle);
2091
2092         return rc;
2093 }
2094
2095 /**
2096  * \retval       +1: repaired
2097  * \retval        0: did nothing
2098  * \retval      -ve: on error
2099  */
2100 static int lfsck_layout_update_pfid(const struct lu_env *env,
2101                                     struct lfsck_component *com,
2102                                     struct dt_object *parent,
2103                                     struct lu_fid *cfid,
2104                                     struct dt_device *cdev,
2105                                     struct ost_layout *ol, __u32 ea_off)
2106 {
2107         struct dt_object        *child;
2108         int                      rc     = 0;
2109         ENTRY;
2110
2111         child = lfsck_object_find_by_dev(env, cdev, cfid);
2112         if (IS_ERR(child))
2113                 RETURN(PTR_ERR(child));
2114
2115         rc = __lfsck_layout_update_pfid(env, child,
2116                                         lu_object_fid(&parent->do_lu),
2117                                         ol, ea_off);
2118         lfsck_object_put(env, child);
2119
2120         RETURN(rc == 0 ? 1 : rc);
2121 }
2122
2123 static int lfsck_lovea_size(struct ost_layout *ol, __u32 ea_off)
2124 {
2125         if (ol->ol_comp_id != 0)
2126                 return sizeof(struct lov_comp_md_v1) +
2127                        sizeof(struct lov_comp_md_entry_v1) +
2128                        lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2129
2130         if (ol->ol_stripe_count != 0)
2131                 return lov_mds_md_size(ol->ol_stripe_count, LOV_MAGIC_V1);
2132
2133         return lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
2134 }
2135
2136 /**
2137  * This function will create the MDT-object with the given (partial) LOV EA.
2138  *
2139  * Under some data corruption cases, the MDT-object of the file may be lost,
2140  * but its OST-objects, or some of them are there. The layout LFSCK needs to
2141  * re-create the MDT-object with the orphan OST-object(s) information.
2142  *
2143  * On the other hand, the LFSCK may have created some OST-object for repairing
2144  * dangling LOV EA reference, but as the LFSCK processing, it may find that
2145  * the old OST-object is there and should replace the former new created OST
2146  * object. Unfortunately, some others have modified such newly created object.
2147  * To keep the data (both new and old), the LFSCK will create MDT-object with
2148  * new FID to reference the original OST-object.
2149  *
2150  * \param[in] env       pointer to the thread context
2151  * \param[in] com       pointer to the lfsck component
2152  * \param[in] ltd       pointer to target device descriptor
2153  * \param[in] rec       pointer to the record for the orphan OST-object
2154  * \param[in] cfid      pointer to FID for the orphan OST-object
2155  * \param[in] infix     additional information, such as the FID for original
2156  *                      MDT-object and the stripe offset in the LOV EA
2157  * \param[in] type      the type for describing why the orphan MDT-object is
2158  *                      created. The rules are as following:
2159  *
2160  *  type "C":           Multiple OST-objects claim the same MDT-object and the
2161  *                      same slot in the layout EA. Then the LFSCK will create
2162  *                      new MDT-object(s) to hold the conflict OST-object(s).
2163  *
2164  *  type "N":           The orphan OST-object does not know which one was the
2165  *                      real parent MDT-object, so the LFSCK uses new FID for
2166  *                      its parent MDT-object.
2167  *
2168  *  type "R":           The orphan OST-object knows its parent MDT-object FID,
2169  *                      but does not know the position (the file name) in the
2170  *                      layout.
2171  *
2172  *  type "D":           The MDT-object is a directory, it may know its parent
2173  *                      but because there is no valid linkEA, the LFSCK cannot
2174  *                      know where to put it back to the namespace.
2175  *  type "O":           The MDT-object has no linkEA, and there is no name
2176  *                      entry that references the MDT-object.
2177  *
2178  *  type "P":           The orphan object to be created was a parent directory
2179  *                      of some MDT-object which linkEA shows that the @orphan
2180  *                      object is missing.
2181  *
2182  * The orphan name will be like:
2183  * ${FID}-${infix}-${type}-${conflict_version}
2184  *
2185  * \param[in] ea_off    the stripe offset in the LOV EA
2186  *
2187  * \retval              positive on repaired something
2188  * \retval              0 if needs to repair nothing
2189  * \retval              negative error number on failure
2190  */
2191 static int lfsck_layout_recreate_parent(const struct lu_env *env,
2192                                         struct lfsck_component *com,
2193                                         struct lfsck_tgt_desc *ltd,
2194                                         struct lu_orphan_rec_v2 *rec,
2195                                         struct lu_fid *cfid,
2196                                         const char *infix,
2197                                         const char *type,
2198                                         __u32 ea_off)
2199 {
2200         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2201         struct dt_insert_rec            *dtrec  = &info->lti_dt_rec;
2202         char                            *name   = info->lti_key;
2203         struct lu_attr                  *la     = &info->lti_la2;
2204         struct dt_object_format         *dof    = &info->lti_dof;
2205         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2206         struct ost_layout               *ol     = &rec->lor_layout;
2207         struct lu_fid                   *pfid   = &rec->lor_rec.lor_fid;
2208         struct lu_fid                   *tfid   = &info->lti_fid3;
2209         struct dt_device                *dev    = lfsck->li_bottom;
2210         struct dt_object                *lpf    = lfsck->li_lpf_obj;
2211         struct dt_object                *pobj   = NULL;
2212         struct dt_object                *cobj   = NULL;
2213         struct thandle                  *th     = NULL;
2214         struct lu_buf                   *ea_buf = &info->lti_big_buf;
2215         struct lu_buf                    lov_buf;
2216         struct lfsck_lock_handle        *llh    = &info->lti_llh;
2217         struct linkea_data               ldata  = { NULL };
2218         struct lu_buf                    linkea_buf;
2219         const struct lu_name            *pname;
2220         int                              size   = 0;
2221         int                              idx    = 0;
2222         int                              rc     = 0;
2223         ENTRY;
2224
2225         if (unlikely(lpf == NULL))
2226                 GOTO(log, rc = -ENXIO);
2227
2228         /* We use two separated transactions to repair the inconsistency.
2229          *
2230          * 1) create the MDT-object locally.
2231          * 2) update the OST-object's PFID EA if necessary.
2232          *
2233          * If 1) succeed, but 2) failed, then the OST-object's PFID EA will be
2234          * updated when the layout LFSCK run next time.
2235          *
2236          * If 1) failed, but 2) succeed, then such MDT-object will be re-created
2237          * when the layout LFSCK run next time. */
2238
2239         if (fid_is_zero(pfid)) {
2240                 rc = lfsck_fid_alloc(env, lfsck, pfid, false);
2241                 if (rc != 0)
2242                         GOTO(log, rc);
2243
2244                 cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
2245                 if (IS_ERR(cobj))
2246                         GOTO(log, rc = PTR_ERR(cobj));
2247         }
2248
2249         pobj = lfsck_object_find_by_dev(env, dev, pfid);
2250         if (IS_ERR(pobj))
2251                 GOTO(log, rc = PTR_ERR(pobj));
2252
2253         LASSERT(infix != NULL);
2254         LASSERT(type != NULL);
2255
2256         memset(la, 0, sizeof(*la));
2257         la->la_uid = rec->lor_rec.lor_uid;
2258         la->la_gid = rec->lor_rec.lor_gid;
2259         la->la_mode = S_IFREG | S_IRUSR;
2260         la->la_valid = LA_MODE | LA_UID | LA_GID;
2261
2262         memset(dof, 0, sizeof(*dof));
2263         dof->dof_type = dt_mode_to_dft(S_IFREG);
2264         /* Because the dof->dof_reg.striped = 0, the LOD will not create
2265          * the stripe(s). The LFSCK will specify the LOV EA via
2266          * lfsck_layout_update_lovea(). */
2267
2268         size = lfsck_lovea_size(ol, ea_off);
2269         if (ea_buf->lb_len < size) {
2270                 lu_buf_realloc(ea_buf, size);
2271                 if (ea_buf->lb_buf == NULL)
2272                         GOTO(log, rc = -ENOMEM);
2273         }
2274
2275 again:
2276         do {
2277                 snprintf(name, NAME_MAX, DFID"%s-%s-%d", PFID(pfid), infix,
2278                          type, idx++);
2279                 rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
2280                                (const struct dt_key *)name);
2281                 if (rc != 0 && rc != -ENOENT)
2282                         GOTO(log, rc);
2283         } while (rc == 0);
2284
2285         rc = lfsck_lock(env, lfsck, lfsck->li_lpf_obj, name, llh,
2286                         MDS_INODELOCK_UPDATE, LCK_PW);
2287         if (rc != 0)
2288                 GOTO(log, rc);
2289
2290         /* Re-check whether the name conflicts with others after taking
2291          * the ldlm lock. */
2292         rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
2293                        (const struct dt_key *)name);
2294         if (unlikely(rc == 0)) {
2295                 lfsck_unlock(llh);
2296                 goto again;
2297         }
2298
2299         if (rc != -ENOENT)
2300                 GOTO(unlock, rc);
2301
2302         pname = lfsck_name_get_const(env, name, strlen(name));
2303         rc = linkea_links_new(&ldata, &lfsck_env_info(env)->lti_linkea_buf,
2304                               pname, lfsck_dto2fid(lfsck->li_lpf_obj));
2305         if (rc != 0)
2306                 GOTO(unlock, rc);
2307
2308         /* The 1st transaction. */
2309         th = dt_trans_create(env, dev);
2310         if (IS_ERR(th))
2311                 GOTO(unlock, rc = PTR_ERR(th));
2312
2313         rc = dt_declare_create(env, pobj, la, NULL, dof, th);
2314         if (rc != 0)
2315                 GOTO(stop, rc);
2316
2317         lfsck_buf_init(&lov_buf, ea_buf->lb_buf, size);
2318         rc = dt_declare_xattr_set(env, pobj, &lov_buf, XATTR_NAME_LOV,
2319                                   LU_XATTR_CREATE, th);
2320         if (rc != 0)
2321                 GOTO(stop, rc);
2322
2323         dtrec->rec_fid = pfid;
2324         dtrec->rec_type = S_IFREG;
2325         rc = dt_declare_insert(env, lpf,
2326                                (const struct dt_rec *)dtrec,
2327                                (const struct dt_key *)name, th);
2328         if (rc != 0)
2329                 GOTO(stop, rc);
2330
2331         lfsck_buf_init(&linkea_buf, ldata.ld_buf->lb_buf,
2332                        ldata.ld_leh->leh_len);
2333         rc = dt_declare_xattr_set(env, pobj, &linkea_buf,
2334                                   XATTR_NAME_LINK, 0, th);
2335         if (rc != 0)
2336                 GOTO(stop, rc);
2337
2338         rc = dt_trans_start_local(env, dev, th);
2339         if (rc != 0)
2340                 GOTO(stop, rc);
2341
2342         dt_write_lock(env, pobj, 0);
2343         rc = dt_create(env, pobj, la, NULL, dof, th);
2344         if (rc == 0)
2345                 rc = lfsck_layout_update_lovea(env, lfsck, th, ol, pobj, cfid,
2346                         &lov_buf, LU_XATTR_CREATE, ltd->ltd_index, ea_off);
2347         dt_write_unlock(env, pobj);
2348         if (rc < 0)
2349                 GOTO(stop, rc);
2350
2351         rc = dt_insert(env, lpf, (const struct dt_rec *)dtrec,
2352                        (const struct dt_key *)name, th, 1);
2353         if (rc != 0)
2354                 GOTO(stop, rc);
2355
2356         rc = dt_xattr_set(env, pobj, &linkea_buf, XATTR_NAME_LINK, 0, th);
2357         if (rc == 0 && cobj != NULL) {
2358                 dt_trans_stop(env, dev, th);
2359                 th = NULL;
2360
2361                 /* The 2nd transaction. */
2362                 rc = __lfsck_layout_update_pfid(env, cobj, pfid, ol, ea_off);
2363         }
2364
2365         GOTO(stop, rc);
2366
2367 stop:
2368         if (th != NULL)
2369                 dt_trans_stop(env, dev, th);
2370
2371 unlock:
2372         lfsck_unlock(llh);
2373
2374 log:
2375         if (cobj != NULL && !IS_ERR(cobj))
2376                 lfsck_object_put(env, cobj);
2377         if (pobj != NULL && !IS_ERR(pobj))
2378                 lfsck_object_put(env, pobj);
2379
2380         if (rc < 0)
2381                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant failed to "
2382                        "recreate the lost MDT-object: parent "DFID
2383                        ", child "DFID", OST-index %u, stripe-index %u, "
2384                        "infix %s, type %s: rc = %d\n",
2385                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
2386                        ltd->ltd_index, ea_off, infix, type, rc);
2387
2388         return rc >= 0 ? 1 : rc;
2389 }
2390
2391 static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
2392                                                    struct lfsck_component *com,
2393                                                    const struct lu_fid *fid,
2394                                                    __u32 index)
2395 {
2396         struct lfsck_thread_info *info  = lfsck_env_info(env);
2397         struct lfsck_request     *lr    = &info->lti_lr;
2398         struct lfsck_instance    *lfsck = com->lc_lfsck;
2399         struct lfsck_tgt_desc    *ltd;
2400         struct ptlrpc_request    *req;
2401         struct lfsck_request     *tmp;
2402         struct obd_export        *exp;
2403         int                       rc    = 0;
2404         ENTRY;
2405
2406         ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
2407         if (unlikely(ltd == NULL))
2408                 RETURN(-ENXIO);
2409
2410         exp = ltd->ltd_exp;
2411         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
2412                 GOTO(put, rc = -EOPNOTSUPP);
2413
2414         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
2415         if (req == NULL)
2416                 GOTO(put, rc = -ENOMEM);
2417
2418         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
2419         if (rc != 0) {
2420                 ptlrpc_request_free(req);
2421
2422                 GOTO(put, rc);
2423         }
2424
2425         memset(lr, 0, sizeof(*lr));
2426         lr->lr_event = LE_CONDITIONAL_DESTROY;
2427         lr->lr_active = LFSCK_TYPE_LAYOUT;
2428         lr->lr_fid = *fid;
2429
2430         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
2431         *tmp = *lr;
2432         ptlrpc_request_set_replen(req);
2433
2434         rc = ptlrpc_queue_wait(req);
2435         ptlrpc_req_finished(req);
2436
2437         GOTO(put, rc);
2438
2439 put:
2440         lfsck_tgt_put(ltd);
2441
2442         return rc;
2443 }
2444
2445 static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
2446                                                   struct lfsck_component *com,
2447                                                   struct lfsck_request *lr)
2448 {
2449         struct lfsck_thread_info        *info   = lfsck_env_info(env);
2450         struct lu_attr                  *la     = &info->lti_la;
2451         union ldlm_policy_data          *policy = &info->lti_policy;
2452         struct ldlm_res_id              *resid  = &info->lti_resid;
2453         struct lfsck_instance           *lfsck  = com->lc_lfsck;
2454         struct dt_device                *dev    = lfsck->li_bottom;
2455         struct lu_fid                   *fid    = &lr->lr_fid;
2456         struct dt_object                *obj;
2457         struct thandle                  *th     = NULL;
2458         struct lustre_handle             lh     = { 0 };
2459         __u64                            flags  = 0;
2460         int                              rc     = 0;
2461         ENTRY;
2462
2463         obj = lfsck_object_find_by_dev(env, dev, fid);
2464         if (IS_ERR(obj))
2465                 RETURN(PTR_ERR(obj));
2466
2467         dt_read_lock(env, obj, 0);
2468         if (dt_object_exists(obj) == 0 ||
2469             lfsck_is_dead_obj(obj)) {
2470                 dt_read_unlock(env, obj);
2471
2472                 GOTO(put, rc = -ENOENT);
2473         }
2474
2475         /* Get obj's attr without lock firstly. */
2476         rc = dt_attr_get(env, obj, la);
2477         dt_read_unlock(env, obj);
2478         if (rc != 0)
2479                 GOTO(put, rc);
2480
2481         if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
2482                 GOTO(put, rc = -ETXTBSY);
2483
2484         /* Acquire extent lock on [0, EOF] to sync with all possible written. */
2485         LASSERT(lfsck->li_namespace != NULL);
2486
2487         memset(policy, 0, sizeof(*policy));
2488         policy->l_extent.end = OBD_OBJECT_EOF;
2489         ost_fid_build_resid(fid, resid);
2490         rc = ldlm_cli_enqueue_local(lfsck->li_namespace, resid, LDLM_EXTENT,
2491                                     policy, LCK_EX, &flags, ldlm_blocking_ast,
2492                                     ldlm_completion_ast, NULL, NULL, 0,
2493                                     LVB_T_NONE, NULL, &lh);
2494         if (rc != ELDLM_OK)
2495                 GOTO(put, rc = -EIO);
2496
2497         dt_write_lock(env, obj, 0);
2498         /* Get obj's attr within lock again. */
2499         rc = dt_attr_get(env, obj, la);
2500         if (rc != 0)
2501                 GOTO(unlock, rc);
2502
2503         if (la->la_ctime != 0)
2504                 GOTO(unlock, rc = -ETXTBSY);
2505
2506         th = dt_trans_create(env, dev);
2507         if (IS_ERR(th))
2508                 GOTO(unlock, rc = PTR_ERR(th));
2509
2510         rc = dt_declare_ref_del(env, obj, th);
2511         if (rc != 0)
2512                 GOTO(stop, rc);
2513
2514         rc = dt_declare_destroy(env, obj, th);
2515         if (rc != 0)
2516                 GOTO(stop, rc);
2517
2518         rc = dt_trans_start_local(env, dev, th);
2519         if (rc != 0)
2520                 GOTO(stop, rc);
2521
2522         rc = dt_ref_del(env, obj, th);
2523         if (rc != 0)
2524                 GOTO(stop, rc);
2525
2526         rc = dt_destroy(env, obj, th);
2527         if (rc == 0)
2528                 CDEBUG(D_LFSCK, "%s: layout LFSCK destroyed the empty "
2529                        "OST-object "DFID" that was created for reparing "
2530                        "dangling referenced case. But the original missing "
2531                        "OST-object is found now.\n",
2532                        lfsck_lfsck2name(lfsck), PFID(fid));
2533
2534         GOTO(stop, rc);
2535
2536 stop:
2537         dt_trans_stop(env, dev, th);
2538
2539 unlock:
2540         dt_write_unlock(env, obj);
2541         ldlm_lock_decref(&lh, LCK_EX);
2542
2543 put:
2544         lfsck_object_put(env, obj);
2545
2546         return rc;
2547 }
2548
2549 /**
2550  * Some OST-object has occupied the specified layout EA slot.
2551  * Such OST-object may be generated by the LFSCK when repair
2552  * dangling referenced MDT-object, which can be indicated by
2553  * attr::la_ctime == 0 but without S_ISUID in la_mode. If it
2554  * is true and such OST-object has not been modified yet, we
2555  * will replace it with the orphan OST-object; otherwise the
2556  * LFSCK will create new MDT-object to reference the orphan.
2557  *
2558  * \retval       +1: repaired
2559  * \retval        0: did nothing
2560  * \retval      -ve: on error
2561  */
2562 static int lfsck_layout_conflict_create(const struct lu_env *env,
2563                                         struct lfsck_component *com,
2564                                         struct lfsck_tgt_desc *ltd,
2565                                         struct lu_orphan_rec_v2 *rec,
2566                                         struct dt_object *parent,
2567                                         struct lu_fid *cfid,
2568                                         struct lu_buf *ea_buf,
2569                                         struct lov_mds_md_v1 *lmm,
2570                                         struct lov_ost_data_v1 *slot,
2571                                         __u32 ea_off, int lovea_size)
2572 {
2573         struct lfsck_thread_info *info          = lfsck_env_info(env);
2574         struct lu_fid            *cfid2         = &info->lti_fid2;
2575         struct ost_id            *oi            = &info->lti_oi;
2576         struct dt_device         *dev           = lfsck_obj2dev(parent);
2577         struct thandle           *th            = NULL;
2578         struct lustre_handle      lh            = { 0 };
2579         __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
2580         int                       rc            = 0;
2581         ENTRY;
2582
2583         while (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val)) {
2584                 if (unlikely(!thread_is_running(&com->lc_lfsck->li_thread)))
2585                         RETURN(0);
2586         }
2587
2588         ostid_le_to_cpu(&slot->l_ost_oi, oi);
2589         rc = ostid_to_fid(cfid2, oi, ost_idx2);
2590         if (rc != 0)
2591                 GOTO(out, rc);
2592
2593         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
2594                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2595                               LCK_EX);
2596         if (rc != 0)
2597                 GOTO(out, rc);
2598
2599         rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
2600
2601         /* If the conflict OST-obejct is not created for fixing dangling
2602          * referenced MDT-object in former LFSCK check/repair, or it has
2603          * been modified by others, then we cannot destroy it. Re-create
2604          * a new MDT-object for the orphan OST-object. */
2605         if (rc == -ETXTBSY) {
2606                 /* No need the layout lock on the original parent. */
2607                 lfsck_ibits_unlock(&lh, LCK_EX);
2608
2609                 fid_zero(&rec->lor_rec.lor_fid);
2610                 snprintf(info->lti_tmpbuf, sizeof(info->lti_tmpbuf),
2611                          "-"DFID"-%x", PFID(lu_object_fid(&parent->do_lu)),
2612                          ea_off);
2613                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
2614                                                 info->lti_tmpbuf, "C", ea_off);
2615
2616                 RETURN(rc);
2617         }
2618
2619         if (rc != 0 && rc != -ENOENT)
2620                 GOTO(unlock, rc);
2621
2622         th = dt_trans_create(env, dev);
2623         if (IS_ERR(th))
2624                 GOTO(unlock, rc = PTR_ERR(th));
2625
2626         rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
2627                                   LU_XATTR_REPLACE, th);
2628         if (rc != 0)
2629                 GOTO(stop, rc);
2630
2631         rc = dt_trans_start_local(env, dev, th);
2632         if (rc != 0)
2633                 GOTO(stop, rc);
2634
2635         dt_write_lock(env, parent, 0);
2636         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2637         rc = lfsck_layout_refill_lovea(env, com->lc_lfsck, th, parent, cfid,
2638                                        ea_buf, lmm, slot, LU_XATTR_REPLACE,
2639                                        ltd->ltd_index, lovea_size);
2640         dt_write_unlock(env, parent);
2641
2642         GOTO(stop, rc);
2643
2644 stop:
2645         dt_trans_stop(env, dev, th);
2646
2647 unlock:
2648         lfsck_ibits_unlock(&lh, LCK_EX);
2649
2650 out:
2651         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant replaced the conflict "
2652                "OST-object "DFID" on the OST %x with the orphan "DFID" on "
2653                "the OST %x: parent "DFID", stripe-index %u: rc = %d\n",
2654                lfsck_lfsck2name(com->lc_lfsck), PFID(cfid2), ost_idx2,
2655                PFID(cfid), ltd->ltd_index, PFID(lfsck_dto2fid(parent)),
2656                ea_off, rc);
2657
2658         return rc >= 0 ? 1 : rc;
2659 }
2660
2661 /**
 * Make the LOV EA of the MDT-object \a parent reference the orphan
 * OST-object \a cfid (located on the OST \a ost_idx) at the stripe
 * index \a ea_off.
 *
 * The LOV EA is read under the parent's layout/xattr ibits lock and
 * the parent's write lock. Depending on what is found:
 * - no LOV EA, an empty one, or a header for which the verification
 *   returns -EINVAL: the EA is (re)built by lfsck_layout_update_lovea();
 * - a composite layout without a component overlapping the extent
 *   [ol_comp_start, ol_comp_end): lfsck_layout_add_comp_comp();
 * - a stripe count not larger than \a ea_off:
 *   lfsck_layout_extend_v1v3_lovea();
 * - otherwise the slots are scanned in order: a dummy slot at \a ea_off
 *   is filled by lfsck_layout_refill_lovea(); \a cfid found at \a ea_off
 *   means nothing to do; \a cfid found at another slot leads to
 *   lfsck_layout_update_pfid(); if the scan finishes without a match the
 *   slot is occupied by others and lfsck_layout_conflict_create() is
 *   called.
 *
 * With LPF_DRYRUN set in the bookmark no transaction is created and 1 is
 * returned at the point where a modification would have been made.
 *
2662  * \retval       +1: repaired
2663  * \retval        0: did nothing
2664  * \retval      -ve: on error
2665  */
2666 static int lfsck_layout_recreate_lovea(const struct lu_env *env,
2667                                        struct lfsck_component *com,
2668                                        struct lfsck_tgt_desc *ltd,
2669                                        struct lu_orphan_rec_v2 *rec,
2670                                        struct dt_object *parent,
2671                                        struct lu_fid *cfid,
2672                                        __u32 ost_idx, __u32 ea_off)
2673 {
2674         struct lfsck_thread_info *info          = lfsck_env_info(env);
2675         struct lu_buf            *buf           = &info->lti_big_buf;
2676         struct lu_fid            *fid           = &info->lti_fid2;
2677         struct ost_id            *oi            = &info->lti_oi;
2678         struct lfsck_instance    *lfsck         = com->lc_lfsck;
2679         struct dt_device         *dt            = lfsck_obj2dev(parent);
2680         struct lfsck_bookmark    *bk            = &lfsck->li_bookmark_ram;
2681         struct ost_layout        *ol            = &rec->lor_layout;
2682         struct lov_comp_md_v1    *lcm           = NULL;
2683         struct lov_comp_md_entry_v1 *lcme       = NULL;
2684         struct thandle           *handle        = NULL;
2685         size_t                    lovea_size;
2686         struct lov_mds_md_v1     *lmm;
2687         struct lov_ost_data_v1   *objs;
2688         struct lustre_handle      lh            = { 0 };
2689         __u32                     magic;
2690         __u32 flags = 0;
2691         int                       fl            = 0;
2692         int                       rc            = 0;
2693         int                       rc1;
2694         int                       i;
2695         __u16                     count;
2696         bool                      locked        = false;
2697         ENTRY;
2698
2699         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
2700                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
2701                               LCK_EX);
2702         if (rc != 0) {
2703                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant failed to recreate "
2704                        "LOV EA for "DFID": parent "DFID", OST-index %u, "
2705                        "stripe-index %u, comp_id %u, comp_start %llu, "
2706                        "comp_end %llu: rc = %d\n",
2707                        lfsck_lfsck2name(lfsck), PFID(cfid),
2708                        PFID(lfsck_dto2fid(parent)), ost_idx, ea_off,
2709                        ol->ol_comp_id, ol->ol_comp_start,
2710                        ol->ol_comp_end, rc);
2711
2712                 RETURN(rc);
2713         }
2714
             /* Retry point. On arrival rc is either a negative errno or the
              * LOV EA buffer size to use; the first pass gets here with the
              * rc == 0 of the successful lfsck_ibits_lock() above. The parent
              * write lock and the transaction left over from the previous
              * attempt, if any, are dropped before starting over. */
2715 again:
2716         if (locked) {
2717                 dt_write_unlock(env, parent);
2718                 locked = false;
2719         }
2720
2721         if (handle != NULL) {
2722                 dt_trans_stop(env, dt, handle);
2723                 handle = NULL;
2724         }
2725
2726         if (rc < 0)
2727                 GOTO(unlock_layout, rc);
2728
             /* Grow the per-thread buffer to the requested size if needed. */
2729         lovea_size = rc;
2730         if (buf->lb_len < lovea_size) {
2731                 lu_buf_realloc(buf, lovea_size);
2732                 if (buf->lb_buf == NULL)
2733                         GOTO(unlock_layout, rc = -ENOMEM);
2734         }
2735
             /* Dry-run mode creates no transaction, so handle stays NULL.
              * Otherwise the xattr set is declared against the current
              * buffer with the flag left in fl by the previous pass (0 on
              * the first pass). */
2736         if (!(bk->lb_param & LPF_DRYRUN)) {
2737                 handle = dt_trans_create(env, dt);
2738                 if (IS_ERR(handle))
2739                         GOTO(unlock_layout, rc = PTR_ERR(handle));
2740
2741                 rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
2742                                           fl, handle);
2743                 if (rc != 0)
2744                         GOTO(stop, rc);
2745
2746                 rc = dt_trans_start_local(env, dt, handle);
2747                 if (rc != 0)
2748                         GOTO(stop, rc);
2749         }
2750
2751         dt_write_lock(env, parent, 0);
2752         locked = true;
2753         rc = dt_xattr_get(env, parent, buf, XATTR_NAME_LOV);
2754         if (rc == -ERANGE) {
                     /* The buffer is too small: query again with a NULL
                      * buffer and hand the result (presumably the EA size,
                      * or an error) to the retry point through rc. */
2755                 rc = dt_xattr_get(env, parent, &LU_BUF_NULL, XATTR_NAME_LOV);
2756                 LASSERT(rc != 0);
2757                 goto again;
2758         } else if (rc == -ENODATA || rc == 0) {
                     /* No LOV EA, or an empty one: it has to be created. */
2759                 lovea_size = lfsck_lovea_size(ol, ea_off);
2760                 /* If the declared is not big enough, re-try. */
2761                 if (buf->lb_len < lovea_size) {
2762                         rc = lovea_size;
2763                         goto again;
2764                 }
2765                 fl = LU_XATTR_CREATE;
2766         } else if (rc < 0) {
2767                 GOTO(unlock_parent, rc);
2768         } else if (unlikely(buf->lb_len == 0)) {
                     /* rc > 0 here while the buffer is still empty: retry so
                      * that the buffer gets allocated with rc bytes first. */
2769                 goto again;
2770         } else {
                     /* An existing LOV EA of rc bytes is now in the buffer. */
2771                 fl = LU_XATTR_REPLACE;
2772                 lovea_size = rc;
2773         }
2774
2775         if (fl == LU_XATTR_CREATE) {
2776                 if (bk->lb_param & LPF_DRYRUN)
2777                         GOTO(unlock_parent, rc = 1);
2778
2779                 LASSERT(buf->lb_len >= lovea_size);
2780
2781                 rc = lfsck_layout_update_lovea(env, lfsck, handle, ol, parent,
2782                                                cfid, buf, fl, ost_idx, ea_off);
2783
2784                 GOTO(unlock_parent, rc);
2785         }
2786
2787         lmm = buf->lb_buf;
2788         rc1 = lfsck_layout_verify_header(parent, lmm);
2789
2790         /* If the LOV EA crashed, then rebuild it. */
2791         if (rc1 == -EINVAL) {
2792                 if (bk->lb_param & LPF_DRYRUN)
2793                         GOTO(unlock_parent, rc = 1);
2794
2795                 LASSERT(buf->lb_len >= lovea_size);
2796
2797                 rc = lfsck_layout_update_lovea(env, lfsck, handle, ol, parent,
2798                                                cfid, buf, fl, ost_idx, ea_off);
2799
2800                 GOTO(unlock_parent, rc);
2801         }
2802
2803         /* For other unknown magic/pattern, keep the current LOV EA. */
2804         if (rc1 != 0)
2805                 GOTO(unlock_parent, rc = rc1);
2806
2807         magic = le32_to_cpu(lmm->lmm_magic);
2808         if (magic == LOV_MAGIC_COMP_V1) {
2809                 __u64 start;
2810                 __u64 end;
2811
2812                 if (bk->lb_param & LPF_DRYRUN)
2813                         GOTO(unlock_parent, rc = 1);
2814
                     /* Walk the component entries in order: skip those that
                      * end at or before ol_comp_start, stop at the first one
                      * that starts at or beyond ol_comp_end, and continue at
                      * "further" with the first entry in between. */
2815                 lcm = buf->lb_buf;
2816                 count = le16_to_cpu(lcm->lcm_entry_count);
2817                 for (i = 0; i < count; i++) {
2818                         lcme = &lcm->lcm_entries[i];
2819                         start = le64_to_cpu(lcme->lcme_extent.e_start);
2820                         end = le64_to_cpu(lcme->lcme_extent.e_end);
2821
2822                         if (end <= ol->ol_comp_start)
2823                                 continue;
2824
2825                         if (start >= ol->ol_comp_end)
2826                                 break;
2827
2828                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
2829                         magic = le32_to_cpu(lmm->lmm_magic);
2830                         flags = le32_to_cpu(lcme->lcme_flags);
2831                         goto further;
2832                 }
2833
                     /* No overlapping component was found: add one. i is
                      * the entry index at which the scan above stopped. */
2834                 rc = lfsck_layout_add_comp_comp(env, lfsck, handle, ol, parent,
2835                                                cfid, buf, ost_idx, ea_off, i);
2836
2837                 GOTO(unlock_parent, rc);
2838         }
2839
             /* From here on lmm and magic describe either the whole
              * non-composite LOV EA (lcme == NULL) or the component selected
              * above (lcme != NULL, flags holds its lcme_flags). */
2840 further:
2841         count = le16_to_cpu(lmm->lmm_stripe_count);
2842         if (count == 0)
2843                 GOTO(unlock_parent, rc = -EINVAL);
             /* NOTE(review): redundant after the count == 0 check above. */
2844         LASSERT(count > 0);
2845
2846         /* Exceed the current end of MDT-object layout EA. Then extend it. */
2847         if (count <= ea_off) {
2848                 if (bk->lb_param & LPF_DRYRUN)
2849                         GOTO(unlock_parent, rc = 1);
2850
2851                 lovea_size = lov_mds_md_size(ea_off + 1, magic);
2852                 /* If the declared is not big enough, re-try. */
2853                 if (buf->lb_len < lovea_size) {
2854                         rc = lovea_size;
2855                         goto again;
2856                 }
2857
                     /* Set LCME_FL_INIT on the component entry in the buffer
                      * if it is not set yet. */
2858                 if (lcme && !(flags & LCME_FL_INIT))
2859                         lcme->lcme_flags = cpu_to_le32(flags | LCME_FL_INIT);
2860
2861                 rc = lfsck_layout_extend_v1v3_lovea(env, lfsck, handle, ol,
2862                                         parent, cfid, buf, ost_idx, ea_off);
2863
2864                 GOTO(unlock_parent, rc);
2865         }
2866
             /* rc still holds the positive size returned by dt_xattr_get(). */
2867         LASSERTF(rc > 0, "invalid rc = %d\n", rc);
2868
2869         if (magic == LOV_MAGIC_V1) {
2870                 objs = &lmm->lmm_objects[0];
2871         } else {
2872                 LASSERT(magic == LOV_MAGIC_V3);
2873                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
2874         }
2875
             /* Scan all the slots. For a component without LCME_FL_INIT
              * every slot is handled like a dummy one. */
2876         for (i = 0; i < count; i++, objs++) {
2877                 /* The MDT-object was created via lfsck_layout_recover_create()
2878                  * by others before, and we fill the dummy layout EA. */
2879                 if ((lcme && !(flags & LCME_FL_INIT)) ||
2880                      lovea_slot_is_dummy(objs)) {
2881                         if (i != ea_off)
2882                                 continue;
2883
2884                         if (bk->lb_param & LPF_DRYRUN)
2885                                 GOTO(unlock_parent, rc = 1);
2886
2887                         lmm->lmm_layout_gen =
2888                             cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
2889                         if (lcme) {
2890                                 LASSERT(lcm);
2891
                                     /* The stripe size, stripe count and the
                                      * extent of the component must match what
                                      * the orphan recorded, otherwise give up
                                      * with -EINVAL. */
2892                                 if (le32_to_cpu(lmm->lmm_stripe_size) !=
2893                                         ol->ol_stripe_size ||
2894                                     le16_to_cpu(lmm->lmm_stripe_count) !=
2895                                         ol->ol_stripe_count ||
2896                                     le64_to_cpu(lcme->lcme_extent.e_start) !=
2897                                         ol->ol_comp_start ||
2898                                     le64_to_cpu(lcme->lcme_extent.e_end) !=
2899                                         ol->ol_comp_end) {
2900                                         CDEBUG(D_LFSCK, "%s: found invalid "
2901                                         "component for "DFID ": parent "DFID
2902                                         ", stripe-index %u, stripe_size %u, "
2903                                         "stripe_count %u, comp_id %u, "
2904                                         "comp_start %llu, comp_end %llu, "
2905                                         "cur_stripe_size %u, "
2906                                         "cur_stripe_count %u, "
2907                                         "cur_comp_start %llu, "
2908                                         "cur_comp_end %llu\n",
2909                                         lfsck_lfsck2name(lfsck), PFID(cfid),
2910                                         PFID(lfsck_dto2fid(parent)), ea_off,
2911                                         ol->ol_stripe_size,
2912                                         ol->ol_stripe_count, ol->ol_comp_id,
2913                                         ol->ol_comp_start, ol->ol_comp_end,
2914                                         le32_to_cpu(lmm->lmm_stripe_size),
2915                                         le16_to_cpu(lmm->lmm_stripe_count),
2916                                         le64_to_cpu(lcme->lcme_extent.e_start),
2917                                         le64_to_cpu(lcme->lcme_extent.e_end));
2918
2919                                         GOTO(unlock_parent, rc = -EINVAL);
2920                                 }
2921
                                     /* Bump the composite layout generation as
                                      * well, and write back the whole composite
                                      * EA (lcm_size bytes). */
2922                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
2923                                 lovea_size = le32_to_cpu(lcm->lcm_size);
2924                                 if (!(flags & LCME_FL_INIT))
2925                                         lcme->lcme_flags = cpu_to_le32(flags |
2926                                                                 LCME_FL_INIT);
2927                         }
2928
2929                         LASSERTF(buf->lb_len >= lovea_size,
2930                                  "buffer len %d is less than real size %d\n",
2931                                  (int)buf->lb_len, (int)lovea_size);
2932
2933                         rc = lfsck_layout_refill_lovea(env, lfsck, handle,
2934                                                 parent, cfid, buf, lmm, objs,
2935                                                 fl, ost_idx, lovea_size);
2936
2937                         CDEBUG(D_LFSCK, "%s layout LFSCK assistant fill "
2938                                "dummy layout slot for "DFID": parent "DFID
2939                                ", OST-index %u, stripe-index %u: rc = %d\n",
2940                                lfsck_lfsck2name(lfsck), PFID(cfid),
2941                                PFID(lfsck_dto2fid(parent)), ost_idx, i, rc);
2942
2943                         GOTO(unlock_parent, rc);
2944                 }
2945
                     /* A real slot: convert its content into a FID and
                      * compare it with the orphan's FID. */
2946                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
2947                 rc = ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
2948                 if (rc != 0) {
2949                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
2950                                "invalid layout EA at the slot %d, index %u\n",
2951                                lfsck_lfsck2name(lfsck),
2952                                PFID(lfsck_dto2fid(parent)), i,
2953                                le32_to_cpu(objs->l_ost_idx));
2954
2955                         GOTO(unlock_parent, rc);
2956                 }
2957
2958                 /* It should be rare case, the slot is there, but the LFSCK
2959                  * does not handle it during the first-phase cycle scanning. */
2960                 if (unlikely(lu_fid_eq(fid, cfid))) {
2961                         if (i == ea_off) {
2962                                 GOTO(unlock_parent, rc = 0);
2963                         } else {
2964                                 /* Rare case that the OST-object index
2965                                  * does not match the parent MDT-object
2966                                  * layout EA. We trust the later one. */
2967                                 if (bk->lb_param & LPF_DRYRUN)
2968                                         GOTO(unlock_parent, rc = 1);
2969
                                     /* Release the write lock, the transaction
                                      * and the ibits lock before calling
                                      * lfsck_layout_update_pfid(). */
2970                                 dt_write_unlock(env, parent);
2971                                 if (handle != NULL)
2972                                         dt_trans_stop(env, dt, handle);
2973                                 lfsck_ibits_unlock(&lh, LCK_EX);
2974                                 rc = lfsck_layout_update_pfid(env, com, parent,
2975                                                         cfid, ltd->ltd_tgt,
2976                                                         ol, i);
2977
2978                                 CDEBUG(D_LFSCK, "%s layout LFSCK assistant "
2979                                        "updated OST-object's pfid for "DFID
2980                                        ": parent "DFID", OST-index %u, "
2981                                        "stripe-index %u: rc = %d\n",
2982                                        lfsck_lfsck2name(lfsck), PFID(cfid),
2983                                        PFID(lfsck_dto2fid(parent)),
2984                                        ltd->ltd_index, i, rc);
2985
2986                                 RETURN(rc);
2987                         }
2988                 }
2989         }
2990
2991         /* The MDT-object exists, but related layout EA slot is occupied
2992          * by others. */
2993         if (bk->lb_param & LPF_DRYRUN)
2994                 GOTO(unlock_parent, rc = 1);
2995
             /* lfsck_layout_conflict_create() takes the ibits lock and runs
              * its own transaction, so everything held here is released
              * first. NOTE(review): buf, lmm and objs passed below still
              * refer to the EA content read before the locks were dropped. */
2996         dt_write_unlock(env, parent);
2997         if (handle != NULL)
2998                 dt_trans_stop(env, dt, handle);
2999         lfsck_ibits_unlock(&lh, LCK_EX);
3000         if (magic == LOV_MAGIC_V1)
3001                 objs = &lmm->lmm_objects[ea_off];
3002         else
3003                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[ea_off];
3004         rc = lfsck_layout_conflict_create(env, com, ltd, rec, parent, cfid,
3005                                           buf, lmm, objs, ea_off, lovea_size);
3006
3007         RETURN(rc);
3008
             /* Common exit path: undo in reverse order of acquisition. */
3009 unlock_parent:
3010         if (locked)
3011                 dt_write_unlock(env, parent);
3012
3013 stop:
3014         if (handle != NULL)
3015                 dt_trans_stop(env, dt, handle);
3016
3017 unlock_layout:
3018         lfsck_ibits_unlock(&lh, LCK_EX);
3019
3020         return rc;
3021 }
3022
3023 static int lfsck_layout_scan_orphan_one(const struct lu_env *env,
3024                                         struct lfsck_component *com,
3025                                         struct lfsck_tgt_desc *ltd,
3026                                         struct lu_orphan_rec_v2 *rec,
3027                                         struct lu_fid *cfid)
3028 {
3029         struct lfsck_layout     *lo     = com->lc_file_ram;
3030         struct lu_fid           *pfid   = &rec->lor_rec.lor_fid;
3031         struct dt_object        *parent = NULL;
3032         __u32                    ea_off = pfid->f_stripe_idx;
3033         int                      rc     = 0;
3034         ENTRY;
3035
3036         if (!fid_is_sane(cfid))
3037                 GOTO(out, rc = -EINVAL);
3038
3039         pfid->f_ver = 0;
3040         if (fid_is_zero(pfid)) {
3041                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3042                                                   "", "N", ea_off);
3043                 GOTO(out, rc);
3044         }
3045
3046         if (!fid_is_sane(pfid))
3047                 GOTO(out, rc = -EINVAL);
3048
3049         parent = lfsck_object_find_by_dev(env, com->lc_lfsck->li_bottom, pfid);
3050         if (IS_ERR(parent))
3051                 GOTO(out, rc = PTR_ERR(parent));
3052
3053         if (unlikely(dt_object_remote(parent) != 0))
3054                 GOTO(put, rc = -EXDEV);
3055
3056         if (dt_object_exists(parent) == 0) {
3057                 lfsck_object_put(env, parent);
3058                 rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
3059                                                   "", "R", ea_off);
3060                 GOTO(out, rc);
3061         }
3062
3063         if (!S_ISREG(lu_object_attr(&parent->do_lu)))
3064                 GOTO(put, rc = -EISDIR);
3065
3066         /* The orphan OST-object claims to be the parent's stripe, then
3067          * related dangling record in the trace file is meaningless. */
3068         rc = lfsck_layout_del_dangling_rec(env, com, pfid,
3069                                            rec->lor_layout.ol_comp_id, ea_off);
3070         if (rc && rc != -ENOENT)
3071                 GOTO(put, rc);
3072
3073         rc = lfsck_layout_recreate_lovea(env, com, ltd, rec, parent, cfid,
3074                                          ltd->ltd_index, ea_off);
3075
3076         GOTO(put, rc);
3077
3078 put:
3079         if (rc <= 0)
3080                 lfsck_object_put(env, parent);
3081         else
3082                 /* The layout EA is changed, need to be reloaded next time. */
3083                 dt_object_put_nocache(env, parent);
3084
3085 out:
3086         down_write(&com->lc_sem);
3087         com->lc_new_scanned++;
3088         com->lc_new_checked++;
3089         if (rc > 0) {
3090                 lo->ll_objs_repaired[LLIT_ORPHAN - 1]++;
3091                 rc = 0;
3092         } else if (rc < 0) {
3093                 lo->ll_objs_failed_phase2++;
3094         }
3095         up_write(&com->lc_sem);
3096
3097         return rc;
3098 }
3099
3100 static int lfsck_layout_scan_orphan(const struct lu_env *env,
3101                                     struct lfsck_component *com,
3102                                     struct lfsck_tgt_desc *ltd)
3103 {
3104         struct lfsck_assistant_data     *lad    = com->lc_data;
3105         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3106         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
3107         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3108         struct lu_fid                   *fid    = &info->lti_fid;
3109         struct dt_object                *obj;
3110         const struct dt_it_ops          *iops;
3111         struct dt_it                    *di;
3112         int                              rc     = 0;
3113         ENTRY;
3114
3115         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant starts the orphan "
3116                "scanning for OST%04x\n",
3117                lfsck_lfsck2name(lfsck), ltd->ltd_index);
3118
3119         if (cfs_bitmap_check(lad->lad_bitmap, ltd->ltd_index)) {
3120                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant skip the orphan "
3121                        "scanning for OST%04x\n",
3122                        lfsck_lfsck2name(lfsck), ltd->ltd_index);
3123
3124                 RETURN(0);
3125         }
3126
3127         fid->f_seq = fid_idif_seq(0, ltd->ltd_index);
3128         fid->f_oid = fid->f_ver = 0;
3129
3130         obj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, fid);
3131         if (unlikely(IS_ERR(obj)))
3132                 GOTO(log, rc = PTR_ERR(obj));
3133
3134         rc = obj->do_ops->do_index_try(env, obj,
3135                                        &dt_lfsck_layout_orphan_features);
3136         if (rc != 0)
3137                 GOTO(put, rc);
3138
3139         iops = &obj->do_index_ops->dio_it;
3140         di = iops->init(env, obj, 0);
3141         if (IS_ERR(di))
3142                 GOTO(put, rc = PTR_ERR(di));
3143
3144         rc = iops->load(env, di, 0);
3145         if (rc == -ESRCH) {
3146                 /* -ESRCH means that the orphan OST-objects rbtree has been
3147                  * cleanup because of the OSS server restart or other errors. */
3148                 lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
3149                 GOTO(fini, rc);
3150         }
3151
3152         if (rc == 0)
3153                 rc = iops->next(env, di);
3154         else if (rc > 0)
3155                 rc = 0;
3156
3157         if (rc < 0)
3158                 GOTO(fini, rc);
3159
3160         if (rc > 0)
3161                 GOTO(fini, rc = 0);
3162
3163         do {
3164                 struct dt_key           *key;
3165                 struct lu_orphan_rec_v2 *rec = &info->lti_rec;
3166
3167                 if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
3168                     unlikely(!thread_is_running(&lfsck->li_thread)))
3169                         break;
3170
3171                 key = iops->key(env, di);
3172                 com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
3173                 /* Remote target OST may be runnning old LFSCK */
3174                 memset(rec, 0, sizeof(*rec));
3175                 rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
3176                 if (rc == 0)
3177                         rc = lfsck_layout_scan_orphan_one(env, com, ltd, rec,
3178                                         &com->lc_fid_latest_scanned_phase2);
3179                 if (rc != 0 && bk->lb_param & LPF_FAILOUT)
3180                         GOTO(fini, rc);
3181
3182                 lfsck_control_speed_by_self(com);
3183                 do {
3184                         rc = iops->next(env, di);
3185                 } while (rc < 0 && !(bk->lb_param & LPF_FAILOUT));
3186         } while (rc == 0);
3187
3188         GOTO(fini, rc);
3189
3190 fini:
3191         iops->put(env, di);
3192         iops->fini(env, di);
3193 put:
3194         lfsck_object_put(env, obj);
3195
3196 log:
3197         CDEBUG(D_LFSCK, "%s: layout LFSCK assistant finished the orphan "
3198                "scanning for OST%04x: rc = %d\n",
3199                lfsck_lfsck2name(lfsck), ltd->ltd_index, rc);
3200
3201         return rc > 0 ? 0 : rc;
3202 }
3203
3204 static int lfsck_lmm2layout(struct lov_mds_md_v1 *lmm, struct ost_layout *ol,
3205                             __u32 comp_id)
3206 {
3207         __u32 magic = le32_to_cpu(lmm->lmm_magic);
3208         int rc = 0;
3209         ENTRY;
3210
3211         if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3) {
3212                 ol->ol_stripe_size = lmm->lmm_stripe_size;
3213                 ol->ol_stripe_count = lmm->lmm_stripe_count;
3214                 ol->ol_comp_start = 0;
3215                 ol->ol_comp_end = 0;
3216                 ol->ol_comp_id = 0;
3217         } else if (magic == LOV_MAGIC_COMP_V1) {
3218                 struct lov_comp_md_v1 *lcm = (struct lov_comp_md_v1 *)lmm;
3219                 struct lov_comp_md_entry_v1 *lcme = NULL;
3220                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3221                 int i;
3222
3223                 for (i = 0; i < count; i++) {
3224                         lcme = &lcm->lcm_entries[i];
3225                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3226                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3227                                         LCME_FL_INIT);
3228
3229                                 break;
3230                         }
3231                 }
3232
3233                 /* The comp has been removed, do nothing. */
3234                 if (i == count)
3235                         GOTO(out, rc = 1);
3236
3237                 lmm = (void *)lmm + le32_to_cpu(lcme->lcme_offset);
3238                 ol->ol_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
3239                 ol->ol_stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
3240                 ol->ol_comp_start = le64_to_cpu(lcme->lcme_extent.e_start);
3241                 ol->ol_comp_end = le64_to_cpu(lcme->lcme_extent.e_end);
3242                 ol->ol_comp_id = le32_to_cpu(lcme->lcme_id);
3243         } else {
3244                 GOTO(out, rc = -EINVAL);
3245         }
3246
3247         EXIT;
3248
3249 out:
3250         return rc;
3251 }
3252
3253 /**
3254  * Repair the MDT-object with dangling LOV EA reference.
3255  *
3256  * we need to repair the inconsistency according to the users' requirement:
3257  *
3258  * 1) Keep the inconsistency there and report the inconsistency case,
3259  *    then give the chance to the application to find related issues,
3260  *    and the users can make the decision about how to handle it with
3261  *    more human knownledge. (by default)
3262  *
3263  * 2) Re-create the missing OST-object with the FID/owner information.
3264  *
3265  * \param[in] env       pointer to the thread context
3266  * \param[in] com       the layout LFSCK component
3267  * \param[in] parent    the MDT-object with dangling LOV EA reference
3268  * \param[in] child     the OST-object to be created
3269  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3270  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3271  * \param[in] ost_idx   the index of OST on which the OST-object resides
3272  *
3273  * \retval              +1 for repair successfully
3274  * \retval              0 for did nothing
3275  * \retval              negative error number on failure
3276  */
3277 static int __lfsck_layout_repair_dangling(const struct lu_env *env,
3278                                           struct lfsck_component *com,
3279                                           struct dt_object *parent,
3280                                           struct dt_object *child,
3281                                           __u32 comp_id, __u32 ea_off,
3282                                           __u32 ost_idx, bool log)
3283 {
3284         struct lfsck_thread_info *info = lfsck_env_info(env);
3285         struct filter_fid *ff = &info->lti_ff;
3286         struct ost_layout *ol = &ff->ff_layout;
3287         struct dt_object_format *dof = &info->lti_dof;
3288         struct lu_attr *la = &info->lti_la;
3289         struct lfsck_instance *lfsck = com->lc_lfsck;
3290         struct dt_device *dev = lfsck_obj2dev(child);
3291         const struct lu_fid *pfid = lfsck_dto2fid(parent);
3292         const struct lu_fid *cfid = lfsck_dto2fid(child);
3293         struct lu_buf *tbuf = &info->lti_big_buf;
3294         struct thandle *handle;
3295         struct lu_buf *buf;
3296         struct lustre_handle lh = { 0 };
3297         int rc;
3298         ENTRY;
3299
3300         if (!(lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ))
3301                 GOTO(log, rc = 1);
3302
3303         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3304                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3305                               LCK_EX);
3306         if (rc != 0)
3307                 GOTO(log, rc);
3308
3309         rc = dt_attr_get(env, parent, la);
3310         if (rc != 0)
3311                 GOTO(unlock1, rc);
3312
3313         la->la_mode = S_IFREG | 0666;
3314         la->la_atime = la->la_mtime = la->la_ctime = 0;
3315         la->la_valid = LA_TYPE | LA_MODE | LA_UID | LA_GID |
3316                        LA_ATIME | LA_MTIME | LA_CTIME;
3317         memset(dof, 0, sizeof(*dof));
3318         ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
3319         ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
3320         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3321          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3322          * parent MDT-object's layout EA. */
3323         ff->ff_parent.f_stripe_idx = cpu_to_le32(ea_off);
3324
3325         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3326         if (rc < 0)
3327                 GOTO(unlock1, rc);
3328
3329         rc = lfsck_lmm2layout(tbuf->lb_buf, ol, comp_id);
3330         if (rc)
3331                 GOTO(unlock1, rc);
3332
3333         buf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
3334         handle = dt_trans_create(env, dev);
3335         if (IS_ERR(handle))
3336                 GOTO(unlock1, rc = PTR_ERR(handle));
3337
3338         rc = dt_declare_create(env, child, la, NULL, dof, handle);
3339         if (rc != 0)
3340                 GOTO(stop, rc);
3341
3342         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID,
3343                                   LU_XATTR_CREATE, handle);
3344         if (rc != 0)
3345                 GOTO(stop, rc);
3346
3347         rc = dt_trans_start_local(env, dev, handle);
3348         if (rc != 0)
3349                 GOTO(stop, rc);
3350
3351         dt_read_lock(env, parent, 0);
3352         if (unlikely(lfsck_is_dead_obj(parent)))
3353                 GOTO(unlock2, rc = 0);
3354
3355         if (lfsck->li_bookmark_ram.lb_param & LPF_DELAY_CREATE_OSTOBJ) {
3356                 struct ost_id *oi = &info->lti_oi;
3357                 struct lu_fid *tfid = &info->lti_fid2;
3358                 struct lu_buf *lovea = &info->lti_big_buf;
3359                 struct lov_mds_md_v1 *lmm;
3360                 struct lov_ost_data_v1 *objs;
3361                 __u32 magic;
3362                 int count;
3363                 int idx2;
3364
3365                 rc = lfsck_layout_get_lovea(env, parent, lovea);
3366                 if (rc <= 0)
3367                         GOTO(unlock2, rc);
3368
3369                 lmm = lovea->lb_buf;
3370                 magic = le32_to_cpu(lmm->lmm_magic);
3371                 if (magic == LOV_MAGIC_COMP_V1) {
3372                         struct lov_comp_md_v1 *lcm = buf->lb_buf;
3373                         struct lov_comp_md_entry_v1 *lcme;
3374                         __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3375                         int i;
3376
3377                         for (i = 0; i < count; i++) {
3378                                 lcme = &lcm->lcm_entries[i];
3379                                 if (le32_to_cpu(lcme->lcme_id) == comp_id) {
3380                                         LASSERT(le32_to_cpu(lcme->lcme_flags) &
3381                                                 LCME_FL_INIT);
3382
3383                                         lmm = lovea->lb_buf +
3384                                                 le32_to_cpu(lcme->lcme_offset);
3385                                         magic = le32_to_cpu(lmm->lmm_magic);
3386                                         goto check;
3387                                 }
3388                         }
3389
3390                         /* Someone removed the component, do nothing. */
3391                         GOTO(unlock2, rc = 0);
3392                 }
3393
3394 check:
3395                 count = le16_to_cpu(lmm->lmm_stripe_count);
3396                 /* Someone changed the LOV EA, do nothing. */
3397                 if (count <= ea_off)
3398                         GOTO(unlock2, rc = 0);
3399
3400                 if (magic == LOV_MAGIC_V1) {
3401                         objs = &lmm->lmm_objects[ea_off];
3402                 } else {
3403                         LASSERT(magic == LOV_MAGIC_V3);
3404
3405                         objs = &((struct lov_mds_md_v3 *)lmm)->\
3406                                                         lmm_objects[ea_off];
3407                 }
3408
3409                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
3410                 idx2 = le32_to_cpu(objs->l_ost_idx);
3411                 rc = ostid_to_fid(tfid, oi, idx2);
3412                 /* Someone changed the LOV EA, do nothing. */
3413                 if (rc != 0 || !lu_fid_eq(tfid, cfid))
3414                         GOTO(unlock2, rc);
3415         }
3416
3417         rc = dt_create(env, child, la, NULL, dof, handle);
3418         if (rc != 0)
3419                 GOTO(unlock2, rc);
3420
3421         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, LU_XATTR_CREATE,
3422                           handle);
3423
3424         GOTO(unlock2, rc);
3425
3426 unlock2:
3427         dt_read_unlock(env, parent);
3428
3429 stop:
3430         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3431
3432 unlock1:
3433         lfsck_ibits_unlock(&lh, LCK_EX);
3434
3435 log:
3436         if (rc && log)
3437                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3438                        "dangling reference for: parent "DFID", child "
3439                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: "
3440                        "rc = %d\n",
3441                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3442                        comp_id, ea_off, ost_idx,
3443                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3444                                 "Create the lost OST-object as required" :
3445                                 "Keep the MDT-object there by default", rc);
3446
3447         return rc;
3448 }
3449
3450 /**
3451  * Repair the MDT-object with dangling LOV EA reference.
3452  *
3453  * Prepare parameters and call __lfsck_layout_repair_dangling()
3454  * to repair the dangling LOV EA reference.
3455  *
3456  * \param[in] env       pointer to the thread context
3457  * \param[in] com       the layout LFSCK component
3458  * \param[in] pfid      the MDT-object's FID
3459  * \param[in] cfid      the FID for the OST-object to be created
3460  * \param[in] comp_id   the component ID of the OST-object in the LOV EA
3461  * \param[in] ea_off    the offset of the OST-object in the LOV EA
3462  * \param[in] ost_idx   the index of OST on which the OST-object resides
3463  *
3464  * \retval              +1 for repair successfully
3465  * \retval              0 for did nothing
3466  * \retval              negative error number on failure
3467  */
3468 static int lfsck_layout_repair_dangling(const struct lu_env *env,
3469                                         struct lfsck_component *com,
3470                                         const struct lu_fid *pfid,
3471                                         const struct lu_fid *cfid,
3472                                         __u32 comp_id, __u32 ea_off,
3473                                         __u32 ost_idx)
3474 {
3475         struct lfsck_instance *lfsck = com->lc_lfsck;
3476         struct dt_object *parent = NULL;
3477         struct dt_object *child = NULL;
3478         struct lfsck_tgt_desc *ltd;
3479         int rc;
3480         ENTRY;
3481
3482         parent = lfsck_object_find_bottom(env, lfsck, pfid);
3483         if (IS_ERR(parent))
3484                 GOTO(log, rc = PTR_ERR(parent));
3485
3486         /* The MDT-object has been removed. */
3487         if (dt_object_exists(parent) == 0)
3488                 GOTO(log, rc = 0);
3489
3490         ltd = lfsck_ltd2tgt(&lfsck->li_ost_descs, ost_idx);
3491         if (unlikely(ltd == NULL))
3492                 GOTO(log, rc = -ENODEV);
3493
3494         child = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
3495         if (IS_ERR(child))
3496                 GOTO(log, rc = PTR_ERR(child));
3497
3498         /* The OST-object has been created. */
3499         if (unlikely(dt_object_exists(child) != 0))
3500                 GOTO(log, rc = 0);
3501
3502         rc = __lfsck_layout_repair_dangling(env, com, parent, child,
3503                                             comp_id, ea_off, ost_idx, false);
3504
3505         GOTO(log, rc);
3506
3507 log:
3508         if (child != NULL && !IS_ERR(child))
3509                 lfsck_object_put(env, child);
3510
3511         if (parent != NULL && !IS_ERR(parent))
3512                 lfsck_object_put(env, parent);
3513
3514         if (rc)
3515                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant found "
3516                        "dangling reference for: parent "DFID", child "
3517                        DFID", comp_id %u, ea_off %u, ost_idx %u, %s: rc = %d\n",
3518                        lfsck_lfsck2name(lfsck), PFID(pfid), PFID(cfid),
3519                        comp_id, ea_off, ost_idx,
3520                        (lfsck->li_bookmark_ram.lb_param & LPF_CREATE_OSTOBJ) ?
3521                                 "Create the lost OST-object as required" :
3522                                 "Keep the MDT-object there by default", rc);
3523
3524         return rc;
3525 }
3526
3527 /* If the OST-object does not recognize the MDT-object as its parent, and
3528  * there is no other MDT-object claims as its parent, then just trust the
3529  * given MDT-object as its parent. So update the OST-object filter_fid. */
3530 static int lfsck_layout_repair_unmatched_pair(const struct lu_env *env,
3531                                               struct lfsck_component *com,
3532                                               struct dt_object *parent,
3533                                               struct lfsck_layout_req *llr,
3534                                               struct lu_attr *la)
3535 {
3536         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3537         struct filter_fid               *ff     = &info->lti_ff;
3538         struct ost_layout               *ol     = &ff->ff_layout;
3539         struct dt_object                *child  = llr->llr_child;
3540         struct dt_device                *dev    = lfsck_obj2dev(child);
3541         const struct lu_fid             *tfid   = lu_object_fid(&parent->do_lu);
3542         struct lu_buf                   *tbuf   = &info->lti_big_buf;
3543         struct thandle                  *handle;
3544         struct lu_buf                   *buf;
3545         struct lustre_handle             lh     = { 0 };
3546         int                              rc;
3547         ENTRY;
3548
3549         rc = lfsck_ibits_lock(env, com->lc_lfsck, parent, &lh,
3550                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3551                               LCK_EX);
3552         if (rc != 0)
3553                 GOTO(log, rc);
3554
3555         ff->ff_parent.f_seq = cpu_to_le64(tfid->f_seq);
3556         ff->ff_parent.f_oid = cpu_to_le32(tfid->f_oid);
3557         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
3558          * MDT-object's FID::f_ver, instead it is the OST-object index in its
3559          * parent MDT-object's layout EA. */
3560         ff->ff_parent.f_stripe_idx = cpu_to_le32(llr->llr_lov_idx);
3561
3562         rc = lfsck_layout_get_lovea(env, parent, tbuf);
3563         if (rc < 0)
3564                 GOTO(unlock1, rc);
3565
3566         rc = lfsck_lmm2layout(tbuf->lb_buf, ol, llr->llr_comp_id);
3567         if (rc)
3568                 GOTO(unlock1, rc);
3569
3570         buf = lfsck_buf_get(env, ff, sizeof(*ff));
3571
3572         handle = dt_trans_create(env, dev);
3573         if (IS_ERR(handle))
3574                 GOTO(unlock1, rc = PTR_ERR(handle));
3575
3576         rc = dt_declare_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3577         if (rc != 0)
3578                 GOTO(stop, rc);
3579
3580         rc = dt_attr_get(env, parent, la);
3581         if (rc != 0)
3582                 GOTO(stop, rc);
3583
3584         la->la_valid = LA_UID | LA_GID;
3585         rc = dt_declare_attr_set(env, child, la, handle);
3586         if (rc != 0)
3587                 GOTO(stop, rc);
3588
3589         rc = dt_trans_start_local(env, dev, handle);
3590         if (rc != 0)
3591                 GOTO(stop, rc);
3592
3593         dt_write_lock(env, parent, 0);
3594         if (unlikely(lfsck_is_dead_obj(parent)))
3595                 GOTO(unlock2, rc = 1);
3596
3597         rc = dt_xattr_set(env, child, buf, XATTR_NAME_FID, 0, handle);
3598         if (rc != 0)
3599                 GOTO(unlock2, rc);
3600
3601         /* Get the latest parent's owner. */
3602         rc = dt_attr_get(env, parent, la);
3603         if (rc != 0)
3604                 GOTO(unlock2, rc);
3605
3606         la->la_valid = LA_UID | LA_GID;
3607         rc = dt_attr_set(env, child, la, handle);
3608
3609         GOTO(unlock2, rc);
3610
3611 unlock2:
3612         dt_write_unlock(env, parent);
3613
3614 stop:
3615         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3616
3617 unlock1:
3618         lfsck_ibits_unlock(&lh, LCK_EX);
3619
3620 log:
3621         if (rc)
3622                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3623                        "unmatched MDT-OST pair for: parent "DFID
3624                        ", child "DFID", comp_id %u, OST-index %u, "
3625                        "stripe-index %u, owner %u/%u: rc = %d\n",
3626                        lfsck_lfsck2name(com->lc_lfsck),
3627                        PFID(lfsck_dto2fid(parent)),
3628                        PFID(lfsck_dto2fid(child)),
3629                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3630                        la->la_uid, la->la_gid, rc);
3631
3632         return rc;
3633 }
3634
3635 /* If there are more than one MDT-objects claim as the OST-object's parent,
3636  * and the OST-object only recognizes one of them, then we need to generate
3637  * new OST-object(s) with new fid(s) for the non-recognized MDT-object(s). */
3638 static int lfsck_layout_repair_multiple_references(const struct lu_env *env,
3639                                                    struct lfsck_component *com,
3640                                                    struct dt_object *parent,
3641                                                    struct lfsck_layout_req *llr,
3642                                                    struct lu_attr *la)
3643 {
3644         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3645         struct dt_allocation_hint       *hint   = &info->lti_hint;
3646         struct dt_object_format         *dof    = &info->lti_dof;
3647         struct ost_id                   *oi     = &info->lti_oi;
3648         struct lu_buf                   *buf    = &info->lti_big_buf;
3649         struct lfsck_instance           *lfsck  = com->lc_lfsck;
3650         struct dt_device                *dev;
3651         struct lu_device                *d      =
3652                                 &lfsck_obj2dev(llr->llr_child)->dd_lu_dev;
3653         struct lu_object                *o;
3654         struct lu_object                *n;
3655         struct dt_object                *child  = NULL;
3656         struct thandle                  *handle = NULL;
3657         struct lov_mds_md_v1            *lmm;
3658         struct lov_ost_data_v1          *objs;
3659         const struct lu_fid             *pfid   = lfsck_dto2fid(parent);
3660         struct lu_fid                    tfid;
3661         struct lustre_handle             lh     = { 0 };
3662         __u32                            magic;
3663         __u32                            index;
3664         int                              rc;
3665         ENTRY;
3666
3667         /* We use two separated transactions to repair the inconsistency.
3668          *
3669          * 1) create the child (OST-object).
3670          * 2) update the parent LOV EA according to the child's FID.
3671          *
3672          * If 1) succeed, but 2) failed or aborted, then such OST-object will be
3673          * handled as orphan when the layout LFSCK run next time.
3674          *
3675          * If 1) failed, but 2) succeed, then such OST-object will be re-created
3676          * as dangling referened case when the layout LFSCK run next time. */
3677
3678         /* The 1st transaction. */
3679         o = lu_object_anon(env, d, NULL);
3680         if (IS_ERR(o))
3681                 GOTO(log, rc = PTR_ERR(o));
3682
3683         n = lu_object_locate(o->lo_header, d->ld_type);
3684         if (unlikely(n == NULL)) {
3685                 lu_object_put_nocache(env, o);
3686
3687                 GOTO(log, rc = -EINVAL);
3688         }
3689
3690         child = container_of(n, struct dt_object, do_lu);
3691         memset(hint, 0, sizeof(*hint));
3692         rc = dt_attr_get(env, parent, la);
3693         if (rc != 0)
3694                 GOTO(log, rc);
3695
3696         la->la_valid = LA_UID | LA_GID;
3697         memset(dof, 0, sizeof(*dof));
3698
3699         dev = lfsck_obj2dev(child);
3700         handle = dt_trans_create(env, dev);
3701         if (IS_ERR(handle))
3702                 GOTO(log, rc = PTR_ERR(handle));
3703
3704         rc = dt_declare_create(env, child, la, hint, dof, handle);
3705         if (rc != 0)
3706                 GOTO(stop, rc);
3707
3708         rc = dt_trans_start_local(env, dev, handle);
3709         if (rc != 0)
3710                 GOTO(stop, rc);
3711
3712         rc = dt_create(env, child, la, hint, dof, handle);
3713         dt_trans_stop(env, dev, handle);
3714         handle = NULL;
3715         if (rc != 0)
3716                 GOTO(log, rc);
3717
3718         rc = lfsck_ibits_lock(env, lfsck, parent, &lh,
3719                               MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR,
3720                               LCK_EX);
3721         if (rc != 0)
3722                 GOTO(log, rc);
3723
3724         /* The 2nd transaction. */
3725
3726         /* XXX: Generally, we should use bottom device (OSD) to update parent
3727          *      LOV EA. But because the LOD-object still references the wrong
3728          *      OSP-object that should be detached after the parent's LOV EA
3729          *      refreshed. Unfortunately, there is no suitable API for that.
3730          *      So we have to make the LOD to re-load the OSP-object(s) via
3731          *      replacing the LOV EA against the LOD-object.
3732          *
3733          *      Once the DNE2 patches have been landed, we can replace the
3734          *      LOD device with the OSD device. LU-6230. */
3735
3736         dev = lfsck->li_next;
3737         parent = lfsck_object_locate(dev, parent);
3738         if (IS_ERR(parent))
3739                 GOTO(log, rc = PTR_ERR(parent));
3740
3741         handle = dt_trans_create(env, dev);
3742         if (IS_ERR(handle))
3743                 GOTO(log, rc = PTR_ERR(handle));
3744
3745         rc = dt_declare_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3746                                   LU_XATTR_REPLACE, handle);
3747         if (rc != 0)
3748                 GOTO(stop, rc);
3749
3750         rc = dt_trans_start_local(env, dev, handle);
3751         if (rc != 0)
3752                 GOTO(stop, rc);
3753
3754         dt_write_lock(env, parent, 0);
3755         if (unlikely(lfsck_is_dead_obj(parent)))
3756                 GOTO(unlock, rc = 0);
3757
3758         rc = lfsck_layout_get_lovea(env, parent, buf);
3759         if (unlikely(!rc || rc == -ENODATA))
3760                 GOTO(unlock, rc = 0);
3761
3762         lmm = buf->lb_buf;
3763         magic = le32_to_cpu(lmm->lmm_magic);
3764         if (magic == LOV_MAGIC_COMP_V1) {
3765                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
3766                 struct lov_comp_md_entry_v1 *lcme;
3767                 __u16 count = le16_to_cpu(lcm->lcm_entry_count);
3768                 int i;
3769
3770                 LASSERT(llr->llr_comp_id != 0);
3771
3772                 for (i = 0; i < count; i++) {
3773                         lcme = &lcm->lcm_entries[i];
3774                         if (le32_to_cpu(lcme->lcme_id) == llr->llr_comp_id) {
3775                                 LASSERT(le32_to_cpu(lcme->lcme_flags) &
3776                                         LCME_FL_INIT);
3777
3778                                 le32_add_cpu(&lcm->lcm_layout_gen, 1);
3779                                 lmm = buf->lb_buf +
3780                                         le32_to_cpu(lcme->lcme_offset);
3781                                 magic = le32_to_cpu(lmm->lmm_magic);
3782                                 goto set;
3783                         }
3784                 }
3785
3786                 GOTO(unlock, rc = 0);
3787         }
3788
3789 set:
3790         if (magic == LOV_MAGIC_V1) {
3791                 objs = &lmm->lmm_objects[llr->llr_lov_idx];
3792         } else {
3793                 LASSERT(magic == LOV_MAGIC_V3);
3794                 objs =
3795                 &((struct lov_mds_md_v3 *)lmm)->lmm_objects[llr->llr_lov_idx];
3796         }
3797
3798         ostid_le_to_cpu(&objs->l_ost_oi, oi);
3799         index = le32_to_cpu(objs->l_ost_idx);
3800         rc = ostid_to_fid(&tfid, oi, index);
3801         /* Someone changed layout during the LFSCK, no need to repair then. */
3802         if (rc == 0 && !lu_fid_eq(&tfid, lu_object_fid(&llr->llr_child->do_lu)))
3803                 GOTO(unlock, rc = 0);
3804
3805         lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
3806         fid_to_ostid(lu_object_fid(&child->do_lu), oi);
3807         ostid_cpu_to_le(oi, &objs->l_ost_oi);
3808         objs->l_ost_gen = cpu_to_le32(0);
3809         objs->l_ost_idx = cpu_to_le32(llr->llr_ost_idx);
3810         rc = dt_xattr_set(env, parent, buf, XATTR_NAME_LOV,
3811                           LU_XATTR_REPLACE, handle);
3812
3813         GOTO(unlock, rc = (rc == 0 ? 1 : rc));
3814
3815 unlock:
3816         dt_write_unlock(env, parent);
3817
3818 stop:
3819         if (handle != NULL)
3820                 dt_trans_stop(env, dev, handle);
3821
3822 log:
3823         lfsck_ibits_unlock(&lh, LCK_EX);
3824         if (child != NULL)
3825                 lfsck_object_put(env, child);
3826
3827         if (rc)
3828                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3829                        "multiple references for: parent "DFID", comp_id %u, "
3830                        "OST-index %u, stripe-index %u, owner %u/%u: rc = %d\n",
3831                        lfsck_lfsck2name(lfsck), PFID(pfid),
3832                        llr->llr_comp_id, llr->llr_ost_idx, llr->llr_lov_idx,
3833                        la->la_uid, la->la_gid, rc);
3834
3835         return rc;
3836 }
3837
3838 /* If the MDT-object and the OST-object have different owner information,
3839  * then trust the MDT-object, because the normal chown/chgrp handle order
3840  * is from MDT to OST, and it is possible that some chown/chgrp operation
3841  * is partly done. */
3842 static int lfsck_layout_repair_owner(const struct lu_env *env,
3843                                      struct lfsck_component *com,
3844                                      struct dt_object *parent,
3845                                      struct lfsck_layout_req *llr,
3846                                      struct lu_attr *pla,
3847                                      const struct lu_attr *cla)
3848 {
3849         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3850         struct lu_attr                  *tla    = &info->lti_la2;
3851         struct dt_object                *child  = llr->llr_child;
3852         struct dt_device                *dev    = lfsck_obj2dev(child);
3853         struct thandle                  *handle;
3854         int                              rc;
3855         ENTRY;
3856
3857         tla->la_uid = pla->la_uid;
3858         tla->la_gid = pla->la_gid;
3859         tla->la_valid = LA_UID | LA_GID;
3860         handle = dt_trans_create(env, dev);
3861         if (IS_ERR(handle))
3862                 GOTO(log, rc = PTR_ERR(handle));
3863
3864         rc = dt_declare_attr_set(env, child, tla, handle);
3865         if (rc != 0)
3866                 GOTO(stop, rc);
3867
3868         rc = dt_trans_start_local(env, dev, handle);
3869         if (rc != 0)
3870                 GOTO(stop, rc);
3871
3872         /* Use the dt_object lock to serialize with destroy and attr_set. */
3873         dt_read_lock(env, parent, 0);
3874         if (unlikely(lfsck_is_dead_obj(parent)))
3875                 GOTO(unlock, rc = 1);
3876
3877         /* Get the latest parent's owner. */
3878         rc = dt_attr_get(env, parent, pla);
3879         if (rc != 0)
3880                 GOTO(unlock, rc);
3881
3882         /* Some others chown/chgrp during the LFSCK, needs to do nothing. */
3883         if (unlikely(tla->la_uid != pla->la_uid ||
3884                      tla->la_gid != pla->la_gid))
3885                 rc = 1;
3886         else
3887                 rc = dt_attr_set(env, child, tla, handle);
3888
3889         GOTO(unlock, rc);
3890
3891 unlock:
3892         dt_read_unlock(env, parent);
3893
3894 stop:
3895         rc = lfsck_layout_trans_stop(env, dev, handle, rc);
3896
3897 log:
3898         if (rc != 0)
3899                 CDEBUG(D_LFSCK, "%s: layout LFSCK assistant repaired "
3900                        "inconsistent file owner for: parent "DFID", child "DFID
3901                        ", OST-index %u, stripe-index %u, old owner %u/%u, "
3902                        "new owner %u/%u: rc = %d\n",
3903                        lfsck_lfsck2name(com->lc_lfsck),
3904                        PFID(lfsck_dto2fid(parent)), PFID(lfsck_dto2fid(child)),
3905                        llr->llr_ost_idx, llr->llr_lov_idx,
3906                        cla->la_uid, cla->la_gid, tla->la_uid, tla->la_gid, rc);
3907
3908         return rc;
3909 }
3910
3911 /* Check whether the OST-object correctly back points to the
3912  * MDT-object (@parent) via the XATTR_NAME_FID xattr (@pfid). */
3913 static int lfsck_layout_check_parent(const struct lu_env *env,
3914                                      struct lfsck_component *com,
3915                                      struct lfsck_assistant_object *lso,
3916                                      struct filter_fid *ff,
3917                                      const struct lu_fid *cfid,
3918                                      const struct lu_attr *cla,
3919                                      struct lfsck_layout_req *llr)
3920 {
3921         struct lfsck_thread_info        *info   = lfsck_env_info(env);
3922         struct lu_buf                   *buf    = &info->lti_big_buf;
3923         struct lu_fid                   *pfid   = &info->lti_fid;
3924         struct dt_object                *tobj;
3925         struct lov_mds_md_v1            *lmm;
3926         struct lov_ost_data_v1          *objs;
3927         struct lustre_handle             lh     = { 0 };
3928         int                              rc;
3929         int                              i;
3930         __u32                            magic;
3931         __u32                            idx;
3932         __u16                            count;
3933         ENTRY;
3934
3935         *pfid = ff->ff_parent;
3936         idx = pfid->f_stripe_idx;
3937         pfid->f_ver = 0;
3938
3939         if (unlikely(!fid_is_sane(pfid)))
3940                 RETURN(LLIT_UNMATCHED_PAIR);
3941
3942         if (lu_fid_eq(pfid, &lso->lso_fid)) {
3943                 if (likely(llr->llr_lov_idx == idx))
3944                         RETURN(0);
3945
3946                 RETURN(LLIT_UNMATCHED_PAIR);
3947         }
3948
3949         tobj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
3950         if (IS_ERR(tobj))
3951                 RETURN(PTR_ERR(tobj));
3952
3953         if (dt_object_exists(tobj) == 0 || lfsck_is_dead_obj(tobj) ||
3954             !S_ISREG(lfsck_object_type(tobj)))
3955                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
3956
3957         /* Load the tobj's layout EA, in spite of it is a local MDT-object or
3958          * remote one on another MDT. Then check whether the given OST-object
3959          * is in such layout. If yes, it is multiple referenced, otherwise it
3960          * is unmatched referenced case. */
3961         rc = lfsck_layout_get_lovea(env, tobj, buf);
3962         if (rc == 0 || rc == -ENOENT)
3963                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
3964
3965         if (rc < 0)
3966                 GOTO(out, rc);
3967
3968         lmm = buf->lb_buf;
3969         magic = le32_to_cpu(lmm->lmm_magic);
3970         if (magic == LOV_MAGIC_COMP_V1) {
3971                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
3972                 struct lov_comp_md_entry_v1 *lcme;
3973
3974                 if (ff->ff_layout.ol_comp_id == 0)
3975                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
3976
3977                 count = le16_to_cpu(lcm->lcm_entry_count);
3978                 for (i = 0; i < count; i++) {
3979                         lcme = &lcm->lcm_entries[i];
3980                         if (le32_to_cpu(lcme->lcme_id) ==
3981                             ff->ff_layout.ol_comp_id) {
3982                                 lmm = buf->lb_buf +
3983                                         le32_to_cpu(lcme->lcme_offset);
3984                                 magic = le32_to_cpu(lmm->lmm_magic);
3985                                 if (!(le32_to_cpu(lcme->lcme_flags) &
3986                                       LCME_FL_INIT))
3987                                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
3988
3989                                 goto further;
3990                         }
3991                 }
3992
3993                 GOTO(out, rc = LLIT_UNMATCHED_PAIR);
3994         }
3995
3996 further:
3997         if (magic == LOV_MAGIC_V1) {
3998                 objs = &lmm->lmm_objects[0];
3999         } else {
4000                 LASSERT(magic == LOV_MAGIC_V3);
4001                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4002         }
4003
4004         count = le16_to_cpu(lmm->lmm_stripe_count);
4005         for (i = 0; i < count; i++, objs++) {
4006                 struct lu_fid           *tfid   = &info->lti_fid2;
4007                 struct ost_id           *oi     = &info->lti_oi;
4008                 __u32                    idx2;
4009
4010                 if (lovea_slot_is_dummy(objs))
4011                         continue;
4012
4013                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
4014                 idx2 = le32_to_cpu(objs->l_ost_idx);
4015                 rc = ostid_to_fid(tfid, oi, idx2);
4016                 if (rc != 0) {
4017                         CDEBUG(D_LFSCK, "%s: the parent "DFID" contains "
4018                                "invalid layout EA at the slot %d, index %u\n",
4019                                lfsck_lfsck2name(com->lc_lfsck),
4020                                PFID(pfid), i, idx2);
4021
4022                         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4023                 }
4024
4025                 if (lu_fid_eq(cfid, tfid)) {
4026                         rc = lfsck_ibits_lock(env, com->lc_lfsck, tobj, &lh,
4027                                               MDS_INODELOCK_UPDATE |
4028                                               MDS_INODELOCK_LAYOUT |
4029                                               MDS_INODELOCK_XATTR,
4030                                               LCK_EX);
4031                         if (rc != 0)
4032                                 GOTO(out, rc);
4033
4034                         dt_read_lock(env, tobj, 0);
4035
4036                         /* For local MDT-object, re-check existence
4037                          * after taken the lock. */
4038                         if (!dt_object_remote(tobj)) {
4039                                 if (dt_object_exists(tobj) == 0 ||
4040                                     lfsck_is_dead_obj(tobj))
4041                                         rc = LLIT_UNMATCHED_PAIR;
4042                                 else
4043                                         rc = LLIT_MULTIPLE_REFERENCED;
4044
4045                                 GOTO(unlock, rc);
4046                         }
4047
4048                         /* For migration case, the new MDT-object and old
4049                          * MDT-object may reference the same OST-object at
4050                          * some migration internal time.
4051                          *
4052                          * For remote MDT-object, the local MDT may not know
4053                          * whether it has been removed or not.  Try checking
4054                          * for a non-existent xattr to check if this object
4055                          * has been been removed or not. */
4056                         rc = dt_xattr_get(env, tobj, &LU_BUF_NULL,
4057                                           XATTR_NAME_DUMMY);
4058                         if (unlikely(rc == -ENOENT || rc >= 0))
4059                                 rc = LLIT_UNMATCHED_PAIR;
4060                         else if (rc == -ENODATA)
4061                                 rc = LLIT_MULTIPLE_REFERENCED;
4062
4063                         GOTO(unlock, rc);
4064                 }
4065         }
4066
4067         GOTO(out, rc = LLIT_UNMATCHED_PAIR);
4068
4069 unlock:
4070         if (lustre_handle_is_used(&lh)) {
4071                 dt_read_unlock(env, tobj);
4072                 lfsck_ibits_unlock(&lh, LCK_EX);
4073         }
4074
4075 out:
4076         lfsck_object_put(env, tobj);
4077
4078         return rc;
4079 }
4080
/* Handle one layout LFSCK assistant request: verify a single
 * MDT-object / OST-object pair and repair the first inconsistency found.
 *
 * The checks are made in this order:
 *  - the OST-object does not exist (dt_attr_get() gives -ENOENT):
 *    LLIT_DANGLING;
 *  - the OST-object's XATTR_NAME_FID is shorter than a struct lu_fid, or
 *    lfsck_layout_check_parent() reports a problem:
 *    LLIT_UNMATCHED_PAIR / LLIT_MULTIPLE_REFERENCED;
 *  - the uid/gid of the two objects differ: LLIT_INCONSISTENT_OWNER.
 *
 * env: execution environment
 * com: the layout LFSCK component
 * lar: the assistant request, embedded in a struct lfsck_layout_req
 *
 * Returns a negative errno on failure. A positive value is accounted in
 * lo->ll_objs_repaired[] (in dry-run mode 1 is returned without any
 * repair being done). 0 is returned when nothing had to be done, when the
 * parent no longer exists, or when an error was absorbed in the "out"
 * section (assistant exiting, or the target being unreachable). */
static int lfsck_layout_assistant_handler_p1(const struct lu_env *env,
                                             struct lfsck_component *com,
                                             struct lfsck_assistant_req *lar)
{
        struct lfsck_layout_req              *llr    =
                        container_of0(lar, struct lfsck_layout_req, llr_lar);
        struct lfsck_assistant_object        *lso    = lar->lar_parent;
        struct lfsck_layout                  *lo     = com->lc_file_ram;
        struct lfsck_thread_info             *info   = lfsck_env_info(env);
        struct filter_fid                    *ff     = &info->lti_ff;
        struct lu_buf buf = { .lb_buf = ff,
                              .lb_len = sizeof(*ff) };
        struct dt_object                     *parent = NULL;
        struct dt_object                     *child  = llr->llr_child;
        struct lu_attr                       *pla    = &lso->lso_attr;
        struct lu_attr                       *cla    = &info->lti_la;
        struct lfsck_instance                *lfsck  = com->lc_lfsck;
        struct lfsck_bookmark                *bk     = &lfsck->li_bookmark_ram;
        enum lfsck_layout_inconsistency_type  type   = LLIT_NONE;
        int                                   rc;
        ENTRY;

        /* The parent has already been marked as dead, nothing to check. */
        if (lso->lso_dead)
                RETURN(0);

        CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_ASSISTANT_DIRECT, cfs_fail_val);

        rc = dt_attr_get(env, child, cla);
        if (rc == -ENOENT) {
                /* The OST-object is missing. Load the parent right away:
                 * if the parent is gone as well (-ENOENT) there is no
                 * inconsistency to report. Note that both returns below
                 * bypass the accounting done in the "out" section. */
                parent = lfsck_assistant_object_load(env, lfsck, lso);
                if (IS_ERR(parent)) {
                        rc = PTR_ERR(parent);

                        RETURN(rc == -ENOENT ? 0 : rc);
                }

                type = LLIT_DANGLING;
                goto repair;
        }

        if (rc != 0)
                GOTO(out, rc);

        /* Fetch the OST-object's back pointer (PFID) to its parent. */
        lfsck_buf_init(&buf, ff, sizeof(*ff));
        rc = dt_xattr_get(env, child, &buf, XATTR_NAME_FID);
        if (unlikely(rc > 0 && rc < sizeof(struct lu_fid))) {
                /* The xattr is too short to hold even a FID. */
                type = LLIT_UNMATCHED_PAIR;
                goto repair;
        }

        if (rc < 0 && rc != -ENODATA)
                GOTO(out, rc);

        /* No PFID xattr content at all: only the owner can be checked. */
        if (rc == 0 || rc == -ENODATA)
                GOTO(check_owner, rc = 0);

        filter_fid_le_to_cpu(ff, ff, sizeof(*ff));
        rc = lfsck_layout_check_parent(env, com, lso, ff,
                                       lu_object_fid(&child->do_lu), cla, llr);
        if (rc > 0) {
                /* A positive value is the inconsistency type itself. */
                type = rc;
                goto repair;
        }

        if (rc < 0)
                GOTO(out, rc);

check_owner:
        /* Someone may has changed the owner after the parent attr pre-loaded.
         * It can be handled later inside the lfsck_layout_repair_owner(). */
        if (unlikely(cla->la_uid != pla->la_uid ||
                     cla->la_gid != pla->la_gid)) {
                type = LLIT_INCONSISTENT_OWNER;
                goto repair;
        }

repair:
        if (type == LLIT_NONE)
                GOTO(out, rc = 0);

        /* Dry-run: report the inconsistency (rc = 1) but change nothing. */
        if (bk->lb_param & LPF_DRYRUN)
                GOTO(out, rc = 1);

        /* The parent is already loaded only for the dangling case. */
        if (parent == NULL) {
                parent = lfsck_assistant_object_load(env, lfsck, lso);
                if (IS_ERR(parent)) {
                        rc = PTR_ERR(parent);

                        /* The parent is gone, so there is nothing to
                         * repair; this return bypasses the accounting
                         * in the "out" section. */
                        if (rc == -ENOENT)
                                RETURN(0);

                        GOTO(out, rc);
                }
        }

        switch (type) {
        case LLIT_DANGLING:
                /* With LPF_DELAY_CREATE_OSTOBJ the dangling reference is
                 * handed to lfsck_layout_ins_dangling_rec() instead of
                 * being repaired here (presumably recorded for the
                 * phase2 trace file scan - confirm against that helper). */
                if (bk->lb_param & LPF_DELAY_CREATE_OSTOBJ)
                        rc = lfsck_layout_ins_dangling_rec(env, com,
                                lfsck_dto2fid(parent), lfsck_dto2fid(child),
                                llr->llr_comp_id, llr->llr_lov_idx,
                                llr->llr_ost_idx);
                else
                        rc = __lfsck_layout_repair_dangling(env, com, parent,
                                                            llr->llr_child,
                                                            llr->llr_comp_id,
                                                            llr->llr_lov_idx,
                                                            llr->llr_ost_idx,
                                                            true);
                break;
        case LLIT_UNMATCHED_PAIR:
                rc = lfsck_layout_repair_unmatched_pair(env, com, parent,
                                                        llr, pla);
                break;
        case LLIT_MULTIPLE_REFERENCED:
                rc = lfsck_layout_repair_multiple_references(env, com, parent,
                                                             llr, pla);
                break;
        case LLIT_INCONSISTENT_OWNER:
                rc = lfsck_layout_repair_owner(env, com, parent, llr, pla, cla);
                break;
        default:
                rc = 0;
                break;
        }

        GOTO(out, rc);

out:
        /* Update the statistics under the component's semaphore. */
        down_write(&com->lc_sem);
        if (rc < 0) {
                struct lfsck_assistant_data *lad = com->lc_data;

                if (unlikely(lad->lad_exit)) {
                        /* The assistant is exiting: drop the error. */
                        rc = 0;
                } else if (rc == -ENOTCONN || rc == -ESHUTDOWN ||
                           rc == -ETIMEDOUT || rc == -EHOSTDOWN ||
                           rc == -EHOSTUNREACH) {
                        /* If cannot touch the target server,
                         * mark the LFSCK as INCOMPLETE. */
                        CDEBUG(D_LFSCK, "%s: layout LFSCK assistant fail to "
                               "talk with OST %x: rc = %d\n",
                               lfsck_lfsck2name(lfsck), llr->llr_ost_idx, rc);
                        lfsck_lad_set_bitmap(env, com, llr->llr_ost_idx);
                        lo->ll_objs_skipped++;
                        rc = 0;
                } else {
                        lfsck_layout_record_failure(env, lfsck, lo);
                }
        } else if (rc > 0 && (type != LLIT_DANGLING ||
                              !(bk->lb_param & LPF_DELAY_CREATE_OSTOBJ))) {
                /* A dangling reference handled in delayed mode is not
                 * counted as repaired here. */
                LASSERTF(type > LLIT_NONE && type <= LLIT_MAX,
                         "unknown type = %d\n", type);

                lo->ll_objs_repaired[type - 1]++;
                /* In dry-run mode remember the position of the first
                 * inconsistency that was found. */
                if (bk->lb_param & LPF_DRYRUN &&
                    unlikely(lo->ll_pos_first_inconsistent == 0))
                        lo->ll_pos_first_inconsistent =
                        lfsck->li_obj_oit->do_index_ops->dio_it.store(env,
                                                        lfsck->li_di_oit);
        }
        up_write(&com->lc_sem);

        /* parent may be NULL or an ERR_PTR on some of the paths above. */
        if (parent != NULL && !IS_ERR(parent))
                lfsck_object_put(env, parent);

        return rc;
}
4249
/* Walk one dangling-reference trace file and try to repair every record.
 *
 * Each record is keyed by a struct lfsck_layout_dangling_key (stored in
 * big-endian form) and its value is read into a FID whose f_ver field
 * carries the OST index.
 *
 * env:   execution environment
 * com:   the layout LFSCK component
 * obj:   the trace file (an index object) to iterate
 * first: true to resume right after lo->ll_lldk_latest_scanned_phase2,
 *        false to scan from a zeroed key
 *
 * Returns a negative errno on failure, 0 when the scan was stopped
 * because the LFSCK thread is no longer running, or a positive value once
 * the iteration ran out of records (the caller moves on to the next trace
 * file only for a positive value). */
static int
lfsck_layout_double_scan_one_trace_file(const struct lu_env *env,
                                        struct lfsck_component *com,
                                        struct dt_object *obj, bool first)
{
        struct lfsck_instance *lfsck = com->lc_lfsck;
        struct ptlrpc_thread *thread = &lfsck->li_thread;
        struct lfsck_bookmark *bk = &lfsck->li_bookmark_ram;
        struct lfsck_layout *lo = com->lc_file_ram;
        const struct dt_it_ops *iops = &obj->do_index_ops->dio_it;
        struct dt_it *di;
        struct dt_key *key;
        struct lfsck_layout_dangling_key *parent =
                                        &lfsck_env_info(env)->lti_lldk;
        struct lu_fid *cfid = &lfsck_env_info(env)->lti_fid3;
        __u32 ost_idx;
        int rc;
        ENTRY;

        di = iops->init(env, obj, 0);
        if (IS_ERR(di))
                RETURN(PTR_ERR(di));

        /* Choose the start key: the last record handled by a former
         * phase2 scan, or an all-zero key. */
        if (first)
                lldk_cpu_to_be(parent, &lo->ll_lldk_latest_scanned_phase2);
        else
                memset(parent, 0, sizeof(*parent));
        rc = iops->get(env, di, (const struct dt_key *)parent);
        if (rc < 0)
                GOTO(fini, rc);

        if (first) {
                /* The start one either has been processed or does not exist,
                 * skip it. */
                rc = iops->next(env, di);
                if (rc != 0)
                        GOTO(put, rc);
        }

        do {
                if (CFS_FAIL_TIMEOUT(OBD_FAIL_LFSCK_DELAY3, cfs_fail_val) &&
                    unlikely(!thread_is_running(thread)))
                        GOTO(put, rc = 0);

                key = iops->key(env, di);
                if (IS_ERR(key)) {
                        rc = PTR_ERR(key);
                        /* -ENOENT: no more records, report a positive
                         * value to the caller. */
                        if (rc == -ENOENT)
                                GOTO(put, rc = 1);

                        /* Any other error is accounted as a failure. */
                        goto checkpoint;
                }

                lldk_be_to_cpu(parent,
                                (const struct lfsck_layout_dangling_key *)key);
                if (!fid_is_sane(&parent->lldk_fid)) {
                        /* Skip the record, but still count it. */
                        rc = 0;
                        goto checkpoint;
                }

                rc = iops->rec(env, di, (struct dt_rec *)cfid, 0);
                if (rc == 0) {
                        fid_be_to_cpu(cfid, cfid);
                        /* The OST index travels in the f_ver field; take
                         * it out and clear the field to get a plain FID. */
                        ost_idx = cfid->f_ver;
                        cfid->f_ver = 0;
                        if (!fid_is_sane(cfid)) {
                                rc = 0;
                                goto checkpoint;
                        }

                        rc = lfsck_layout_repair_dangling(env, com,
                                        &parent->lldk_fid, cfid,
                                        parent->lldk_comp_id,
                                        parent->lldk_ea_off, ost_idx);
                }

checkpoint:
                /* Account the record: rc > 0 is counted as repaired,
                 * rc < 0 as failed. The resume position only advances
                 * when there was no error. */
                down_write(&com->lc_sem);
                com->lc_new_checked++;
                com->lc_new_scanned++;
                if (rc >= 0)
                        lo->ll_lldk_latest_scanned_phase2 = *parent;

                if (rc > 0)
                        lo->ll_objs_repaired[LLIT_DANGLING - 1]++;
                else if (rc < 0)
                        lo->ll_objs_failed_phase2++;
                up_write(&com->lc_sem);

                /* Stop at the first failure when LPF_FAILOUT is set. */
                if (rc < 0 && bk->lb_param & LPF_FAILOUT)
                        GOTO(put, rc);

                /* Periodically fold the new counters into the on-disk
                 * LFSCK state and schedule the next checkpoint. */
                if (unlikely(cfs_time_beforeq(com->lc_time_next_checkpoint,
                                              cfs_time_current())) &&
                    com->lc_new_checked != 0) {
                        down_write(&com->lc_sem);
                        lo->ll_run_time_phase2 +=
                                cfs_duration_sec(cfs_time_current() +
                                HALF_SEC - com->lc_time_last_checkpoint);
                        lo->ll_time_last_checkpoint = cfs_time_current_sec();
                        lo->ll_objs_checked_phase2 += com->lc_new_checked;
                        com->lc_new_checked = 0;
                        lfsck_layout_store(env, com);
                        up_write(&com->lc_sem);

                        com->lc_time_last_checkpoint = cfs_time_current();
                        com->lc_time_next_checkpoint =
                                com->lc_time_last_checkpoint +
                                cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL);
                }

                lfsck_control_speed_by_self(com);
                if (unlikely(!thread_is_running(thread)))
                        GOTO(put, rc = 0);

                rc = iops->next(env, di);
        } while (rc == 0);

        GOTO(put, rc);

put:
        iops->put(env, di);

fini:
        iops->fini(env, di);

        return rc;
}
4378
4379 static int lfsck_layout_assistant_handler_p2(const struct lu_env *env,
4380                                              struct lfsck_component *com)
4381 {
4382         struct lfsck_assistant_data     *lad    = com->lc_data;
4383         struct lfsck_instance           *lfsck  = com->lc_lfsck;
4384         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
4385         struct lfsck_tgt_descs          *ltds   = &lfsck->li_ost_descs;
4386         struct lfsck_tgt_desc           *ltd;
4387         int                              rc     = 0;
4388         ENTRY;
4389
4390         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan start\n",
4391                lfsck_lfsck2name(lfsck));
4392
4393         spin_lock(&ltds->ltd_lock);
4394         while (!list_empty(&lad->lad_ost_phase2_list)) {
4395                 ltd = list_entry(lad->lad_ost_phase2_list.next,
4396                                  struct lfsck_tgt_desc,
4397                                  ltd_layout_phase_list);
4398                 list_del_init(&ltd->ltd_layout_phase_list);
4399                 if (bk->lb_param & LPF_OST_ORPHAN) {
4400                         spin_unlock(&ltds->ltd_lock);
4401                         rc = lfsck_layout_scan_orphan(env, com, ltd);
4402                         if (rc != 0 && bk->lb_param & LPF_FAILOUT)
4403                                 RETURN(rc);
4404
4405                         if (unlikely(lad->lad_exit ||
4406                                      !thread_is_running(&lfsck->li_thread)))
4407                                 RETURN(0);
4408                         spin_lock(&ltds->ltd_lock);
4409                 }
4410         }
4411
4412         if (list_empty(&lad->lad_ost_phase1_list))
4413                 rc = 1;
4414         else
4415                 rc = 0;
4416         spin_unlock(&ltds->ltd_lock);
4417
4418         if (rc == 1 && bk->lb_param & LPF_OST_ORPHAN) {
4419                 struct lfsck_layout *lo = com->lc_file_ram;
4420                 int i;
4421
4422                 com->lc_new_checked = 0;
4423                 com->lc_new_scanned = 0;
4424                 com->lc_time_last_checkpoint = cfs_time_current();
4425                 com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
4426                                 cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL);
4427
4428                 i = lfsck_sub_trace_file_fid2idx(
4429                                 &lo->ll_lldk_latest_scanned_phase2.lldk_fid);
4430                 rc = lfsck_layout_double_scan_one_trace_file(env, com,
4431                                 com->lc_sub_trace_objs[i].lsto_obj, true);
4432                 while (rc > 0 && ++i < LFSCK_STF_COUNT)
4433                         rc = lfsck_layout_double_scan_one_trace_file(env, com,
4434                                 com->lc_sub_trace_objs[i].lsto_obj, false);
4435
4436                 CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan dangling stop "
4437                        "at the No. %d trace file: rc = %d\n",
4438                        lfsck_lfsck2name(lfsck), i, rc);
4439         }
4440
4441         CDEBUG(D_LFSCK, "%s: layout LFSCK phase2 scan stop: rc = %d\n",
4442                lfsck_lfsck2name(lfsck), rc);
4443
4444         RETURN(rc);
4445 }
4446
4447 static int
4448 lfsck_layout_slave_async_interpret(const struct lu_env *env,
4449                                    struct ptlrpc_request *req,
4450                                    void *args, int rc)
4451 {
4452         struct lfsck_layout_slave_async_args *llsaa = args;
4453         struct obd_export                    *exp   = llsaa->llsaa_exp;
4454         struct lfsck_component               *com   = llsaa->llsaa_com;
4455         struct lfsck_layout_slave_target     *llst  = llsaa->llsaa_llst;
4456         struct lfsck_layout_slave_data       *llsd  = com->lc_data;
4457         struct lfsck_reply                   *lr    = NULL;
4458         bool                                  done  = false;
4459
4460         if (rc != 0) {
4461                 /* It is probably caused by network trouble, or target crash,
4462                  * it will try several times (depends on the obd_timeout, and
4463                  * will not less than 3 times). But to make the LFSCK can go
4464                  * ahead, we should not try for ever. After some try but still
4465                  * hit failure, it will assume that the target exit the LFSCK
4466                  * prcoessing and stop try. */
4467                 if (rc == -ENOTCONN || rc == -ESHUTDOWN) {
4468                         int max_try = max_t(int, obd_timeout / 30, 3);
4469
4470                         if (++(llst->llst_failures) > max_try)
4471                                 done = true;
4472                 } else {
4473                         done = true;
4474                 }
4475         } else {
4476                 llst->llst_failures = 0;
4477                 lr = req_capsule_server_get(&req->rq_pill, &RMF_LFSCK_REPLY);
4478                 if (lr->lr_status != LS_SCANNING_PHASE1 &&
4479                     lr->lr_status != LS_SCANNING_PHASE2)
4480                         done = true;
4481         }
4482
4483         if (done) {
4484                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave gets the MDT %x "
4485                        "status %d, failures_try %d\n", lfsck_lfsck2name(com->lc_lfsck),
4486                        llst->llst_index, lr != NULL ? lr->lr_status : rc,
4487                        llst->llst_failures);
4488
4489                 lfsck_layout_llst_del(llsd, llst);
4490         }
4491
4492         lfsck_layout_llst_put(llst);
4493         lfsck_component_put(env, com);
4494         class_export_put(exp);
4495
4496         return 0;
4497 }
4498
4499 static int lfsck_layout_async_query(const struct lu_env *env,
4500                                     struct lfsck_component *com,
4501                                     struct obd_export *exp,
4502                                     struct lfsck_layout_slave_target *llst,
4503                                     struct lfsck_request *lr,
4504                                     struct ptlrpc_request_set *set)
4505 {
4506         struct lfsck_layout_slave_async_args *llsaa;
4507         struct ptlrpc_request                *req;
4508         struct lfsck_request                 *tmp;
4509         int                                   rc;
4510         ENTRY;
4511
4512         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_QUERY);
4513         if (req == NULL)
4514                 RETURN(-ENOMEM);
4515
4516         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_QUERY);
4517         if (rc != 0) {
4518                 ptlrpc_request_free(req);
4519                 RETURN(rc);
4520         }
4521
4522         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4523         *tmp = *lr;
4524         ptlrpc_request_set_replen(req);
4525
4526         llsaa = ptlrpc_req_async_args(req);
4527         llsaa->llsaa_exp = exp;
4528         llsaa->llsaa_com = lfsck_component_get(com);
4529         llsaa->llsaa_llst = llst;
4530         req->rq_interpret_reply = lfsck_layout_slave_async_interpret;
4531         req->rq_allow_intr = 1;
4532         req->rq_no_delay = 1;
4533         ptlrpc_set_add_req(set, req);
4534
4535         RETURN(0);
4536 }
4537
4538 static int lfsck_layout_async_notify(const struct lu_env *env,
4539                                      struct obd_export *exp,
4540                                      struct lfsck_request *lr,
4541                                      struct ptlrpc_request_set *set)
4542 {
4543         struct ptlrpc_request   *req;
4544         struct lfsck_request    *tmp;
4545         int                      rc;
4546         ENTRY;
4547
4548         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4549         if (req == NULL)
4550                 RETURN(-ENOMEM);
4551
4552         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4553         if (rc != 0) {
4554                 ptlrpc_request_free(req);
4555                 RETURN(rc);
4556         }
4557
4558         tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4559         *tmp = *lr;
4560         ptlrpc_request_set_replen(req);
4561         req->rq_allow_intr = 1;
4562         req->rq_no_delay = 1;
4563         ptlrpc_set_add_req(set, req);
4564
4565         RETURN(0);
4566 }
4567
4568 static int
4569 lfsck_layout_slave_query_master(const struct lu_env *env,
4570                                 struct lfsck_component *com)
4571 {
4572         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4573         struct lfsck_instance            *lfsck = com->lc_lfsck;
4574         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4575         struct lfsck_layout_slave_target *llst;
4576         struct obd_export                *exp;
4577         struct ptlrpc_request_set        *set;
4578         int                               rc    = 0;
4579         int                               rc1   = 0;
4580         ENTRY;
4581
4582         set = ptlrpc_prep_set();
4583         if (set == NULL)
4584                 GOTO(log, rc = -ENOMEM);
4585
4586         memset(lr, 0, sizeof(*lr));
4587         lr->lr_event = LE_QUERY;
4588         lr->lr_active = LFSCK_TYPE_LAYOUT;
4589
4590         llsd->llsd_touch_gen++;
4591         spin_lock(&llsd->llsd_lock);
4592         while (!list_empty(&llsd->llsd_master_list)) {
4593                 llst = list_entry(llsd->llsd_master_list.next,
4594                                   struct lfsck_layout_slave_target,
4595                                   llst_list);
4596                 if (llst->llst_gen == llsd->llsd_touch_gen)
4597                         break;
4598
4599                 llst->llst_gen = llsd->llsd_touch_gen;
4600                 list_move_tail(&llst->llst_list,
4601                                &llsd->llsd_master_list);
4602                 atomic_inc(&llst->llst_ref);
4603                 spin_unlock(&llsd->llsd_lock);
4604
4605                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4606                                                llst->llst_index);
4607                 if (exp == NULL) {
4608                         lfsck_layout_llst_del(llsd, llst);
4609                         lfsck_layout_llst_put(llst);
4610                         spin_lock(&llsd->llsd_lock);
4611                         continue;
4612                 }
4613
4614                 rc = lfsck_layout_async_query(env, com, exp, llst, lr, set);
4615                 if (rc != 0) {
4616                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4617                                "query %s for layout: rc = %d\n",
4618                                lfsck_lfsck2name(lfsck),
4619                                exp->exp_obd->obd_name, rc);
4620
4621                         rc1 = rc;
4622                         lfsck_layout_llst_put(llst);
4623                         class_export_put(exp);
4624                 }
4625                 spin_lock(&llsd->llsd_lock);
4626         }
4627         spin_unlock(&llsd->llsd_lock);
4628
4629         rc = ptlrpc_set_wait(set);
4630         ptlrpc_set_destroy(set);
4631
4632         GOTO(log, rc = (rc1 != 0 ? rc1 : rc));
4633
4634 log:
4635         CDEBUG(D_LFSCK, "%s: layout LFSCK slave queries master: rc = %d\n",
4636                lfsck_lfsck2name(com->lc_lfsck), rc);
4637
4638         return rc;
4639 }
4640
4641 static void
4642 lfsck_layout_slave_notify_master(const struct lu_env *env,
4643                                  struct lfsck_component *com,
4644                                  enum lfsck_events event, int result)
4645 {
4646         struct lfsck_layout              *lo    = com->lc_file_ram;
4647         struct lfsck_instance            *lfsck = com->lc_lfsck;
4648         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
4649         struct lfsck_request             *lr    = &lfsck_env_info(env)->lti_lr;
4650         struct lfsck_layout_slave_target *llst;
4651         struct obd_export                *exp;
4652         struct ptlrpc_request_set        *set;
4653         int                               rc;
4654         ENTRY;
4655
4656         CDEBUG(D_LFSCK, "%s: layout LFSCK slave notifies master\n",
4657                lfsck_lfsck2name(com->lc_lfsck));
4658
4659         set = ptlrpc_prep_set();
4660         if (set == NULL)
4661                 RETURN_EXIT;
4662
4663         memset(lr, 0, sizeof(*lr));
4664         lr->lr_event = event;
4665         lr->lr_flags = LEF_FROM_OST;
4666         lr->lr_status = result;
4667         lr->lr_index = lfsck_dev_idx(lfsck);
4668         lr->lr_active = LFSCK_TYPE_LAYOUT;
4669         lr->lr_flags2 = lo->ll_flags;
4670         llsd->llsd_touch_gen++;
4671         spin_lock(&llsd->llsd_lock);
4672         while (!list_empty(&llsd->llsd_master_list)) {
4673                 llst = list_entry(llsd->llsd_master_list.next,
4674                                   struct lfsck_layout_slave_target,
4675                                   llst_list);
4676                 if (llst->llst_gen == llsd->llsd_touch_gen)
4677                         break;
4678
4679                 llst->llst_gen = llsd->llsd_touch_gen;
4680                 list_move_tail(&llst->llst_list,
4681                                &llsd->llsd_master_list);
4682                 atomic_inc(&llst->llst_ref);
4683                 spin_unlock(&llsd->llsd_lock);
4684
4685                 exp = lustre_find_lwp_by_index(lfsck->li_obd->obd_name,
4686                                                llst->llst_index);
4687                 if (exp == NULL) {
4688                         lfsck_layout_llst_del(llsd, llst);
4689                         lfsck_layout_llst_put(llst);
4690                         spin_lock(&llsd->llsd_lock);
4691                         continue;
4692                 }
4693
4694                 rc = lfsck_layout_async_notify(env, exp, lr, set);
4695                 if (rc != 0)
4696                         CDEBUG(D_LFSCK, "%s: layout LFSCK slave fail to "
4697                                "notify %s for layout: rc = %d\n",
4698                                lfsck_lfsck2name(lfsck),
4699                                exp->exp_obd->obd_name, rc);
4700
4701                 lfsck_layout_llst_put(llst);
4702                 class_export_put(exp);
4703                 spin_lock(&llsd->llsd_lock);
4704         }
4705         spin_unlock(&llsd->llsd_lock);
4706
4707         ptlrpc_set_wait(set);
4708         ptlrpc_set_destroy(set);
4709
4710         RETURN_EXIT;
4711 }
4712
4713 /*
4714  * \ret -ENODATA: unrecognized stripe
4715  * \ret = 0     : recognized stripe
4716  * \ret < 0     : other failures
4717  */
4718 static int lfsck_layout_master_check_pairs(const struct lu_env *env,
4719                                            struct lfsck_component *com,
4720                                            struct lu_fid *cfid,
4721                                            struct lu_fid *pfid, __u32 comp_id)
4722 {
4723         struct lfsck_thread_info        *info   = lfsck_env_info(env);
4724         struct lu_buf                   *buf    = &info->lti_big_buf;
4725         struct ost_id                   *oi     = &info->lti_oi;
4726         struct dt_object                *obj;
4727         struct lov_mds_md_v1            *lmm;
4728         struct lov_ost_data_v1          *objs;
4729         __u32                            idx    = pfid->f_stripe_idx;
4730         __u32                            magic;
4731         int                              rc     = 0;
4732         int                              i;
4733         __u16                            count;
4734         ENTRY;
4735
4736         pfid->f_ver = 0;
4737         obj = lfsck_object_find_bottom(env, com->lc_lfsck, pfid);
4738         if (IS_ERR(obj))
4739                 RETURN(PTR_ERR(obj));
4740
4741         dt_read_lock(env, obj, 0);
4742         if (unlikely(dt_object_exists(obj) == 0 ||
4743                      lfsck_is_dead_obj(obj)))
4744                 GOTO(unlock, rc = -ENOENT);
4745
4746         if (!S_ISREG(lfsck_object_type(obj)))
4747                 GOTO(unlock, rc = -ENODATA);
4748
4749         rc = lfsck_layout_get_lovea(env, obj, buf);
4750         if (rc < 0)
4751                 GOTO(unlock, rc);
4752
4753         if (rc == 0)
4754                 GOTO(unlock, rc = -ENODATA);
4755
4756         lmm = buf->lb_buf;
4757         magic = le32_to_cpu(lmm->lmm_magic);
4758         if (magic == LOV_MAGIC_COMP_V1) {
4759                 struct lov_comp_md_v1 *lcm = buf->lb_buf;
4760                 struct lov_comp_md_entry_v1 *lcme;
4761
4762                 if (comp_id == 0)
4763                         GOTO(unlock, rc = -ENODATA);
4764
4765                 count = le16_to_cpu(lcm->lcm_entry_count);
4766                 for (i = 0; i < count; i++) {
4767                         lcme = &lcm->lcm_entries[i];
4768                         if (le32_to_cpu(lcme->lcme_id) == comp_id) {
4769                                 lmm = buf->lb_buf +
4770                                         le32_to_cpu(lcme->lcme_offset);
4771                                 magic = le32_to_cpu(lmm->lmm_magic);
4772                                 if (!(le32_to_cpu(lcme->lcme_flags) &
4773                                       LCME_FL_INIT))
4774                                         GOTO(unlock, rc = -ENODATA);
4775
4776                                 goto further;
4777                         }
4778                 }
4779
4780                 GOTO(unlock, rc = -ENODATA);
4781         }
4782
4783 further:
4784         if (magic == LOV_MAGIC_V1) {
4785                 objs = &lmm->lmm_objects[0];
4786         } else {
4787                 LASSERT(magic == LOV_MAGIC_V3);
4788                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
4789         }
4790
4791         fid_to_ostid(cfid, oi);
4792         count = le16_to_cpu(lmm->lmm_stripe_count);
4793         for (i = 0; i < count; i++, objs++) {
4794                 struct ost_id oi2;
4795
4796                 ostid_le_to_cpu(&objs->l_ost_oi, &oi2);
4797                 if (memcmp(oi, &oi2, sizeof(*oi)) == 0)
4798                         GOTO(unlock, rc = (i != idx ? -ENODATA : 0));
4799         }
4800
4801         GOTO(unlock, rc = -ENODATA);
4802
4803 unlock:
4804         dt_read_unlock(env, obj);
4805         lfsck_object_put(env, obj);
4806
4807         return rc;
4808 }
4809
4810 /*
4811  * The LFSCK-on-OST will ask the LFSCK-on-MDT to check whether the given
4812  * MDT-object/OST-object pairs match or not to aviod transfer MDT-object
4813  * layout EA from MDT to OST. On one hand, the OST no need to understand
4814  * the layout EA structure; on the other hand, it may cause trouble when
4815  * transfer large layout EA from MDT to OST via normal OUT RPC.
4816  *
4817  * \ret > 0: unrecognized stripe
4818  * \ret = 0: recognized stripe
4819  * \ret < 0: other failures
4820  */
4821 static int lfsck_layout_slave_check_pairs(const struct lu_env *env,
4822                                           struct lfsck_component *com,
4823                                           struct lu_fid *cfid,
4824                                           struct lu_fid *pfid, __u32 comp_id)
4825 {
4826         struct lfsck_instance    *lfsck  = com->lc_lfsck;
4827         struct obd_device        *obd    = lfsck->li_obd;
4828         struct seq_server_site   *ss     = lfsck_dev_site(lfsck);
4829         struct obd_export        *exp    = NULL;
4830         struct ptlrpc_request    *req    = NULL;
4831         struct lfsck_request     *lr;
4832         struct lu_seq_range      *range  = &lfsck_env_info(env)->lti_range;
4833         int                       rc     = 0;
4834         ENTRY;
4835
4836         if (unlikely(fid_is_idif(pfid)))
4837                 RETURN(1);
4838
4839         fld_range_set_any(range);
4840         rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(pfid), range);
4841         if (rc != 0)
4842                 RETURN(rc == -ENOENT ? 1 : rc);
4843
4844         if (unlikely(!fld_range_is_mdt(range)))
4845                 RETURN(1);
4846
4847         exp = lustre_find_lwp_by_index(obd->obd_name, range->lsr_index);
4848         if (unlikely(exp == NULL))
4849                 RETURN(1);
4850
4851         if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
4852                 GOTO(out, rc = -EOPNOTSUPP);
4853
4854         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
4855         if (req == NULL)
4856                 GOTO(out, rc = -ENOMEM);
4857
4858         rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
4859         if (rc != 0) {
4860                 ptlrpc_request_free(req);
4861
4862                 GOTO(out, rc);
4863         }
4864
4865         lr = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
4866         memset(lr, 0, sizeof(*lr));
4867         lr->lr_event = LE_PAIRS_VERIFY;
4868         lr->lr_active = LFSCK_TYPE_LAYOUT;
4869         lr->lr_fid = *cfid; /* OST-object itself FID. */
4870         lr->lr_fid2 = *pfid; /* The claimed parent FID. */
4871         lr->lr_comp_id = comp_id;
4872
4873         ptlrpc_request_set_replen(req);
4874         rc = ptlrpc_queue_wait(req);
4875         ptlrpc_req_finished(req);
4876
4877         if (rc == -ENOENT || rc == -ENODATA)
4878                 rc = 1;
4879
4880         GOTO(out, rc);
4881
4882 out:
4883         if (exp != NULL)
4884                 class_export_put(exp);
4885
4886         return rc;
4887 }
4888
4889 static int lfsck_layout_slave_repair_pfid(const struct lu_env *env,
4890                                           struct lfsck_component *com,
4891                                           struct lfsck_req_local *lrl)
4892 {
4893         struct dt_object        *obj;
4894         int                      rc     = 0;
4895         ENTRY;
4896
4897         obj = lfsck_object_find_bottom(env, com->lc_lfsck, &lrl->lrl_fid);
4898         if (IS_ERR(obj))
4899                 GOTO(log, rc = PTR_ERR(obj));
4900
4901         dt_write_lock(env, obj, 0);
4902         if (unlikely(dt_object_exists(obj) == 0 ||
4903                      lfsck_is_dead_obj(obj)))
4904                 GOTO(unlock, rc = 0);
4905
4906         rc = __lfsck_layout_update_pfid(env, obj, &lrl->lrl_ff_client.ff_parent,
4907                                         &lrl->lrl_ff_client.ff_layout,
4908                                         lrl->lrl_ff_client.ff_parent.f_ver);
4909
4910         GOTO(unlock, rc);
4911
4912 unlock:
4913         dt_write_unlock(env, obj);
4914         lfsck_object_put(env, obj);
4915
4916 log:
4917         CDEBUG(D_LFSCK, "%s: layout LFSCK slave repaired pfid for "DFID
4918                ", parent "DFID": rc = %d\n", lfsck_lfsck2name(com->lc_lfsck),
4919                PFID(&lrl->lrl_fid), PFID(&lrl->lrl_ff_client.ff_parent), rc);
4920
4921         return rc;
4922 }
4923
4924 /* layout APIs */
4925
4926 static void lfsck_layout_slave_quit(const struct lu_env *env,
4927                                     struct lfsck_component *com);
4928
4929 static int lfsck_layout_reset(const struct lu_env *env,
4930                               struct lfsck_component *com, bool init)
4931 {
4932         struct lfsck_layout     *lo    = com->lc_file_ram;
4933         int                      rc;
4934
4935         down_write(&com->lc_sem);
4936         if (init) {
4937                 memset(lo, 0, com->lc_file_size);
4938         } else {
4939                 __u32 count = lo->ll_success_count;
4940                 __u64 last_time = lo->ll_time_last_complete;
4941
4942                 memset(lo, 0, com->lc_file_size);
4943                 lo->ll_success_count = count;
4944                 lo->ll_time_last_complete = last_time;
4945         }
4946
4947         lo->ll_magic = LFSCK_LAYOUT_MAGIC;
4948         lo->ll_status = LS_INIT;
4949
4950         if (com->lc_lfsck->li_master) {
4951                 struct lfsck_assistant_data *lad = com->lc_data;
4952
4953                 lad->lad_incomplete = 0;
4954                 CFS_RESET_BITMAP(lad->lad_bitmap);
4955         }
4956
4957         rc = lfsck_layout_store(env, com);
4958         if (rc == 0 && com->lc_lfsck->li_master)
4959                 rc = lfsck_load_sub_trace_files(env, com,
4960                         &dt_lfsck_layout_dangling_features, LFSCK_LAYOUT, true);
4961         up_write(&com->lc_sem);
4962
4963         CDEBUG(D_LFSCK, "%s: layout LFSCK reset: rc = %d\n",
4964                lfsck_lfsck2name(com->lc_lfsck), rc);
4965
4966         return rc;
4967 }
4968
4969 static void lfsck_layout_fail(const struct lu_env *env,
4970                               struct lfsck_component *com, bool new_checked)
4971 {
4972         struct lfsck_layout *lo = com->lc_file_ram;
4973
4974         down_write(&com->lc_sem);
4975         if (new_checked)
4976                 com->lc_new_checked++;
4977         lfsck_layout_record_failure(env, com->lc_lfsck, lo);
4978         up_write(&com->lc_sem);
4979 }
4980
4981 static int lfsck_layout_master_checkpoint(const struct lu_env *env,
4982                                           struct lfsck_component *com, bool init)
4983 {
4984         struct lfsck_instance   *lfsck   = com->lc_lfsck;
4985         struct lfsck_layout     *lo      = com->lc_file_ram;
4986         int                      rc;
4987
4988         if (!init) {
4989                 rc = lfsck_checkpoint_generic(env, com);
4990                 if (rc != 0)
4991                         return rc > 0 ? 0 : rc;
4992         }
4993
4994         down_write(&com->lc_sem);
4995         if (init) {
4996                 lo->ll_pos_latest_start =
4997                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
4998         } else {
4999                 lo->ll_pos_last_checkpoint =
5000                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5001                 lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() +
5002                                 HALF_SEC - lfsck->li_time_last_checkpoint);
5003                 lo->ll_time_last_checkpoint = cfs_time_current_sec();
5004                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5005                 com->lc_new_checked = 0;
5006         }
5007
5008         rc = lfsck_layout_store(env, com);
5009         up_write(&com->lc_sem);
5010
5011         CDEBUG(D_LFSCK, "%s: layout LFSCK master checkpoint at the pos ["
5012                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5013                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5014
5015         return rc;
5016 }
5017
5018 static int lfsck_layout_slave_checkpoint(const struct lu_env *env,
5019                                          struct lfsck_component *com, bool init)
5020 {
5021         struct lfsck_instance   *lfsck = com->lc_lfsck;
5022         struct lfsck_layout     *lo    = com->lc_file_ram;
5023         int                      rc;
5024
5025         if (com->lc_new_checked == 0 && !init)
5026                 return 0;
5027
5028         down_write(&com->lc_sem);
5029         if (init) {
5030                 lo->ll_pos_latest_start =
5031                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5032         } else {
5033                 lo->ll_pos_last_checkpoint =
5034                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5035                 lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() +
5036                                 HALF_SEC - lfsck->li_time_last_checkpoint);
5037                 lo->ll_time_last_checkpoint = cfs_time_current_sec();
5038                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5039                 com->lc_new_checked = 0;
5040         }
5041
5042         rc = lfsck_layout_store(env, com);
5043         up_write(&com->lc_sem);
5044
5045         CDEBUG(D_LFSCK, "%s: layout LFSCK slave checkpoint at the pos ["
5046                "%llu], status = %d: rc = %d\n", lfsck_lfsck2name(lfsck),
5047                lfsck->li_pos_current.lp_oit_cookie, lo->ll_status, rc);
5048
5049         return rc;
5050 }
5051
5052 static int lfsck_layout_prep(const struct lu_env *env,
5053                              struct lfsck_component *com,
5054                              struct lfsck_start *start)
5055 {
5056         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5057         struct lfsck_layout     *lo     = com->lc_file_ram;
5058         struct lfsck_position   *pos    = &com->lc_pos_start;
5059
5060         fid_zero(&pos->lp_dir_parent);
5061         pos->lp_dir_cookie = 0;
5062         if (lo->ll_status == LS_COMPLETED ||
5063             lo->ll_status == LS_PARTIAL ||
5064             /* To handle orphan, must scan from the beginning. */
5065             (start != NULL && start->ls_flags & LPF_OST_ORPHAN)) {
5066                 int rc;
5067
5068                 rc = lfsck_layout_reset(env, com, false);
5069                 if (rc == 0)
5070                         rc = lfsck_set_param(env, lfsck, start, true);
5071
5072                 if (rc != 0) {
5073                         CDEBUG(D_LFSCK, "%s: layout LFSCK prep failed: "
5074                                "rc = %d\n", lfsck_lfsck2name(lfsck), rc);
5075
5076                         return rc;
5077                 }
5078         }
5079
5080         down_write(&com->lc_sem);
5081         lo->ll_time_latest_start = cfs_time_current_sec();
5082         spin_lock(&lfsck->li_lock);
5083         if (lo->ll_flags & LF_SCANNED_ONCE) {
5084                 if (!lfsck->li_drop_dryrun ||
5085                     lo->ll_pos_first_inconsistent == 0) {
5086                         lo->ll_status = LS_SCANNING_PHASE2;
5087                         list_move_tail(&com->lc_link,
5088                                        &lfsck->li_list_double_scan);
5089                         pos->lp_oit_cookie = 0;
5090                 } else {
5091                         int i;
5092
5093                         lo->ll_status = LS_SCANNING_PHASE1;
5094                         lo->ll_run_time_phase1 = 0;
5095                         lo->ll_run_time_phase2 = 0;
5096                         lo->ll_objs_checked_phase1 = 0;
5097                         lo->ll_objs_checked_phase2 = 0;
5098                         lo->ll_objs_failed_phase1 = 0;
5099                         lo->ll_objs_failed_phase2 = 0;
5100                         for (i = 0; i < LLIT_MAX; i++)
5101                                 lo->ll_objs_repaired[i] = 0;
5102
5103                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5104                         fid_zero(&com->lc_fid_latest_scanned_phase2);
5105                 }
5106         } else {
5107                 lo->ll_status = LS_SCANNING_PHASE1;
5108                 if (!lfsck->li_drop_dryrun ||
5109                     lo->ll_pos_first_inconsistent == 0)
5110                         pos->lp_oit_cookie = lo->ll_pos_last_checkpoint + 1;
5111                 else
5112                         pos->lp_oit_cookie = lo->ll_pos_first_inconsistent;
5113         }
5114         spin_unlock(&lfsck->li_lock);
5115         up_write(&com->lc_sem);
5116
5117         return 0;
5118 }
5119
5120 static int lfsck_layout_slave_prep(const struct lu_env *env,
5121                                    struct lfsck_component *com,
5122                                    struct lfsck_start_param *lsp)
5123 {
5124         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5125         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5126         struct lfsck_layout             *lo     = com->lc_file_ram;
5127         struct lfsck_start              *start  = lsp->lsp_start;
5128         int                              rc;
5129
5130         rc = lfsck_layout_prep(env, com, start);
5131         if (rc != 0)
5132                 return rc;
5133
5134         if (lo->ll_flags & LF_CRASHED_LASTID &&
5135             list_empty(&llsd->llsd_master_list)) {
5136                 LASSERT(lfsck->li_out_notify != NULL);
5137
5138                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5139                                      LE_LASTID_REBUILDING);
5140         }
5141
5142         if (!lsp->lsp_index_valid)
5143                 return 0;
5144
5145         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
5146         if (rc == 0 && start != NULL && start->ls_flags & LPF_OST_ORPHAN) {
5147                 LASSERT(!llsd->llsd_rbtree_valid);
5148
5149                 write_lock(&llsd->llsd_rb_lock);
5150                 rc = lfsck_rbtree_setup(env, com);
5151                 write_unlock(&llsd->llsd_rb_lock);
5152         }
5153
5154         CDEBUG(D_LFSCK, "%s: layout LFSCK slave prep done, start pos ["
5155                "%llu]\n", lfsck_lfsck2name(lfsck),
5156                com->lc_pos_start.lp_oit_cookie);
5157
5158         return rc;
5159 }
5160
5161 static int lfsck_layout_master_prep(const struct lu_env *env,
5162                                     struct lfsck_component *com,
5163                                     struct lfsck_start_param *lsp)
5164 {
5165         int rc;
5166         ENTRY;
5167
5168         rc = lfsck_layout_load_bitmap(env, com);
5169         if (rc != 0) {
5170                 rc = lfsck_layout_reset(env, com, false);
5171                 if (rc == 0)
5172                         rc = lfsck_set_param(env, com->lc_lfsck,
5173                                              lsp->lsp_start, true);
5174
5175                 if (rc != 0)
5176                         GOTO(log, rc);
5177         }
5178
5179         rc = lfsck_layout_prep(env, com, lsp->lsp_start);
5180         if (rc != 0)
5181                 RETURN(rc);
5182
5183         rc = lfsck_start_assistant(env, com, lsp);
5184
5185         GOTO(log, rc);
5186
5187 log:
5188         CDEBUG(D_LFSCK, "%s: layout LFSCK master prep done, start pos ["
5189                "%llu]\n", lfsck_lfsck2name(com->lc_lfsck),
5190                com->lc_pos_start.lp_oit_cookie);
5191
5192         return 0;
5193 }
5194
5195 /* Pre-fetch the attribute for each stripe in the given layout EA. */
5196 static int lfsck_layout_scan_stripes(const struct lu_env *env,
5197                                      struct lfsck_component *com,
5198                                      struct dt_object *parent,
5199                                      struct lov_mds_md_v1 *lmm, __u32 comp_id)
5200 {
5201         struct lfsck_thread_info        *info    = lfsck_env_info(env);
5202         struct lfsck_instance           *lfsck   = com->lc_lfsck;
5203         struct lfsck_bookmark           *bk      = &lfsck->li_bookmark_ram;
5204         struct lfsck_layout             *lo      = com->lc_file_ram;
5205         struct lfsck_assistant_data     *lad     = com->lc_data;
5206         struct lfsck_assistant_object   *lso     = NULL;
5207         struct lov_ost_data_v1          *objs;
5208         struct lfsck_tgt_descs          *ltds    = &lfsck->li_ost_descs;
5209         struct ptlrpc_thread            *mthread = &lfsck->li_thread;
5210         struct ptlrpc_thread            *athread = &lad->lad_thread;
5211         struct l_wait_info               lwi     = { 0 };
5212         struct lu_buf                    buf;
5213         int                              rc      = 0;
5214         int                              i;
5215         __u32                            magic;
5216         __u16                            count;
5217         ENTRY;
5218
5219         lfsck_buf_init(&buf, &info->lti_ff, sizeof(struct filter_fid));
5220         magic = le32_to_cpu(lmm->lmm_magic);
5221         if (magic == LOV_MAGIC_V1) {
5222                 objs = &lmm->lmm_objects[0];
5223         } else {
5224                 LASSERT(magic == LOV_MAGIC_V3);
5225                 objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
5226         }
5227
5228         count = le16_to_cpu(lmm->lmm_stripe_count);
5229         for (i = 0; i < count; i++, objs++) {
5230                 struct lu_fid           *fid    = &info->lti_fid;
5231                 struct ost_id           *oi     = &info->lti_oi;
5232                 struct lfsck_layout_req *llr;
5233                 struct lfsck_tgt_desc   *tgt    = NULL;
5234                 struct dt_object        *cobj   = NULL;
5235                 __u32                    index;
5236                 bool                     wakeup = false;
5237
5238                 if (unlikely(lovea_slot_is_dummy(objs)))
5239                         continue;
5240
5241                 l_wait_event(mthread->t_ctl_waitq,
5242                              lad->lad_prefetched < bk->lb_async_windows ||
5243                              !thread_is_running(mthread) ||
5244                              thread_is_stopped(athread),
5245                              &lwi);
5246
5247                 if (unlikely(!thread_is_running(mthread)) ||
5248                              thread_is_stopped(athread))
5249                         GOTO(out, rc = 0);
5250
5251                 if (unlikely(lfsck_is_dead_obj(parent)))
5252                         GOTO(out, rc = 0);
5253
5254                 ostid_le_to_cpu(&objs->l_ost_oi, oi);
5255                 index = le32_to_cpu(objs->l_ost_idx);
5256                 rc = ostid_to_fid(fid, oi, index);
5257                 if (rc != 0) {
5258                         CDEBUG(D_LFSCK, "%s: get invalid layout EA for "DFID
5259                                ": "DOSTID", idx %u, comp_id %u\n",
5260                                lfsck_lfsck2name(lfsck),
5261                                PFID(lfsck_dto2fid(parent)), POSTID(oi),
5262                                index, comp_id);
5263                         goto next;
5264                 }
5265
5266                 tgt = lfsck_tgt_get(ltds, index);
5267                 if (unlikely(tgt == NULL)) {
5268                         CDEBUG(D_LFSCK, "%s: cannot talk with OST %x which "
5269                                "did not join the layout LFSCK, comp_id %u\n",
5270                                lfsck_lfsck2name(lfsck), index, comp_id);
5271                         lfsck_lad_set_bitmap(env, com, index);
5272                         goto next;
5273                 }
5274
5275                 /* There is potential deadlock race condition between object
5276                  * destroy and layout LFSCK. Consider the following scenario:
5277                  *
5278                  * 1) The LFSCK thread obtained the parent object firstly, at
5279                  *    that time, the parent object has not been destroyed yet.
5280                  *
5281                  * 2) One RPC service thread destroyed the parent and all its
5282                  *    children objects. Because the LFSCK is referencing the
5283                  *    parent object, then the parent object will be marked as
5284                  *    dying in RAM. On the other hand, the parent object is
5285                  *    referencing all its children objects, then all children
5286                  *    objects will be marked as dying in RAM also.
5287                  *
5288                  * 3) The LFSCK thread tries to find some child object with
5289                  *    the parent object referenced. Then it will find that the
5290                  *    child object is dying. According to the object visibility
5291                  *    rules: the object with dying flag cannot be returned to
5292                  *    others. So the LFSCK thread has to wait until the dying
5293                  *    object has been purged from RAM, then it can allocate a
5294                  *    new object (with the same FID) in RAM. Unfortunately, the
5295                  *    LFSCK thread itself is referencing the parent object, and
5296                  *    cause the parent object cannot be purged, then cause the
5297                  *    child object cannot be purged also. So the LFSCK thread
5298                  *    will fall into deadlock.
5299                  */
5300                 cobj = lfsck_object_find_by_dev(env, tgt->ltd_tgt, fid);
5301                 if (IS_ERR(cobj)) {
5302                         if (lfsck_is_dead_obj(parent)) {
5303                                 lfsck_tgt_put(tgt);
5304
5305                                 GOTO(out, rc = 0);
5306                         }
5307
5308                         rc = PTR_ERR(cobj);
5309                         goto next;
5310                 }
5311
5312                 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_ASSISTANT_DIRECT)) {
5313                         rc = dt_declare_attr_get(env, cobj);
5314                         if (rc != 0)
5315                                 goto next;
5316
5317                         rc = dt_declare_xattr_get(env, cobj, &buf,
5318                                                   XATTR_NAME_FID);
5319                         if (rc != 0)
5320                                 goto next;
5321                 }
5322
5323                 if (lso == NULL) {
5324                         struct lu_attr *attr = &info->lti_la;
5325
5326                         rc = dt_attr_get(env, parent, attr);
5327                         if (rc != 0)
5328                                 goto next;
5329
5330                         lso = lfsck_assistant_object_init(env,
5331                                 lfsck_dto2fid(parent), attr,
5332                                 lfsck->li_pos_current.lp_oit_cookie, false);
5333                         if (IS_ERR(lso)) {
5334                                 rc = PTR_ERR(lso);
5335                                 lso = NULL;
5336
5337                                 goto next;
5338                         }
5339                 }
5340
5341                 llr = lfsck_layout_assistant_req_init(lso, cobj, comp_id,
5342                                                       index, i);
5343                 if (IS_ERR(llr)) {
5344                         rc = PTR_ERR(llr);
5345                         goto next;
5346                 }
5347
5348                 cobj = NULL;
5349                 spin_lock(&lad->lad_lock);
5350                 if (lad->lad_assistant_status < 0) {
5351                         spin_unlock(&lad->lad_lock);
5352                         lfsck_layout_assistant_req_fini(env, &llr->llr_lar);
5353                         lfsck_tgt_put(tgt);
5354                         RETURN(lad->lad_assistant_status);
5355                 }
5356
5357                 list_add_tail(&llr->llr_lar.lar_list, &lad->lad_req_list);
5358                 if (lad->lad_prefetched == 0)
5359                         wakeup = true;
5360
5361                 lad->lad_prefetched++;
5362                 spin_unlock(&lad->lad_lock);
5363                 if (wakeup)
5364                         wake_up_all(&athread->t_ctl_waitq);
5365
5366 next:
5367                 down_write(&com->lc_sem);
5368                 com->lc_new_checked++;
5369                 if (rc < 0)
5370                         lfsck_layout_record_failure(env, lfsck, lo);
5371                 up_write(&com->lc_sem);
5372
5373                 if (cobj != NULL && !IS_ERR(cobj))
5374                         lfsck_object_put(env, cobj);
5375
5376                 if (likely(tgt != NULL))
5377                         lfsck_tgt_put(tgt);
5378
5379                 if (rc < 0 && bk->lb_param & LPF_FAILOUT)
5380                         GOTO(out, rc);
5381         }
5382
5383         GOTO(out, rc = 0);
5384
5385 out:
5386         if (lso != NULL)
5387                 lfsck_assistant_object_put(env, lso);
5388
5389         return rc;
5390 }
5391
5392 /* For the given object, read its layout EA locally. For each stripe, pre-fetch
5393  * the OST-object's attribute and generate an structure lfsck_layout_req on the
5394  * list ::lad_req_list.
5395  *
5396  * For each request on above list, the lfsck_layout_assistant thread compares
5397  * the OST side attribute with local attribute, if inconsistent, then repair it.
5398  *
5399  * All above processing is async mode with pipeline. */
5400 static int lfsck_layout_master_exec_oit(const struct lu_env *env,
5401                                         struct lfsck_component *com,
5402                                         struct dt_object *obj)
5403 {
5404         struct lfsck_thread_info        *info   = lfsck_env_info(env);
5405         struct ost_id                   *oi     = &info->lti_oi;
5406         struct lfsck_layout             *lo     = com->lc_file_ram;
5407         struct lfsck_assistant_data     *lad    = com->lc_data;
5408         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5409         struct lfsck_bookmark           *bk     = &lfsck->li_bookmark_ram;
5410         struct thandle                  *handle = NULL;
5411         struct lu_buf                   *buf    = &info->lti_big_buf;
5412         struct lov_mds_md_v1            *lmm    = NULL;
5413         struct dt_device                *dev    = lfsck_obj2dev(obj);
5414         struct lustre_handle             lh     = { 0 };
5415         struct lu_buf                    ea_buf = { NULL };
5416         struct lov_comp_md_v1           *lcm    = NULL;
5417         struct lov_comp_md_entry_v1     *lcme   = NULL;
5418         int                              rc     = 0;
5419         int                              size   = 0;
5420         __u32                            magic  = 0;
5421         __u16                            count  = 0;
5422         bool                             locked = false;
5423         bool                             stripe = false;
5424         bool                             bad_oi = false;
5425         ENTRY;
5426
5427         if (!S_ISREG(lfsck_object_type(obj)))
5428                 GOTO(out, rc = 0);
5429
5430         if (lad->lad_assistant_status < 0)
5431                 GOTO(out, rc = -ESRCH);
5432
5433         fid_to_lmm_oi(lfsck_dto2fid(obj), oi);
5434         lmm_oi_cpu_to_le(oi, oi);
5435         dt_read_lock(env, obj, 0);
5436         locked = true;
5437
5438 again:
5439         bad_oi = false;
5440         if (dt_object_exists(obj) == 0 ||
5441             lfsck_is_dead_obj(obj))
5442                 GOTO(out, rc = 0);
5443
5444         rc = lfsck_layout_get_lovea(env, obj, buf);
5445         if (rc <= 0)
5446                 /* Skip bad lov EA during the 1st cycle scanning, and
5447                  * try to recover it via orphan in the 2nd scanning. */
5448                 GOTO(out, rc = (rc == -EINVAL ? 0 : rc));
5449
5450         size = rc;
5451         lmm = buf->lb_buf;
5452         magic = le32_to_cpu(lmm->lmm_magic);
5453         if (magic == LOV_MAGIC_COMP_V1) {
5454                 int i;
5455
5456                 lcm = buf->lb_buf;
5457                 count = le16_to_cpu(lcm->lcm_entry_count);
5458                 for (i = 0; i < count; i++) {
5459                         lcme = &lcm->lcm_entries[i];
5460                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5461                         if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) != 0)
5462                                 goto fix;
5463                 }
5464
5465                 GOTO(out, stripe = true);
5466         } else if (memcmp(oi, &lmm->lmm_oi, sizeof(*oi)) == 0) {
5467                 GOTO(out, stripe = true);
5468         }
5469
5470 fix:
5471         /* Inconsistent lmm_oi, should be repaired. */
5472         bad_oi = true;
5473
5474         if (bk->lb_param & LPF_DRYRUN) {
5475                 lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5476
5477                 GOTO(out, stripe = true);
5478         }
5479
5480         if (!lustre_handle_is_used(&lh)) {
5481                 dt_read_unlock(env, obj);
5482                 locked = false;
5483                 rc = lfsck_ibits_lock(env, lfsck, obj, &lh,
5484                                       MDS_INODELOCK_LAYOUT |
5485                                       MDS_INODELOCK_XATTR, LCK_EX);
5486                 if (rc != 0)
5487                         GOTO(out, rc);
5488
5489                 handle = dt_trans_create(env, dev);
5490                 if (IS_ERR(handle))
5491                         GOTO(out, rc = PTR_ERR(handle));
5492
5493                 lfsck_buf_init(&ea_buf, lmm, size);
5494                 rc = dt_declare_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5495                                           LU_XATTR_REPLACE, handle);
5496                 if (rc != 0)
5497                         GOTO(out, rc);
5498
5499                 rc = dt_trans_start_local(env, dev, handle);
5500                 if (rc != 0)
5501                         GOTO(out, rc);
5502
5503                 dt_write_lock(env, obj, 0);
5504                 locked = true;
5505
5506                 goto again;
5507         }
5508
5509         if (magic == LOV_MAGIC_COMP_V1) {
5510                 int i;
5511
5512                 for (i = 0; i < count; i++) {
5513                         lcme = &lcm->lcm_entries[i];
5514                         lmm = buf->lb_buf + le32_to_cpu(lcme->lcme_offset);
5515                         lmm->lmm_oi = *oi;
5516                 }
5517         } else {
5518                 lmm->lmm_oi = *oi;
5519         }
5520
5521         rc = dt_xattr_set(env, obj, &ea_buf, XATTR_NAME_LOV,
5522                           LU_XATTR_REPLACE, handle);
5523         if (rc != 0)
5524                 GOTO(out, rc);
5525
5526         lo->ll_objs_repaired[LLIT_OTHERS - 1]++;
5527
5528         GOTO(out, stripe = true);
5529
5530 out:
5531         if (locked) {
5532                 if (lustre_handle_is_used(&lh))
5533                         dt_write_unlock(env, obj);
5534                 else
5535                         dt_read_unlock(env, obj);
5536         }
5537
5538         if (handle != NULL && !IS_ERR(handle))
5539                 dt_trans_stop(env, dev, handle);
5540
5541         lfsck_ibits_unlock(&lh, LCK_EX);
5542
5543         if (bad_oi)
5544                 CDEBUG(D_LFSCK, "%s: layout LFSCK master %s bad lmm_oi for "
5545                        DFID": rc = %d\n", lfsck_lfsck2name(lfsck),
5546                        bk->lb_param & LPF_DRYRUN ? "found" : "repaired",
5547                        PFID(lfsck_dto2fid(obj)), rc);
5548
5549         if (stripe) {
5550                 if (magic == LOV_MAGIC_COMP_V1) {
5551                         int i;
5552
5553                         for (i = 0; i < count; i++) {
5554                                 lcme = &lcm->lcm_entries[i];
5555                                 if (!(le32_to_cpu(lcme->lcme_flags) &
5556                                       LCME_FL_INIT))
5557                                         continue;
5558
5559                                 rc = lfsck_layout_scan_stripes(env, com, obj,
5560                                         (struct lov_mds_md_v1 *)(buf->lb_buf +
5561                                         le32_to_cpu(lcme->lcme_offset)),
5562                                         le32_to_cpu(lcme->lcme_id));
5563                         }
5564                 } else {
5565                         rc = lfsck_layout_scan_stripes(env, com, obj, lmm, 0);
5566                 }
5567         } else {
5568                 down_write(&com->lc_sem);
5569                 com->lc_new_checked++;
5570                 if (rc < 0)
5571                         lfsck_layout_record_failure(env, lfsck, lo);
5572                 up_write(&com->lc_sem);
5573         }
5574
5575         return rc;
5576 }
5577
5578 static int lfsck_layout_slave_exec_oit(const struct lu_env *env,
5579                                        struct lfsck_component *com,
5580                                        struct dt_object *obj)
5581 {
5582         struct lfsck_instance           *lfsck  = com->lc_lfsck;
5583         struct lfsck_layout             *lo     = com->lc_file_ram;
5584         const struct lu_fid             *fid    = lfsck_dto2fid(obj);
5585         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
5586         struct lfsck_layout_seq         *lls;
5587         __u64                            seq;
5588         __u64                            oid;
5589         int                              rc;
5590         ENTRY;
5591
5592         LASSERT(llsd != NULL);
5593
5594         if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY5) &&
5595             cfs_fail_val == lfsck_dev_idx(lfsck)) {
5596                 struct l_wait_info       lwi = LWI_TIMEOUT(cfs_time_seconds(1),
5597                                                            NULL, NULL);
5598                 struct ptlrpc_thread    *thread = &lfsck->li_thread;
5599
5600                 l_wait_event(thread->t_ctl_waitq,
5601                              !thread_is_running(thread),
5602                              &lwi);
5603         }
5604
5605         lfsck_rbtree_update_bitmap(env, com, fid, false);
5606
5607         down_write(&com->lc_sem);
5608         if (fid_is_idif(fid))
5609                 seq = 0;
5610         else if (!fid_is_norm(fid) ||
5611                  !fid_is_for_ostobj(env, lfsck, obj, fid))
5612                 GOTO(unlock, rc = 0);
5613         else
5614                 seq = fid_seq(fid);
5615         com->lc_new_checked++;
5616
5617         lls = lfsck_layout_seq_lookup(llsd, seq);
5618         if (lls == NULL) {
5619                 OBD_ALLOC_PTR(lls);
5620                 if (unlikely(lls == NULL))
5621                         GOTO(unlock, rc = -ENOMEM);
5622
5623                 INIT_LIST_HEAD(&lls->lls_list);
5624                 lls->lls_seq = seq;
5625                 rc = lfsck_layout_lastid_load(env, com, lls);
5626                 if (rc != 0) {
5627                         CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5628                               "load LAST_ID for %#llx: rc = %d\n",
5629                               lfsck_lfsck2name(com->lc_lfsck), seq, rc);
5630                         lo->ll_objs_failed_phase1++;
5631                         OBD_FREE_PTR(lls);
5632                         GOTO(unlock, rc);
5633                 }
5634
5635                 lfsck_layout_seq_insert(llsd, lls);
5636         }
5637
5638         if (unlikely(fid_is_last_id(fid)))
5639                 GOTO(unlock, rc = 0);
5640
5641         if (fid_is_idif(fid))
5642                 oid = fid_idif_id(fid_seq(fid), fid_oid(fid), fid_ver(fid));
5643         else
5644                 oid = fid_oid(fid);
5645
5646         if (oid > lls->lls_lastid_known)
5647                 lls->lls_lastid_known = oid;
5648
5649         if (oid > lls->lls_lastid) {
5650                 if (!(lo->ll_flags & LF_CRASHED_LASTID)) {
5651                         /* OFD may create new objects during LFSCK scanning. */
5652                         rc = lfsck_layout_lastid_reload(env, com, lls);
5653                         if (unlikely(rc != 0)) {
5654                                 CDEBUG(D_LFSCK, "%s: layout LFSCK failed to "
5655                                       "reload LAST_ID for %#llx: rc = %d\n",
5656                                       lfsck_lfsck2name(com->lc_lfsck),
5657                                       lls->lls_seq, rc);
5658
5659                                 GOTO(unlock, rc);
5660                         }
5661
5662                         if (oid <= lls->lls_lastid ||
5663                             lo->ll_flags & LF_CRASHED_LASTID)
5664                                 GOTO(unlock, rc = 0);
5665
5666                         LASSERT(lfsck->li_out_notify != NULL);
5667
5668                         lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5669                                              LE_LASTID_REBUILDING);
5670                         lo->ll_flags |= LF_CRASHED_LASTID;
5671
5672                         CDEBUG(D_LFSCK, "%s: layout LFSCK finds crashed "
5673                                "LAST_ID file (2) for the sequence %#llx"
5674                                ", old value %llu, known value %llu\n",
5675                                lfsck_lfsck2name(lfsck), lls->lls_seq,
5676                                lls->lls_lastid, oid);
5677                 }
5678
5679                 lls->lls_lastid = oid;
5680                 lls->lls_dirty = 1;
5681         }
5682
5683         GOTO(unlock, rc = 0);
5684
5685 unlock:
5686         up_write(&com->lc_sem);
5687
5688         return rc;
5689 }
5690
5691 static int lfsck_layout_exec_dir(const struct lu_env *env,
5692                                  struct lfsck_component *com,
5693                                  struct lfsck_assistant_object *lso,
5694                                  struct lu_dirent *ent, __u16 type)
5695 {
5696         return 0;
5697 }
5698
5699 static int lfsck_layout_master_post(const struct lu_env *env,
5700                                     struct lfsck_component *com,
5701                                     int result, bool init)
5702 {
5703         struct lfsck_instance   *lfsck  = com->lc_lfsck;
5704         struct lfsck_layout     *lo     = com->lc_file_ram;
5705         int                      rc;
5706         ENTRY;
5707
5708         lfsck_post_generic(env, com, &result);
5709
5710         down_write(&com->lc_sem);
5711         spin_lock(&lfsck->li_lock);
5712         if (!init)
5713                 lo->ll_pos_last_checkpoint =
5714                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5715
5716         if (result > 0) {
5717                 if (lo->ll_flags & LF_INCOMPLETE)
5718                         lo->ll_status = LS_PARTIAL;
5719                 else
5720                         lo->ll_status = LS_SCANNING_PHASE2;
5721                 lo->ll_flags |= LF_SCANNED_ONCE;
5722                 lo->ll_flags &= ~LF_UPGRADE;
5723                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
5724         } else if (result == 0) {
5725                 if (lfsck->li_status != 0)
5726                         lo->ll_status = lfsck->li_status;
5727                 else
5728                         lo->ll_status = LS_STOPPED;
5729                 if (lo->ll_status != LS_PAUSED)
5730                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5731         } else {
5732                 lo->ll_status = LS_FAILED;
5733                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5734         }
5735         spin_unlock(&lfsck->li_lock);
5736
5737         if (!init) {
5738                 lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() +
5739                                 HALF_SEC - lfsck->li_time_last_checkpoint);
5740                 lo->ll_time_last_checkpoint = cfs_time_current_sec();
5741                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5742                 com->lc_new_checked = 0;
5743         }
5744
5745         rc = lfsck_layout_store(env, com);
5746         up_write(&com->lc_sem);
5747
5748         CDEBUG(D_LFSCK, "%s: layout LFSCK master post done: rc = %d\n",
5749                lfsck_lfsck2name(lfsck), rc);
5750
5751         RETURN(rc);
5752 }
5753
5754 static int lfsck_layout_slave_post(const struct lu_env *env,
5755                                    struct lfsck_component *com,
5756                                    int result, bool init)
5757 {
5758         struct lfsck_instance   *lfsck = com->lc_lfsck;
5759         struct lfsck_layout     *lo    = com->lc_file_ram;
5760         int                      rc;
5761         bool                     done  = false;
5762
5763         down_write(&com->lc_sem);
5764         rc = lfsck_layout_lastid_store(env, com);
5765         if (rc != 0)
5766                 result = rc;
5767
5768         LASSERT(lfsck->li_out_notify != NULL);
5769
5770         spin_lock(&lfsck->li_lock);
5771         if (!init)
5772                 lo->ll_pos_last_checkpoint =
5773                                 lfsck->li_pos_checkpoint.lp_oit_cookie;
5774
5775         if (result > 0) {
5776                 lo->ll_status = LS_SCANNING_PHASE2;
5777                 lo->ll_flags |= LF_SCANNED_ONCE;
5778                 if (lo->ll_flags & LF_CRASHED_LASTID) {
5779                         done = true;
5780                         lo->ll_flags &= ~LF_CRASHED_LASTID;
5781
5782                         CDEBUG(D_LFSCK, "%s: layout LFSCK has rebuilt "
5783                                "crashed LAST_ID files successfully\n",
5784                                lfsck_lfsck2name(lfsck));
5785                 }
5786                 lo->ll_flags &= ~LF_UPGRADE;
5787                 list_move_tail(&com->lc_link, &lfsck->li_list_double_scan);
5788         } else if (result == 0) {
5789                 if (lfsck->li_status != 0)
5790                         lo->ll_status = lfsck->li_status;
5791                 else
5792                         lo->ll_status = LS_STOPPED;
5793                 if (lo->ll_status != LS_PAUSED)
5794                         list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5795         } else {
5796                 lo->ll_status = LS_FAILED;
5797                 list_move_tail(&com->lc_link, &lfsck->li_list_idle);
5798         }
5799         spin_unlock(&lfsck->li_lock);
5800
5801         if (done)
5802                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
5803                                      LE_LASTID_REBUILT);
5804
5805         if (!init) {
5806                 lo->ll_run_time_phase1 += cfs_duration_sec(cfs_time_current() +
5807                                 HALF_SEC - lfsck->li_time_last_checkpoint);
5808                 lo->ll_time_last_checkpoint = cfs_time_current_sec();
5809                 lo->ll_objs_checked_phase1 += com->lc_new_checked;
5810                 com->lc_new_checked = 0;
5811         }
5812
5813         rc = lfsck_layout_store(env, com);
5814         up_write(&com->lc_sem);
5815
5816         lfsck_layout_slave_notify_master(env, com, LE_PHASE1_DONE, result);
5817
5818         CDEBUG(D_LFSCK, "%s: layout LFSCK slave post done: rc = %d\n",
5819                lfsck_lfsck2name(lfsck), rc);
5820
5821         return rc;
5822 }
5823
5824 static void lfsck_layout_dump(const struct lu_env *env,
5825                               struct lfsck_component *com, struct seq_file *m)
5826 {
5827         struct lfsck_instance   *lfsck = com->lc_lfsck;
5828         struct lfsck_bookmark   *bk    = &lfsck->li_bookmark_ram;
5829         struct lfsck_layout     *lo    = com->lc_file_ram;
5830         const char *prefix;
5831
5832         down_read(&com->lc_sem);
5833         if (bk->lb_param & LPF_DRYRUN)
5834                 prefix = "inconsistent";
5835         else
5836                 prefix = "repaired";
5837
5838         seq_printf(m, "name: lfsck_layout\n"
5839                    "magic: %#x\n"
5840                    "version: %d\n"
5841                    "status: %s\n",
5842                    lo->ll_magic,
5843                    bk->lb_version,
5844                    lfsck_status2name(lo->ll_status));
5845
5846         lfsck_bits_dump(m, lo->ll_flags, lfsck_flags_names, "flags");
5847
5848         lfsck_bits_dump(m, bk->lb_param, lfsck_param_names, "param");
5849
5850         lfsck_time_dump(m, lo->ll_time_last_complete, "last_completed");
5851
5852         lfsck_time_dump(m, lo->ll_time_latest_start, "latest_start");
5853
5854         lfsck_time_dump(m, lo->ll_time_last_checkpoint, "last_checkpoint");
5855
5856         seq_printf(m, "latest_start_position: %llu\n"
5857                    "last_checkpoint_position: %llu\n"
5858                    "first_failure_position: %llu\n",
5859                    lo->ll_pos_latest_start,
5860                    lo->ll_pos_last_checkpoint,
5861                    lo->ll_pos_first_inconsistent);
5862
5863         seq_printf(m, "success_count: %u\n"
5864                    "%s_dangling: %llu\n"
5865                    "%s_unmatched_pair: %llu\n"
5866                    "%s_multiple_referenced: %llu\n"
5867                    "%s_orphan: %llu\n"
5868                    "%s_inconsistent_owner: %llu\n"
5869                    "%s_others: %llu\n"
5870                    "skipped: %llu\n"
5871                    "failed_phase1: %llu\n"
5872                    "failed_phase2: %llu\n",
5873                    lo->ll_success_count,
5874                    prefix, lo->ll_objs_repaired[LLIT_DANGLING - 1],
5875                    prefix, lo->ll_objs_repaired[LLIT_UNMATCHED_PAIR - 1],
5876                    prefix, lo->ll_objs_repaired[LLIT_MULTIPLE_REFERENCED - 1],
5877                    prefix, lo->ll_objs_repaired[LLIT_ORPHAN - 1],
5878                    prefix, lo->ll_objs_repaired[LLIT_INCONSISTENT_OWNER - 1],
5879                    prefix, lo->ll_objs_repaired[LLIT_OTHERS - 1],
5880                    lo->ll_objs_skipped,
5881                    lo->ll_objs_failed_phase1,
5882                    lo->ll_objs_failed_phase2);
5883
5884         if (lo->ll_status == LS_SCANNING_PHASE1) {
5885                 __u64 pos;
5886                 cfs_duration_t duration = cfs_time_current() -
5887                                           lfsck->li_time_last_checkpoint;
5888                 __u64 checked = lo->ll_objs_checked_phase1 +
5889                                 com->lc_new_checked;
5890                 __u64 speed = checked;
5891                 __u64 new_checked = com->lc_new_checked *
5892                                     msecs_to_jiffies(MSEC_PER_SEC);
5893                 __u32 rtime = lo->ll_run_time_phase1 +
5894                               cfs_duration_sec(duration + HALF_SEC);
5895
5896                 if (duration != 0)
5897                         new_checked = div64_s64(new_checked, duration);
5898                 if (rtime != 0)
5899                         speed = div64_s64(speed, rtime);
5900                 seq_printf(m, "checked_phase1: %llu\n"
5901                            "checked_phase2: %llu\n"
5902                            "run_time_phase1: %u seconds\n"
5903                            "run_time_phase2: %u seconds\n"
5904                            "average_speed_phase1: %llu items/sec\n"
5905                            "average_speed_phase2: N/A\n"
5906                            "real-time_speed_phase1: %llu items/sec\n"
5907                            "real-time_speed_phase2: N/A\n",
5908                            checked,
5909                            lo->ll_objs_checked_phase2,
5910                            rtime,
5911                            lo->ll_run_time_phase2,
5912                            speed,
5913                            new_checked);
5914
5915                 if (likely(lfsck->li_di_oit)) {
5916                         const struct dt_it_ops *iops =
5917                                 &lfsck->li_obj_oit->do_index_ops->dio_it;
5918
5919                         /* The low layer otable-based iteration position may NOT
5920                          * exactly match the layout-based directory traversal
5921                          * cookie. Generally, it is not a serious issue. But the
5922                          * caller should NOT make assumption on that. */
5923                         pos = iops->store(env, lfsck->li_di_oit);
5924                         if (!lfsck->li_current_oit_processed)
5925                                 pos--;
5926                 } else {
5927                         pos = lo->ll_pos_last_checkpoint;
5928                 }
5929
5930                 seq_printf(m, "current_position: %llu\n", pos);
5931         } else if (lo->ll_status == LS_SCANNING_PHASE2) {
5932                 cfs_duration_t duration = cfs_time_current() -
5933                                           com->lc_time_last_checkpoint;
5934                 __u64 checked = lo->ll_objs_checked_phase2 +
5935                                 com->lc_new_checked;
5936                 __u64 speed1 = lo->ll_objs_checked_phase1;
5937                 __u64 speed2 = checked;
5938                 __u64 new_checked = com->lc_new_checked *
5939                                     msecs_to_jiffies(MSEC_PER_SEC);
5940                 __u32 rtime = lo->ll_run_time_phase2 +
5941                               cfs_duration_sec(duration + HALF_SEC);
5942
5943                 if (duration != 0)
5944                         new_checked = div64_s64(new_checked, duration);
5945                 if (lo->ll_run_time_phase1 != 0)
5946                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
5947                 if (rtime != 0)
5948                         speed2 = div64_s64(speed2, rtime);
5949                 seq_printf(m, "checked_phase1: %llu\n"
5950                            "checked_phase2: %llu\n"
5951                            "run_time_phase1: %u seconds\n"
5952                            "run_time_phase2: %u seconds\n"
5953                            "average_speed_phase1: %llu items/sec\n"
5954                            "average_speed_phase2: %llu items/sec\n"
5955                            "real-time_speed_phase1: N/A\n"
5956                            "real-time_speed_phase2: %llu items/sec\n"
5957                            "current_position: "DFID"\n",
5958                            lo->ll_objs_checked_phase1,
5959                            checked,
5960                            lo->ll_run_time_phase1,
5961                            rtime,
5962                            speed1,
5963                            speed2,
5964                            new_checked,
5965                            PFID(&com->lc_fid_latest_scanned_phase2));
5966         } else {
5967                 __u64 speed1 = lo->ll_objs_checked_phase1;
5968                 __u64 speed2 = lo->ll_objs_checked_phase2;
5969
5970                 if (lo->ll_run_time_phase1 != 0)
5971                         speed1 = div64_s64(speed1, lo->ll_run_time_phase1);
5972                 if (lo->ll_run_time_phase2 != 0)
5973                         speed2 = div64_s64(speed2, lo->ll_run_time_phase2);
5974                 seq_printf(m, "checked_phase1: %llu\n"
5975                            "checked_phase2: %llu\n"
5976                            "run_time_phase1: %u seconds\n"
5977                            "run_time_phase2: %u seconds\n"
5978                            "average_speed_phase1: %llu items/sec\n"
5979                            "average_speed_phase2: %llu objs/sec\n"
5980                            "real-time_speed_phase1: N/A\n"
5981                            "real-time_speed_phase2: N/A\n"
5982                            "current_position: N/A\n",
5983                            lo->ll_objs_checked_phase1,
5984                            lo->ll_objs_checked_phase2,
5985                            lo->ll_run_time_phase1,
5986                            lo->ll_run_time_phase2,
5987                            speed1,
5988                            speed2);
5989         }
5990
5991         up_read(&com->lc_sem);
5992 }
5993
5994 static int lfsck_layout_master_double_scan(const struct lu_env *env,
5995                                            struct lfsck_component *com)
5996 {
5997         struct lfsck_layout             *lo     = com->lc_file_ram;
5998         struct lfsck_assistant_data     *lad    = com->lc_data;
5999         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6000         struct lfsck_tgt_descs          *ltds;
6001         struct lfsck_tgt_desc           *ltd;
6002         struct lfsck_tgt_desc           *next;
6003         int                              rc;
6004
6005         rc = lfsck_double_scan_generic(env, com, lo->ll_status);
6006
6007         if (thread_is_stopped(&lad->lad_thread)) {
6008                 LASSERT(list_empty(&lad->lad_req_list));
6009                 LASSERT(list_empty(&lad->lad_ost_phase1_list));
6010                 LASSERT(list_empty(&lad->lad_mdt_phase1_list));
6011
6012                 ltds = &lfsck->li_ost_descs;
6013                 spin_lock(&ltds->ltd_lock);
6014                 list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6015                                          ltd_layout_phase_list) {
6016                         list_del_init(&ltd->ltd_layout_phase_list);
6017                 }
6018                 spin_unlock(&ltds->ltd_lock);
6019
6020                 ltds = &lfsck->li_mdt_descs;
6021                 spin_lock(&ltds->ltd_lock);
6022                 list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6023                                          ltd_layout_phase_list) {
6024                         list_del_init(&ltd->ltd_layout_phase_list);
6025                 }
6026                 spin_unlock(&ltds->ltd_lock);
6027         }
6028
6029         return rc;
6030 }
6031
6032 static int lfsck_layout_slave_double_scan(const struct lu_env *env,
6033                                           struct lfsck_component *com)
6034 {
6035         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6036         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
6037         struct lfsck_layout             *lo     = com->lc_file_ram;
6038         struct ptlrpc_thread            *thread = &lfsck->li_thread;
6039         int                              rc;
6040         ENTRY;
6041
6042         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan start\n",
6043                lfsck_lfsck2name(lfsck));
6044
6045         atomic_inc(&lfsck->li_double_scan_count);
6046
6047         if (lo->ll_flags & LF_INCOMPLETE)
6048                 GOTO(done, rc = 1);
6049
6050         com->lc_new_checked = 0;
6051         com->lc_new_scanned = 0;
6052         com->lc_time_last_checkpoint = cfs_time_current();
6053         com->lc_time_next_checkpoint = com->lc_time_last_checkpoint +
6054                                 cfs_time_seconds(LFSCK_CHECKPOINT_INTERVAL);
6055
6056         while (1) {
6057                 struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30),
6058                                                      NULL, NULL);
6059
6060                 rc = lfsck_layout_slave_query_master(env, com);
6061                 if (list_empty(&llsd->llsd_master_list)) {
6062                         if (unlikely(!thread_is_running(thread)))
6063                                 rc = 0;
6064                         else
6065                                 rc = 1;
6066
6067                         GOTO(done, rc);
6068                 }
6069
6070                 if (rc < 0)
6071                         GOTO(done, rc);
6072
6073                 rc = l_wait_event(thread->t_ctl_waitq,
6074                                   !thread_is_running(thread) ||
6075                                   lo->ll_flags & LF_INCOMPLETE ||
6076                                   list_empty(&llsd->llsd_master_list),
6077                                   &lwi);
6078                 if (unlikely(!thread_is_running(thread)))
6079                         GOTO(done, rc = 0);
6080
6081                 if (lo->ll_flags & LF_INCOMPLETE)
6082                         GOTO(done, rc = 1);
6083
6084                 if (rc == -ETIMEDOUT)
6085                         continue;
6086
6087                 GOTO(done, rc = (rc < 0 ? rc : 1));
6088         }
6089
6090 done:
6091         rc = lfsck_layout_double_scan_result(env, com, rc);
6092         lfsck_layout_slave_notify_master(env, com, LE_PHASE2_DONE,
6093                         (rc > 0 && lo->ll_flags & LF_INCOMPLETE) ? 0 : rc);
6094         lfsck_layout_slave_quit(env, com);
6095         if (atomic_dec_and_test(&lfsck->li_double_scan_count))
6096                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6097
6098         CDEBUG(D_LFSCK, "%s: layout LFSCK slave phase2 scan finished, "
6099                "status %d: rc = %d\n",
6100                lfsck_lfsck2name(lfsck), lo->ll_status, rc);
6101
6102         return rc;
6103 }
6104
6105 static void lfsck_layout_master_data_release(const struct lu_env *env,
6106                                              struct lfsck_component *com)
6107 {
6108         struct lfsck_assistant_data     *lad    = com->lc_data;
6109         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6110         struct lfsck_tgt_descs          *ltds;
6111         struct lfsck_tgt_desc           *ltd;
6112         struct lfsck_tgt_desc           *next;
6113
6114         LASSERT(lad != NULL);
6115         LASSERT(thread_is_init(&lad->lad_thread) ||
6116                 thread_is_stopped(&lad->lad_thread));
6117         LASSERT(list_empty(&lad->lad_req_list));
6118
6119         com->lc_data = NULL;
6120
6121         ltds = &lfsck->li_ost_descs;
6122         spin_lock(&ltds->ltd_lock);
6123         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6124                                  ltd_layout_phase_list) {
6125                 list_del_init(&ltd->ltd_layout_phase_list);
6126         }
6127         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6128                                  ltd_layout_phase_list) {
6129                 list_del_init(&ltd->ltd_layout_phase_list);
6130         }
6131         list_for_each_entry_safe(ltd, next, &lad->lad_ost_list,
6132                                  ltd_layout_list) {
6133                 list_del_init(&ltd->ltd_layout_list);
6134         }
6135         spin_unlock(&ltds->ltd_lock);
6136
6137         ltds = &lfsck->li_mdt_descs;
6138         spin_lock(&ltds->ltd_lock);
6139         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6140                                  ltd_layout_phase_list) {
6141                 list_del_init(&ltd->ltd_layout_phase_list);
6142         }
6143         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6144                                  ltd_layout_phase_list) {
6145                 list_del_init(&ltd->ltd_layout_phase_list);
6146         }
6147         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_list,
6148                                  ltd_layout_list) {
6149                 list_del_init(&ltd->ltd_layout_list);
6150         }
6151         spin_unlock(&ltds->ltd_lock);
6152
6153         if (likely(lad->lad_bitmap != NULL))
6154                 CFS_FREE_BITMAP(lad->lad_bitmap);
6155
6156         OBD_FREE_PTR(lad);
6157 }
6158
6159 static void lfsck_layout_slave_data_release(const struct lu_env *env,
6160                                             struct lfsck_component *com)
6161 {
6162         struct lfsck_layout_slave_data *llsd = com->lc_data;
6163
6164         lfsck_layout_slave_quit(env, com);
6165         com->lc_data = NULL;
6166         OBD_FREE_PTR(llsd);
6167 }
6168
6169 static void lfsck_layout_master_quit(const struct lu_env *env,
6170                                      struct lfsck_component *com)
6171 {
6172         struct lfsck_assistant_data     *lad    = com->lc_data;
6173         struct lfsck_instance           *lfsck  = com->lc_lfsck;
6174         struct lfsck_tgt_descs          *ltds;
6175         struct lfsck_tgt_desc           *ltd;
6176         struct lfsck_tgt_desc           *next;
6177
6178         LASSERT(lad != NULL);
6179
6180         lfsck_quit_generic(env, com);
6181
6182         LASSERT(thread_is_init(&lad->lad_thread) ||
6183                 thread_is_stopped(&lad->lad_thread));
6184         LASSERT(list_empty(&lad->lad_req_list));
6185
6186         ltds = &lfsck->li_ost_descs;
6187         spin_lock(&ltds->ltd_lock);
6188         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase1_list,
6189                                  ltd_layout_phase_list) {
6190                 list_del_init(&ltd->ltd_layout_phase_list);
6191         }
6192         list_for_each_entry_safe(ltd, next, &lad->lad_ost_phase2_list,
6193                                  ltd_layout_phase_list) {
6194                 list_del_init(&ltd->ltd_layout_phase_list);
6195         }
6196         spin_unlock(&ltds->ltd_lock);
6197
6198         ltds = &lfsck->li_mdt_descs;
6199         spin_lock(&ltds->ltd_lock);
6200         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase1_list,
6201                                  ltd_layout_phase_list) {
6202                 list_del_init(&ltd->ltd_layout_phase_list);
6203         }
6204         list_for_each_entry_safe(ltd, next, &lad->lad_mdt_phase2_list,
6205                                  ltd_layout_phase_list) {
6206                 list_del_init(&ltd->ltd_layout_phase_list);
6207         }
6208         spin_unlock(&ltds->ltd_lock);
6209 }
6210
6211 static void lfsck_layout_slave_quit(const struct lu_env *env,
6212                                     struct lfsck_component *com)
6213 {
6214         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6215         struct lfsck_layout_seq          *lls;
6216         struct lfsck_layout_seq          *next;
6217         struct lfsck_layout_slave_target *llst;
6218
6219         LASSERT(llsd != NULL);
6220
6221         down_write(&com->lc_sem);
6222         list_for_each_entry_safe(lls, next, &llsd->llsd_seq_list,
6223                                  lls_list) {
6224                 list_del_init(&lls->lls_list);
6225                 lfsck_object_put(env, lls->lls_lastid_obj);
6226                 OBD_FREE_PTR(lls);
6227         }
6228         up_write(&com->lc_sem);
6229
6230         spin_lock(&llsd->llsd_lock);
6231         while (!list_empty(&llsd->llsd_master_list)) {
6232                 llst = list_entry(llsd->llsd_master_list.next,
6233                                   struct lfsck_layout_slave_target, llst_list);
6234                 list_del_init(&llst->llst_list);
6235                 spin_unlock(&llsd->llsd_lock);
6236                 lfsck_layout_llst_put(llst);
6237                 spin_lock(&llsd->llsd_lock);
6238         }
6239         spin_unlock(&llsd->llsd_lock);
6240
6241         lfsck_rbtree_cleanup(env, com);
6242 }
6243
6244 static int lfsck_layout_master_in_notify(const struct lu_env *env,
6245                                          struct lfsck_component *com,
6246                                          struct lfsck_request *lr)
6247 {
6248         struct lfsck_instance           *lfsck = com->lc_lfsck;
6249         struct lfsck_layout             *lo    = com->lc_file_ram;
6250         struct lfsck_assistant_data     *lad   = com->lc_data;
6251         struct lfsck_tgt_descs          *ltds;
6252         struct lfsck_tgt_desc           *ltd;
6253         bool                             fail  = false;
6254         ENTRY;
6255
6256         if (lr->lr_event == LE_PAIRS_VERIFY) {
6257                 int rc;
6258
6259                 rc = lfsck_layout_master_check_pairs(env, com, &lr->lr_fid,
6260                                                      &lr->lr_fid2,
6261                                                      lr->lr_comp_id);
6262
6263                 RETURN(rc);
6264         }
6265
6266         CDEBUG(D_LFSCK, "%s: layout LFSCK master handles notify %u "
6267                "from %s %x, status %d, flags %x, flags2 %x\n",
6268                lfsck_lfsck2name(lfsck), lr->lr_event,
6269                (lr->lr_flags & LEF_FROM_OST) ? "OST" : "MDT",
6270                lr->lr_index, lr->lr_status, lr->lr_flags, lr->lr_flags2);
6271
6272         if (lr->lr_event != LE_PHASE1_DONE &&
6273             lr->lr_event != LE_PHASE2_DONE &&
6274             lr->lr_event != LE_PEER_EXIT)
6275                 RETURN(-EINVAL);
6276
6277         if (lr->lr_flags & LEF_FROM_OST)
6278                 ltds = &lfsck->li_ost_descs;
6279         else
6280                 ltds = &lfsck->li_mdt_descs;
6281         spin_lock(&ltds->ltd_lock);
6282         ltd = lfsck_ltd2tgt(ltds, lr->lr_index);
6283         if (ltd == NULL) {
6284                 spin_unlock(&ltds->ltd_lock);
6285
6286                 RETURN(-ENXIO);
6287         }
6288
6289         list_del_init(&ltd->ltd_layout_phase_list);
6290         switch (lr->lr_event) {
6291         case LE_PHASE1_DONE:
6292                 if (lr->lr_status <= 0 || lr->lr_flags2 & LF_INCOMPLETE) {
6293                         if (lr->lr_flags2 & LF_INCOMPLETE) {
6294                                 if (lr->lr_flags & LEF_FROM_OST)
6295                                         lfsck_lad_set_bitmap(env, com,
6296                                                              ltd->ltd_index);
6297                                 else
6298                                         lo->ll_flags |= LF_INCOMPLETE;
6299                         }
6300                         ltd->ltd_layout_done = 1;
6301                         list_del_init(&ltd->ltd_layout_list);
6302                         fail = true;
6303                         break;
6304                 }
6305
6306                 if (lr->lr_flags & LEF_FROM_OST) {
6307                         if (list_empty(&ltd->ltd_layout_list))
6308                                 list_add_tail(&ltd->ltd_layout_list,
6309                                               &lad->lad_ost_list);
6310                         list_add_tail(&ltd->ltd_layout_phase_list,
6311                                       &lad->lad_ost_phase2_list);
6312                 } else {
6313                         if (list_empty(&ltd->ltd_layout_list))
6314                                 list_add_tail(&ltd->ltd_layout_list,
6315                                               &lad->lad_mdt_list);
6316                         list_add_tail(&ltd->ltd_layout_phase_list,
6317                                       &lad->lad_mdt_phase2_list);
6318                 }
6319                 break;
6320         case LE_PHASE2_DONE:
6321                 ltd->ltd_layout_done = 1;
6322                 if (!list_empty(&ltd->ltd_layout_list))
6323                         list_del_init(&ltd->ltd_layout_list);
6324
6325                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6326                         lfsck_lad_set_bitmap(env, com, ltd->ltd_index);
6327                         fail = true;
6328                 }
6329
6330                 break;
6331         case LE_PEER_EXIT:
6332                 fail = true;
6333                 ltd->ltd_layout_done = 1;
6334                 list_del_init(&ltd->ltd_layout_list);
6335                 if (!(lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) &&
6336                     !(lr->lr_flags & LEF_FROM_OST))
6337                                 lo->ll_flags |= LF_INCOMPLETE;
6338                 break;
6339         default:
6340                 break;
6341         }
6342         spin_unlock(&ltds->ltd_lock);
6343
6344         if (fail && lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT) {
6345                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6346
6347                 memset(stop, 0, sizeof(*stop));
6348                 stop->ls_status = lr->lr_status;
6349                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6350                 lfsck_stop(env, lfsck->li_bottom, stop);
6351         } else if (lfsck_phase2_next_ready(lad)) {
6352                 wake_up_all(&lad->lad_thread.t_ctl_waitq);
6353         }
6354
6355         RETURN(0);
6356 }
6357
6358 static int lfsck_layout_slave_in_notify_local(const struct lu_env *env,
6359                                               struct lfsck_component *com,
6360                                               struct lfsck_req_local *lrl,
6361                                               struct thandle *th)
6362 {
6363         ENTRY;
6364
6365         switch (lrl->lrl_event) {
6366         case LEL_FID_ACCESSED:
6367                 lfsck_rbtree_update_bitmap(env, com, &lrl->lrl_fid, true);
6368                 RETURN(0);
6369         case LEL_PAIRS_VERIFY_LOCAL: {
6370                 int rc;
6371
6372                 lrl->lrl_status = LPVS_INIT;
6373                 /* Firstly, if the MDT-object which is claimed via OST-object
6374                  * local stored PFID xattr recognizes the OST-object, then it
6375                  * must be that the client given PFID is wrong. */
6376                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6377                                 &lrl->lrl_ff_local.ff_parent,
6378                                 lrl->lrl_ff_local.ff_layout.ol_comp_id);
6379                 if (rc <= 0)
6380                         RETURN(0);
6381
6382                 lrl->lrl_status = LPVS_INCONSISTENT;
6383                 /* The OST-object local stored PFID xattr is stale. We need to
6384                  * check whether the MDT-object that is claimed via the client
6385                  * given PFID information recognizes the OST-object or not. If
6386                  * matches, then need to update the OST-object's PFID xattr. */
6387                 rc = lfsck_layout_slave_check_pairs(env, com, &lrl->lrl_fid,
6388                                 &lrl->lrl_ff_client.ff_parent,
6389                                 lrl->lrl_ff_client.ff_layout.ol_comp_id);
6390                 /* For rc < 0 case:
6391                  * We are not sure whether the client given PFID information
6392                  * is correct or not, do nothing to avoid improper fixing.
6393                  *
6394                  * For rc > 0 case:
6395                  * The client given PFID information is also invalid, we can
6396                  * NOT fix the OST-object inconsistency.
6397                  */
6398                 if (!rc) {
6399                         lrl->lrl_status = LPVS_INCONSISTENT_TOFIX;
6400                         rc = lfsck_layout_slave_repair_pfid(env, com, lrl);
6401                 }
6402
6403                 RETURN(rc);
6404         }
6405         default:
6406                 break;
6407         }
6408
6409         RETURN(-EOPNOTSUPP);
6410 }
6411
6412 static int lfsck_layout_slave_in_notify(const struct lu_env *env,
6413                                         struct lfsck_component *com,
6414                                         struct lfsck_request *lr)
6415 {
6416         struct lfsck_instance *lfsck = com->lc_lfsck;
6417         struct lfsck_layout_slave_data *llsd = com->lc_data;
6418         struct lfsck_layout_slave_target *llst;
6419         int rc;
6420         ENTRY;
6421
6422         switch (lr->lr_event) {
6423         case LE_CONDITIONAL_DESTROY:
6424                 rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
6425                 RETURN(rc);
6426         case LE_PHASE1_DONE: {
6427                 if (lr->lr_flags2 & LF_INCOMPLETE) {
6428                         struct lfsck_layout *lo = com->lc_file_ram;
6429
6430                         lo->ll_flags |= LF_INCOMPLETE;
6431                         llst = lfsck_layout_llst_find_and_del(llsd,
6432                                                               lr->lr_index,
6433                                                               true);
6434                         if (llst != NULL) {
6435                                 lfsck_layout_llst_put(llst);
6436                                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6437                         }
6438                 }
6439
6440                 RETURN(0);
6441         }
6442         case LE_PHASE2_DONE:
6443         case LE_PEER_EXIT:
6444                 CDEBUG(D_LFSCK, "%s: layout LFSCK slave handle notify %u "
6445                        "from MDT %x, status %d\n", lfsck_lfsck2name(lfsck),
6446                        lr->lr_event, lr->lr_index, lr->lr_status);
6447                 break;
6448         default:
6449                 RETURN(-EINVAL);
6450         }
6451
6452         llst = lfsck_layout_llst_find_and_del(llsd, lr->lr_index, true);
6453         if (llst == NULL)
6454                 RETURN(0);
6455
6456         lfsck_layout_llst_put(llst);
6457         if (list_empty(&llsd->llsd_master_list))
6458                 wake_up_all(&lfsck->li_thread.t_ctl_waitq);
6459
6460         if (lr->lr_event == LE_PEER_EXIT &&
6461             (lfsck->li_bookmark_ram.lb_param & LPF_FAILOUT ||
6462              (list_empty(&llsd->llsd_master_list) &&
6463               (lr->lr_status == LS_STOPPED ||
6464                lr->lr_status == LS_CO_STOPPED)))) {
6465                 struct lfsck_stop *stop = &lfsck_env_info(env)->lti_stop;
6466
6467                 memset(stop, 0, sizeof(*stop));
6468                 stop->ls_status = lr->lr_status;
6469                 stop->ls_flags = lr->lr_param & ~LPF_BROADCAST;
6470                 lfsck_stop(env, lfsck->li_bottom, stop);
6471         }
6472
6473         RETURN(0);
6474 }
6475
6476 static void lfsck_layout_repaired(struct lfsck_layout *lo, __u64 *count)
6477 {
6478         int i;
6479
6480         for (i = 0; i < LLIT_MAX; i++)
6481                 *count += lo->ll_objs_repaired[i];
6482 }
6483
6484 static int lfsck_layout_query_all(const struct lu_env *env,
6485                                   struct lfsck_component *com,
6486                                   __u32 *mdts_count, __u32 *osts_count,
6487                                   __u64 *repaired)
6488 {
6489         struct lfsck_layout *lo = com->lc_file_ram;
6490         struct lfsck_tgt_descs *ltds;
6491         struct lfsck_tgt_desc *ltd;
6492         int idx;
6493         int rc;
6494         ENTRY;
6495
6496         rc = lfsck_query_all(env, com);
6497         if (rc != 0)
6498                 RETURN(rc);
6499
6500         ltds = &com->lc_lfsck->li_mdt_descs;
6501         down_read(&ltds->ltd_rw_sem);
6502         cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
6503                 ltd = lfsck_ltd2tgt(ltds, idx);
6504                 LASSERT(ltd != NULL);
6505
6506                 mdts_count[ltd->ltd_layout_status]++;
6507                 *repaired += ltd->ltd_layout_repaired;
6508         }
6509         up_read(&ltds->ltd_rw_sem);
6510
6511         ltds = &com->lc_lfsck->li_ost_descs;
6512         down_read(&ltds->ltd_rw_sem);
6513         cfs_foreach_bit(ltds->ltd_tgts_bitmap, idx) {
6514                 ltd = lfsck_ltd2tgt(ltds, idx);
6515                 LASSERT(ltd != NULL);
6516
6517                 osts_count[ltd->ltd_layout_status]++;
6518                 *repaired += ltd->ltd_layout_repaired;
6519         }
6520         up_read(&ltds->ltd_rw_sem);
6521
6522         down_read(&com->lc_sem);
6523         mdts_count[lo->ll_status]++;
6524         lfsck_layout_repaired(lo, repaired);
6525         up_read(&com->lc_sem);
6526
6527         RETURN(0);
6528 }
6529
6530 static int lfsck_layout_query(const struct lu_env *env,
6531                               struct lfsck_component *com,
6532                               struct lfsck_request *req,
6533                               struct lfsck_reply *rep,
6534                               struct lfsck_query *que, int idx)
6535 {
6536         struct lfsck_layout *lo = com->lc_file_ram;
6537         int rc = 0;
6538
6539         if (que != NULL) {
6540                 LASSERT(com->lc_lfsck->li_master);
6541
6542                 rc = lfsck_layout_query_all(env, com,
6543                                             que->lu_mdts_count[idx],
6544                                             que->lu_osts_count[idx],
6545                                             &que->lu_repaired[idx]);
6546         } else {
6547                 down_read(&com->lc_sem);
6548                 rep->lr_status = lo->ll_status;
6549                 if (req->lr_flags & LEF_QUERY_ALL)
6550                         lfsck_layout_repaired(lo, &rep->lr_repaired);
6551                 up_read(&com->lc_sem);
6552         }
6553
6554         return rc;
6555 }
6556
6557 /* with lfsck::li_lock held */
6558 static int lfsck_layout_slave_join(const struct lu_env *env,
6559                                    struct lfsck_component *com,
6560                                    struct lfsck_start_param *lsp)
6561 {
6562         struct lfsck_instance            *lfsck = com->lc_lfsck;
6563         struct lfsck_layout_slave_data   *llsd  = com->lc_data;
6564         struct lfsck_layout_slave_target *llst;
6565         struct lfsck_start               *start = lsp->lsp_start;
6566         int                               rc    = 0;
6567         ENTRY;
6568
6569         if (start == NULL || !(start->ls_flags & LPF_OST_ORPHAN))
6570                 RETURN(0);
6571
6572         if (!lsp->lsp_index_valid)
6573                 RETURN(-EINVAL);
6574
6575         /* If someone is running the LFSCK without orphan handling,
6576          * it will not maintain the object accessing rbtree. So we
6577          * cannot join it for orphan handling. */
6578         if (!llsd->llsd_rbtree_valid)
6579                 RETURN(-EBUSY);
6580
6581         spin_unlock(&lfsck->li_lock);
6582         rc = lfsck_layout_llst_add(llsd, lsp->lsp_index);
6583         spin_lock(&lfsck->li_lock);
6584         if (rc == 0 && !thread_is_running(&lfsck->li_thread)) {
6585                 spin_unlock(&lfsck->li_lock);
6586                 llst = lfsck_layout_llst_find_and_del(llsd, lsp->lsp_index,
6587                                                       true);
6588                 if (llst != NULL)
6589                         lfsck_layout_llst_put(llst);
6590                 spin_lock(&lfsck->li_lock);
6591                 rc = -EAGAIN;
6592         }
6593
6594         RETURN(rc);
6595 }
6596
6597 static struct lfsck_operations lfsck_layout_master_ops = {
6598         .lfsck_reset            = lfsck_layout_reset,
6599         .lfsck_fail             = lfsck_layout_fail,
6600         .lfsck_checkpoint       = lfsck_layout_master_checkpoint,
6601         .lfsck_prep             = lfsck_layout_master_prep,
6602         .lfsck_exec_oit         = lfsck_layout_master_exec_oit,
6603         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6604         .lfsck_post             = lfsck_layout_master_post,
6605         .lfsck_dump             = lfsck_layout_dump,
6606         .lfsck_double_scan      = lfsck_layout_master_double_scan,
6607         .lfsck_data_release     = lfsck_layout_master_data_release,
6608         .lfsck_quit             = lfsck_layout_master_quit,
6609         .lfsck_in_notify        = lfsck_layout_master_in_notify,
6610         .lfsck_query            = lfsck_layout_query,
6611 };
6612
6613 static struct lfsck_operations lfsck_layout_slave_ops = {
6614         .lfsck_reset            = lfsck_layout_reset,
6615         .lfsck_fail             = lfsck_layout_fail,
6616         .lfsck_checkpoint       = lfsck_layout_slave_checkpoint,
6617         .lfsck_prep             = lfsck_layout_slave_prep,
6618         .lfsck_exec_oit         = lfsck_layout_slave_exec_oit,
6619         .lfsck_exec_dir         = lfsck_layout_exec_dir,
6620         .lfsck_post             = lfsck_layout_slave_post,
6621         .lfsck_dump             = lfsck_layout_dump,
6622         .lfsck_double_scan      = lfsck_layout_slave_double_scan,
6623         .lfsck_data_release     = lfsck_layout_slave_data_release,
6624         .lfsck_quit             = lfsck_layout_slave_quit,
6625         .lfsck_in_notify_local  = lfsck_layout_slave_in_notify_local,
6626         .lfsck_in_notify        = lfsck_layout_slave_in_notify,
6627         .lfsck_query            = lfsck_layout_query,
6628         .lfsck_join             = lfsck_layout_slave_join,
6629 };
6630
6631 static void lfsck_layout_assistant_fill_pos(const struct lu_env *env,
6632                                             struct lfsck_component *com,
6633                                             struct lfsck_position *pos)
6634 {
6635         struct lfsck_assistant_data     *lad = com->lc_data;
6636         struct lfsck_layout_req         *llr;
6637
6638         if (((struct lfsck_layout *)(com->lc_file_ram))->ll_status !=
6639             LS_SCANNING_PHASE1)
6640                 return;
6641
6642         if (list_empty(&lad->lad_req_list))
6643                 return;
6644
6645         llr = list_entry(lad->lad_req_list.next,
6646                          struct lfsck_layout_req,
6647                          llr_lar.lar_list);
6648         pos->lp_oit_cookie = llr->llr_lar.lar_parent->lso_oit_cookie - 1;
6649 }
6650
6651 struct lfsck_assistant_operations lfsck_layout_assistant_ops = {
6652         .la_handler_p1          = lfsck_layout_assistant_handler_p1,
6653         .la_handler_p2          = lfsck_layout_assistant_handler_p2,
6654         .la_fill_pos            = lfsck_layout_assistant_fill_pos,
6655         .la_double_scan_result  = lfsck_layout_double_scan_result,
6656         .la_req_fini            = lfsck_layout_assistant_req_fini,
6657         .la_sync_failures       = lfsck_layout_assistant_sync_failures,
6658 };
6659
6660 int lfsck_layout_setup(const struct lu_env *env, struct lfsck_instance *lfsck)
6661 {
6662         struct lfsck_component  *com;
6663         struct lfsck_layout     *lo;
6664         struct dt_object        *root = NULL;
6665         struct dt_object        *obj;
6666         int                      i;
6667         int                      rc;
6668         ENTRY;
6669
6670         OBD_ALLOC_PTR(com);
6671         if (com == NULL)
6672                 RETURN(-ENOMEM);
6673
6674         INIT_LIST_HEAD(&com->lc_link);
6675         INIT_LIST_HEAD(&com->lc_link_dir);
6676         init_rwsem(&com->lc_sem);
6677         atomic_set(&com->lc_ref, 1);
6678         com->lc_lfsck = lfsck;
6679         com->lc_type = LFSCK_TYPE_LAYOUT;
6680         if (lfsck->li_master) {
6681                 com->lc_ops = &lfsck_layout_master_ops;
6682                 com->lc_data = lfsck_assistant_data_init(
6683                                 &lfsck_layout_assistant_ops,
6684                                 LFSCK_LAYOUT);
6685                 if (com->lc_data == NULL)
6686                         GOTO(out, rc = -ENOMEM);
6687
6688                 for (i = 0; i < LFSCK_STF_COUNT; i++)
6689                         mutex_init(&com->lc_sub_trace_objs[i].lsto_mutex);
6690         } else {
6691                 struct lfsck_layout_slave_data *llsd;
6692
6693                 com->lc_ops = &lfsck_layout_slave_ops;
6694                 OBD_ALLOC_PTR(llsd);
6695                 if (llsd == NULL)
6696                         GOTO(out, rc = -ENOMEM);
6697
6698                 INIT_LIST_HEAD(&llsd->llsd_seq_list);
6699                 INIT_LIST_HEAD(&llsd->llsd_master_list);
6700                 spin_lock_init(&llsd->llsd_lock);
6701                 llsd->llsd_rb_root = RB_ROOT;
6702                 rwlock_init(&llsd->llsd_rb_lock);
6703                 com->lc_data = llsd;
6704         }
6705         com->lc_file_size = sizeof(*lo);
6706         OBD_ALLOC(com->lc_file_ram, com->lc_file_size);
6707         if (com->lc_file_ram == NULL)
6708                 GOTO(out, rc = -ENOMEM);
6709
6710         OBD_ALLOC(com->lc_file_disk, com->lc_file_size);
6711         if (com->lc_file_disk == NULL)
6712                 GOTO(out, rc = -ENOMEM);
6713
6714         root = dt_locate(env, lfsck->li_bottom, &lfsck->li_local_root_fid);
6715         if (IS_ERR(root))
6716                 GOTO(out, rc = PTR_ERR(root));
6717
6718         if (unlikely(!dt_try_as_dir(env, root)))
6719                 GOTO(out, rc = -ENOTDIR);
6720
6721         obj = local_file_find_or_create(env, lfsck->li_los, root,
6722                                         LFSCK_LAYOUT,
6723                                         S_IFREG | S_IRUGO | S_IWUSR);
6724         if (IS_ERR(obj))
6725                 GOTO(out, rc = PTR_ERR(obj));
6726
6727         com->lc_obj = obj;
6728         rc = lfsck_layout_load(env, com);
6729         if (rc > 0)
6730                 rc = lfsck_layout_reset(env, com, true);
6731         else if (rc == -ENOENT)
6732                 rc = lfsck_layout_init(env, com);
6733         else if (lfsck->li_master)
6734                 rc = lfsck_load_sub_trace_files(env, com,
6735                                 &dt_lfsck_layout_dangling_features,
6736                                 LFSCK_LAYOUT, false);
6737
6738         if (rc != 0)
6739                 GOTO(out, rc);
6740
6741         lo = com->lc_file_ram;
6742         switch (lo->ll_status) {
6743         case LS_INIT:
6744         case LS_COMPLETED:
6745         case LS_FAILED:
6746         case LS_STOPPED:
6747         case LS_PARTIAL:
6748                 spin_lock(&lfsck->li_lock);
6749                 list_add_tail(&com->lc_link, &lfsck->li_list_idle);
6750                 spin_unlock(&lfsck->li_lock);
6751                 break;
6752         default:
6753                 CERROR("%s: unknown lfsck_layout status %d\n",
6754                        lfsck_lfsck2name(lfsck), lo->ll_status);
6755                 /* fall through */
6756         case LS_SCANNING_PHASE1:
6757         case LS_SCANNING_PHASE2:
6758                 /* No need to store the status to disk right now.
6759                  * If the system crashed before the status stored,
6760                  * it will be loaded back when next time. */
6761                 lo->ll_status = LS_CRASHED;
6762                 if (!lfsck->li_master)
6763                         lo->ll_flags |= LF_INCOMPLETE;
6764                 /* fall through */
6765         case LS_PAUSED:
6766         case LS_CRASHED:
6767         case LS_CO_FAILED:
6768         case LS_CO_STOPPED:
6769         case LS_CO_PAUSED:
6770                 spin_lock(&lfsck->li_lock);
6771                 list_add_tail(&com->lc_link, &lfsck->li_list_scan);
6772                 spin_unlock(&lfsck->li_lock);
6773                 break;
6774         }
6775
6776         if (lo->ll_flags & LF_CRASHED_LASTID) {
6777                 LASSERT(lfsck->li_out_notify != NULL);
6778
6779                 lfsck->li_out_notify(env, lfsck->li_out_notify_data,
6780                                      LE_LASTID_REBUILDING);
6781         }
6782
6783         GOTO(out, rc = 0);
6784
6785 out:
6786         if (root != NULL && !IS_ERR(root))
6787                 lfsck_object_put(env, root);
6788
6789         if (rc != 0) {
6790                 lfsck_component_cleanup(env, com);
6791                 CERROR("%s: fail to init layout LFSCK component: rc = %d\n",
6792                        lfsck_lfsck2name(lfsck), rc);
6793         }
6794
6795         return rc;
6796 }
6797
6798 struct lfsck_orphan_it {
6799         struct lfsck_component           *loi_com;
6800         struct lfsck_rbtree_node         *loi_lrn;
6801         struct lfsck_layout_slave_target *loi_llst;
6802         struct lu_fid                     loi_key;
6803         struct lu_orphan_rec_v2           loi_rec;
6804         __u64                             loi_hash;
6805         unsigned int                      loi_over:1;
6806 };
6807
6808 static int lfsck_fid_match_idx(const struct lu_env *env,
6809                                struct lfsck_instance *lfsck,
6810                                const struct lu_fid *fid, int idx)
6811 {
6812         struct seq_server_site  *ss;
6813         struct lu_server_fld    *sf;
6814         struct lu_seq_range     *range = &lfsck_env_info(env)->lti_range;
6815         int                      rc;
6816
6817         /* All abnormal cases will be returned to MDT0. */
6818         if (!fid_is_norm(fid)) {
6819                 if (idx == 0)
6820                         return 1;
6821
6822                 return 0;
6823         }
6824
6825         ss = lfsck_dev_site(lfsck);
6826         if (unlikely(ss == NULL))
6827                 return -ENOTCONN;
6828
6829         sf = ss->ss_server_fld;
6830         LASSERT(sf != NULL);
6831
6832         fld_range_set_any(range);
6833         rc = fld_server_lookup(env, sf, fid_seq(fid), range);
6834         if (rc != 0)
6835                 return rc;
6836
6837         if (!fld_range_is_mdt(range))
6838                 return -EINVAL;
6839
6840         if (range->lsr_index == idx)
6841                 return 1;
6842
6843         return 0;
6844 }
6845
6846 static void lfsck_layout_destroy_orphan(const struct lu_env *env,
6847                                         struct dt_object *obj)
6848 {
6849         struct dt_device        *dev    = lfsck_obj2dev(obj);
6850         struct thandle          *handle;
6851         int                      rc;
6852         ENTRY;
6853
6854         handle = dt_trans_create(env, dev);
6855         if (IS_ERR(handle))
6856                 RETURN_EXIT;
6857
6858         rc = dt_declare_ref_del(env, obj, handle);
6859         if (rc != 0)
6860                 GOTO(stop, rc);
6861
6862         rc = dt_declare_destroy(env, obj, handle);
6863         if (rc != 0)
6864                 GOTO(stop, rc);
6865
6866         rc = dt_trans_start_local(env, dev, handle);
6867         if (rc != 0)
6868                 GOTO(stop, rc);
6869
6870         dt_write_lock(env, obj, 0);
6871         rc = dt_ref_del(env, obj, handle);
6872         if (rc == 0)
6873                 rc = dt_destroy(env, obj, handle);
6874         dt_write_unlock(env, obj);
6875
6876         GOTO(stop, rc);
6877
6878 stop:
6879         dt_trans_stop(env, dev, handle);
6880
6881         CDEBUG(D_LFSCK, "destroy orphan OST-object "DFID": rc = %d\n",
6882                PFID(lfsck_dto2fid(obj)), rc);
6883
6884         RETURN_EXIT;
6885 }
6886
/**
 * dio_lookup handler of the orphan index: lookup is not supported.
 *
 * \retval              -EOPNOTSUPP always
 */
static int lfsck_orphan_index_lookup(const struct lu_env *env,
                                     struct dt_object *dt, struct dt_rec *rec,
                                     const struct dt_key *key)
{
        /* All arguments are ignored. */
        return -EOPNOTSUPP;
}
6894
/**
 * dio_declare_insert handler of the orphan index: insertion is not
 * supported.
 *
 * \retval              -EOPNOTSUPP always
 */
static int lfsck_orphan_index_declare_insert(const struct lu_env *env,
                                             struct dt_object *dt,
                                             const struct dt_rec *rec,
                                             const struct dt_key *key,
                                             struct thandle *handle)
{
        /* All arguments are ignored. */
        return -EOPNOTSUPP;
}
6903
/**
 * dio_insert handler of the orphan index: insertion is not supported.
 *
 * \retval              -EOPNOTSUPP always
 */
static int lfsck_orphan_index_insert(const struct lu_env *env,
                                     struct dt_object *dt,
                                     const struct dt_rec *rec,
                                     const struct dt_key *key,
                                     struct thandle *handle, int ignore_quota)
{
        /* All arguments are ignored. */
        return -EOPNOTSUPP;
}
6913
/**
 * dio_declare_delete handler of the orphan index: deletion is not
 * supported.
 *
 * \retval              -EOPNOTSUPP always
 */
static int lfsck_orphan_index_declare_delete(const struct lu_env *env,
                                             struct dt_object *dt,
                                             const struct dt_key *key,
                                             struct thandle *handle)
{
        /* All arguments are ignored. */
        return -EOPNOTSUPP;
}
6921
/**
 * dio_delete handler of the orphan index: deletion is not supported.
 *
 * \retval              -EOPNOTSUPP always
 */
static int lfsck_orphan_index_delete(const struct lu_env *env,
                                     struct dt_object *dt,
                                     const struct dt_key *key,
                                     struct thandle *handle)
{
        /* All arguments are ignored. */
        return -EOPNOTSUPP;
}
6929
6930 static struct dt_it *lfsck_orphan_it_init(const struct lu_env *env,
6931                                           struct dt_object *dt,
6932                                           __u32 attr)
6933 {
6934         struct dt_device                *dev    = lu2dt_dev(dt->do_lu.lo_dev);
6935         struct lfsck_instance           *lfsck;
6936         struct lfsck_component          *com    = NULL;
6937         struct lfsck_layout_slave_data  *llsd;
6938         struct lfsck_orphan_it          *it     = NULL;
6939         struct lfsck_layout             *lo;
6940         int                              rc     = 0;
6941         ENTRY;
6942
6943         lfsck = lfsck_instance_find(dev, true, false);
6944         if (unlikely(lfsck == NULL))
6945                 RETURN(ERR_PTR(-ENXIO));
6946
6947         com = lfsck_component_find(lfsck, LFSCK_TYPE_LAYOUT);
6948         if (unlikely(com == NULL))
6949                 GOTO(out, rc = -ENOENT);
6950
6951         lo = com->lc_file_ram;
6952         if (lo->ll_flags & LF_INCOMPLETE)
6953                 GOTO(out, rc = -ESRCH);
6954
6955         llsd = com->lc_data;
6956         if (!llsd->llsd_rbtree_valid)
6957                 GOTO(out, rc = -ESRCH);
6958
6959         OBD_ALLOC_PTR(it);
6960         if (it == NULL)
6961                 GOTO(out, rc = -ENOMEM);
6962
6963         it->loi_llst = lfsck_layout_llst_find_and_del(llsd, attr, false);
6964         if (it->loi_llst == NULL)
6965                 GOTO(out, rc = -ENXIO);
6966
6967         if (dev->dd_record_fid_accessed) {
6968                 /* The first iteration against the rbtree, scan the whole rbtree
6969                  * to remove the nodes which do NOT need to be handled. */
6970                 write_lock(&llsd->llsd_rb_lock);
6971                 if (dev->dd_record_fid_accessed) {
6972                         struct rb_node                  *node;
6973                         struct rb_node                  *next;
6974                         struct lfsck_rbtree_node        *lrn;
6975
6976                         /* No need to record the fid accessing anymore. */
6977                         dev->dd_record_fid_accessed = 0;
6978
6979                         node = rb_first(&llsd->llsd_rb_root);
6980                         while (node != NULL) {
6981                                 next = rb_next(node);
6982                                 lrn = rb_entry(node, struct lfsck_rbtree_node,
6983                                                lrn_node);
6984                                 if (atomic_read(&lrn->lrn_known_count) <=
6985                                     atomic_read(&lrn->lrn_accessed_count)) {
6986                                         rb_erase(node, &llsd->llsd_rb_root);
6987                                         lfsck_rbtree_free(lrn);
6988                                 }
6989                                 node = next;
6990                         }
6991                 }
6992                 write_unlock(&llsd->llsd_rb_lock);
6993         }
6994
6995         /* read lock the rbtree when init, and unlock when fini */
6996         read_lock(&llsd->llsd_rb_lock);
6997         it->loi_com = com;
6998         com = NULL;
6999
7000         GOTO(out, rc = 0);
7001
7002 out:
7003         if (com != NULL)
7004                 lfsck_component_put(env, com);
7005
7006         CDEBUG(D_LFSCK, "%s: init the orphan iteration: rc = %d\n",
7007                lfsck_lfsck2name(lfsck), rc);
7008
7009         lfsck_instance_put(env, lfsck);
7010         if (rc != 0) {
7011                 if (it != NULL)
7012                         OBD_FREE_PTR(it);
7013
7014                 it = (struct lfsck_orphan_it *)ERR_PTR(rc);
7015         }
7016
7017         return (struct dt_it *)it;
7018 }
7019
7020 static void lfsck_orphan_it_fini(const struct lu_env *env,
7021                                  struct dt_it *di)
7022 {
7023         struct lfsck_orphan_it           *it    = (struct lfsck_orphan_it *)di;
7024         struct lfsck_component           *com   = it->loi_com;
7025         struct lfsck_layout_slave_data   *llsd;
7026         struct lfsck_layout_slave_target *llst;
7027
7028         if (com != NULL) {
7029                 CDEBUG(D_LFSCK, "%s: fini the orphan iteration\n",
7030                        lfsck_lfsck2name(com->lc_lfsck));
7031
7032                 llsd = com->lc_data;
7033                 read_unlock(&llsd->llsd_rb_lock);
7034                 llst = it->loi_llst;
7035                 LASSERT(llst != NULL);
7036
7037                 /* Save the key and hash for iterate next. */
7038                 llst->llst_fid = it->loi_key;
7039                 llst->llst_hash = it->loi_hash;
7040                 lfsck_layout_llst_put(llst);
7041                 lfsck_component_put(env, com);
7042         }
7043         OBD_FREE_PTR(it);
7044 }
7045
7046 /**
7047  * \retval       +1: the iteration finished
7048  * \retval        0: on success, not finished
7049  * \retval      -ve: on error
7050  */
7051 static int lfsck_orphan_it_next(const struct lu_env *env,
7052                                 struct dt_it *di)
7053 {
7054         struct lfsck_thread_info        *info   = lfsck_env_info(env);
7055         struct filter_fid               *ff     = &info->lti_ff;
7056         struct lu_attr                  *la     = &info->lti_la;
7057         struct lfsck_orphan_it          *it     = (struct lfsck_orphan_it *)di;
7058         struct lu_fid                   *key    = &it->loi_key;
7059         struct lu_orphan_rec_v2         *rec    = &it->loi_rec;
7060         struct ost_layout               *ol     = &rec->lor_layout;
7061         struct lfsck_component          *com    = it->loi_com;
7062         struct lfsck_instance           *lfsck  = com->lc_lfsck;
7063         struct lfsck_layout_slave_data  *llsd   = com->lc_data;
7064         struct dt_object                *obj;
7065         struct lfsck_rbtree_node        *lrn;
7066         int                              pos;
7067         int                              rc;
7068         __u32                            save;
7069         __u32                            idx    = it->loi_llst->llst_index;
7070         bool                             exact  = false;
7071         ENTRY;
7072
7073         if (it->loi_over)
7074                 RETURN(1);
7075
7076 again0:
7077         lrn = it->loi_lrn;
7078         if (lrn == NULL) {
7079                 lrn = lfsck_rbtree_search(llsd, key, &exact);
7080                 if (lrn == NULL) {
7081                         it->loi_over = 1;
7082                         RETURN(1);
7083                 }
7084
7085                 it->loi_lrn = lrn;
7086                 if (!exact) {
7087                         key->f_seq = lrn->lrn_seq;
7088                         key->f_oid = lrn->lrn_first_oid;
7089                         key->f_ver = 0;
7090                 }
7091         } else {
7092                 key->f_oid++;
7093                 if (unlikely(key->f_oid == 0)) {
7094                         key->f_seq++;
7095                         it->loi_lrn = NULL;
7096                         goto again0;
7097                 }
7098
7099                 if (key->f_oid >=
7100                     lrn->lrn_first_oid + LFSCK_RBTREE_BITMAP_WIDTH) {
7101                         it->loi_lrn = NULL;
7102                         goto again0;
7103                 }
7104         }
7105
7106         if (unlikely(atomic_read(&lrn->lrn_known_count) <=
7107                      atomic_read(&lrn->lrn_accessed_count))) {
7108                 struct rb_node *next = rb_next(&lrn->lrn_node);
7109
7110                 while (next != NULL) {
7111                         lrn = rb_entry(next, struct lfsck_rbtree_node,
7112                                        lrn_node);
7113                         if (atomic_read(&lrn->lrn_known_count) >
7114                             atomic_read(&lrn->lrn_accessed_count))
7115                                 break;
7116                         next = rb_next(next);
7117                 }
7118
7119                 if (next == NULL) {
7120                         it->loi_over = 1;
7121                         RETURN(1);
7122                 }
7123
7124                 it->loi_lrn = lrn;
7125                 key->f_seq = lrn->lrn_seq;
7126                 key->f_oid = lrn->lrn_first_oid;
7127                 key->f_ver = 0;
7128         }
7129
7130         pos = key->f_oid - lrn->lrn_first_oid;
7131
7132 again1:
7133         pos = find_next_bit(lrn->lrn_known_bitmap,
7134                             LFSCK_RBTREE_BITMAP_WIDTH, pos);
7135         if (pos >= LFSCK_RBTREE_BITMAP_WIDTH) {
7136                 key->f_oid = lrn->lrn_first_oid + pos;
7137                 if (unlikely(key->f_oid < lrn->lrn_first_oid)) {
7138                         key->f_seq++;
7139                         key->f_oid = 0;
7140                 }
7141                 it->loi_lrn = NULL;
7142                 goto again0;
7143         }
7144
7145         if (test_bit(pos, lrn->lrn_accessed_bitmap)) {
7146                 pos++;
7147                 goto again1;
7148         }
7149
7150         key->f_oid = lrn->lrn_first_oid + pos;
7151         obj = lfsck_object_find_bottom(env, lfsck, key);
7152         if (IS_ERR(obj)) {
7153                 rc = PTR_ERR(obj);
7154                 if (rc == -ENOENT) {
7155                         pos++;
7156                         goto again1;
7157                 }
7158                 RETURN(rc);
7159         }
7160
7161         dt_read_lock(env, obj, 0);
7162         if (dt_object_exists(obj) == 0 ||
7163             lfsck_is_dead_obj(obj)) {
7164                 dt_read_unlock(env, obj);
7165                 lfsck_object_put(env, obj);
7166                 pos++;
7167                 goto again1;
7168         }
7169
7170         rc = dt_attr_get(env, obj, la);
7171         if (rc != 0)
7172                 GOTO(out, rc);
7173
7174         rc = dt_xattr_get(env, obj, lfsck_buf_get(env, ff, sizeof(*ff)),
7175                           XATTR_NAME_FID);
7176         if (rc == -ENODATA) {
7177                 /* For the pre-created OST-object, update the bitmap to avoid
7178                  * others LFSCK (second phase) iteration to touch it again. */
7179                 if (la->la_ctime == 0) {
7180                         if (!test_and_set_bit(pos, lrn->lrn_accessed_bitmap))
7181                                 atomic_inc(&lrn->lrn_accessed_count);
7182
7183                         /* For the race between repairing dangling referenced
7184                          * MDT-object and unlink the file, it may left orphan
7185                          * OST-object there. Destroy it now! */
7186                         if (unlikely(!(la->la_mode & S_ISUID))) {
7187                                 dt_read_unlock(env, obj);
7188                                 lfsck_layout_destroy_orphan(env, obj);
7189                                 lfsck_object_put(env, obj);
7190                                 pos++;
7191                                 goto again1;
7192                         }
7193                 } else if (idx == 0) {
7194                         /* If the orphan OST-object has no parent information,
7195                          * regard it as referenced by the MDT-object on MDT0. */
7196                         fid_zero(&rec->lor_rec.lor_fid);
7197                         rec->lor_rec.lor_uid = la->la_uid;
7198                         rec->lor_rec.lor_gid = la->la_gid;
7199                         memset(ol, 0, sizeof(*ol));
7200
7201                         GOTO(out, rc = 0);
7202                 }
7203
7204                 dt_read_unlock(env, obj);
7205                 lfsck_object_put(env, obj);
7206                 pos++;
7207                 goto again1;
7208         }
7209
7210         if (rc < sizeof(struct lu_fid))
7211                 GOTO(out, rc = (rc < 0 ? rc : -EINVAL));
7212
7213         fid_le_to_cpu(&rec->lor_rec.lor_fid, &ff->ff_parent);
7214         /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
7215          * MDT-object's FID::f_ver, instead it is the OST-object index in its
7216          * parent MDT-object's layout EA. */
7217         save = rec->lor_rec.lor_fid.f_stripe_idx;
7218         rec->lor_rec.lor_fid.f_ver = 0;
7219         rc = lfsck_fid_match_idx(env, lfsck, &rec->lor_rec.lor_fid, idx);
7220         /* If the orphan OST-object does not claim the MDT, then next.
7221          *
7222          * If we do not know whether it matches or not, then return it
7223          * to the MDT for further check. */
7224         if (rc == 0) {
7225                 dt_read_unlock(env, obj);
7226                 lfsck_object_put(env, obj);
7227                 pos++;
7228                 goto again1;
7229         }
7230
7231         rec->lor_rec.lor_fid.f_stripe_idx = save;
7232         rec->lor_rec.lor_uid = la->la_uid;
7233         rec->lor_rec.lor_gid = la->la_gid;
7234         ost_layout_le_to_cpu(ol, &ff->ff_layout);
7235
7236         CDEBUG(D_LFSCK, "%s: return orphan "DFID", PFID "DFID", owner %u:%u, "
7237                "stripe size %u, stripe count %u, COMP id %u, COMP start %llu, "
7238                "COMP end %llu\n", lfsck_lfsck2name(com->lc_lfsck), PFID(key),
7239                PFID(&rec->lor_rec.lor_fid), rec->lor_rec.lor_uid,
7240                rec->lor_rec.lor_gid, ol->ol_stripe_size, ol->ol_stripe_count,
7241                ol->ol_comp_id, ol->ol_comp_start, ol->ol_comp_end);
7242
7243         GOTO(out, rc = 0);
7244
7245 out:
7246         dt_read_unlock(env, obj);
7247         lfsck_object_put(env, obj);
7248         if (rc == 0)
7249                 it->loi_hash++;
7250
7251         return rc;
7252 }
7253
7254 /**
7255  * \retval       +1: locate to the exactly position
7256  * \retval        0: cannot locate to the exactly position,
7257  *                   call next() to move to a valid position.
7258  * \retval      -ve: on error
7259  */
7260 static int lfsck_orphan_it_get(const struct lu_env *env,
7261                                struct dt_it *di,
7262                                const struct dt_key *key)
7263 {
7264         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7265         int                      rc;
7266
7267         it->loi_key = *(struct lu_fid *)key;
7268         rc = lfsck_orphan_it_next(env, di);
7269         if (rc == 1)
7270                 return 0;
7271
7272         if (rc == 0)
7273                 return 1;
7274
7275         return rc;
7276 }
7277
/**
 * put handler of the orphan iterator: intentionally does nothing.
 */
static void lfsck_orphan_it_put(const struct lu_env *env, struct dt_it *di)
{
        /* Nothing to release here. */
}
7282
7283 static struct dt_key *lfsck_orphan_it_key(const struct lu_env *env,
7284                                           const struct dt_it *di)
7285 {
7286         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7287
7288         return (struct dt_key *)&it->loi_key;
7289 }
7290
7291 static int lfsck_orphan_it_key_size(const struct lu_env *env,
7292                                     const struct dt_it *di)
7293 {
7294         return sizeof(struct lu_fid);
7295 }
7296
7297 static int lfsck_orphan_it_rec(const struct lu_env *env,
7298                                const struct dt_it *di,
7299                                struct dt_rec *rec,
7300                                __u32 attr)
7301 {
7302         struct lfsck_orphan_it *it = (struct lfsck_orphan_it *)di;
7303
7304         *(struct lu_orphan_rec_v2 *)rec = it->loi_rec;
7305
7306         return 0;
7307 }
7308
7309 static __u64 lfsck_orphan_it_store(const struct lu_env *env,
7310                                    const struct dt_it *di)
7311 {
7312         struct lfsck_orphan_it  *it   = (struct lfsck_orphan_it *)di;
7313
7314         return it->loi_hash;
7315 }
7316
7317 /**
7318  * \retval       +1: locate to the exactly position
7319  * \retval        0: cannot locate to the exactly position,
7320  *                   call next() to move to a valid position.
7321  * \retval      -ve: on error
7322  */
7323 static int lfsck_orphan_it_load(const struct lu_env *env,
7324                                 const struct dt_it *di,
7325                                 __u64 hash)
7326 {
7327         struct lfsck_orphan_it           *it   = (struct lfsck_orphan_it *)di;
7328         struct lfsck_layout_slave_target *llst = it->loi_llst;
7329         int                               rc;
7330
7331         LASSERT(llst != NULL);
7332
7333         if (hash != llst->llst_hash) {
7334                 CDEBUG(D_LFSCK, "%s: the given hash %llu for orphan "
7335                        "iteration does not match the one when fini "
7336                        "%llu, to be reset.\n",
7337                        lfsck_lfsck2name(it->loi_com->lc_lfsck), hash,
7338                        llst->llst_hash);
7339                 fid_zero(&llst->llst_fid);
7340                 llst->llst_hash = 0;
7341         }
7342
7343         it->loi_key = llst->llst_fid;
7344         it->loi_hash = llst->llst_hash;
7345         rc = lfsck_orphan_it_next(env, (struct dt_it *)di);
7346         if (rc == 1)
7347                 return 0;
7348
7349         if (rc == 0)
7350                 return 1;
7351
7352         return rc;
7353 }
7354
/**
 * key_rec handler of the orphan iterator: does nothing.
 *
 * \retval              0 always; \a key_rec is not touched
 */
static int lfsck_orphan_it_key_rec(const struct lu_env *env,
                                   const struct dt_it *di, void *key_rec)
{
        /* All arguments are ignored. */
        return 0;
}
7361
7362 const struct dt_index_operations lfsck_orphan_index_ops = {
7363         .dio_lookup             = lfsck_orphan_index_lookup,
7364         .dio_declare_insert     = lfsck_orphan_index_declare_insert,
7365         .dio_insert             = lfsck_orphan_index_insert,
7366         .dio_declare_delete     = lfsck_orphan_index_declare_delete,
7367         .dio_delete             = lfsck_orphan_index_delete,
7368         .dio_it = {
7369                 .init           = lfsck_orphan_it_init,
7370                 .fini           = lfsck_orphan_it_fini,
7371                 .get            = lfsck_orphan_it_get,
7372                 .put            = lfsck_orphan_it_put,
7373                 .next           = lfsck_orphan_it_next,
7374                 .key            = lfsck_orphan_it_key,
7375                 .key_size       = lfsck_orphan_it_key_size,
7376                 .rec            = lfsck_orphan_it_rec,
7377                 .store          = lfsck_orphan_it_store,
7378                 .load           = lfsck_orphan_it_load,
7379                 .key_rec        = lfsck_orphan_it_key_rec,
7380         }
7381 };