Whamcloud - gitweb
LU-3336 lfsck: recreate the lost MDT-object 11/7811/20
authorFan Yong <fan.yong@intel.com>
Wed, 12 Feb 2014 19:35:39 +0000 (03:35 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 27 Feb 2014 15:29:15 +0000 (15:29 +0000)
If the MDT-object does not exist but the related OST-object is
still alive, then when handling the orphan OST-object, the LFSCK
will re-create the MDT-object under /.lustre/lost+found/MDTxxxx
with the given OST-object's stripe and owner information. The
rest of the MDT-object's layout EA will be left empty, and may be
filled in by other orphan OST-object(s) as the LFSCK proceeds.

The re-created MDT-object will be named
${MOUNT_POINT}/.lustre/lost+found/MDTxxxx/N-${FID}, where "FID"
is the MDT-object's FID. If the OST-object has a zero PFID (for
some reason), then the LFSCK will assign a new FID to the parent.

If the MDT-object exists but the related layout EA slot is occupied
by another OST-object, that OST-object may or may not have been
created by a former LFSCK run when repairing a dangling reference
from the MDT-object.

For the latter case, the LFSCK will create a new MDT-object with
the given OST-object under /.lustre/lost+found/MDTxxxx with name
"C-${FID1}-${FID2}-${idx}":
FID1: the new MDT-object's fid.
FID2: the original MDT-object's fid.
idx: index in the layout EA.

In the former case, the former LFSCK's guess was wrong. If the
newly created OST-object has never been modified, which is
indicated by lu_attr::la_ctime being zero and lu_attr::la_mode
having no "S_ISUID" bit set, then the LFSCK will destroy it and
replace it with the orphan OST-object; otherwise the LFSCK will
keep the new data and create a new MDT-object to reference the
orphan, as described above, with name "C-${FID1}-${FID2}-${idx}".

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Ia7700f7b2e4e09d6b576d025295fdb5c10ca90e0
Reviewed-on: http://review.whamcloud.com/7811
Tested-by: Jenkins
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre/lustre_idl.h
lustre/include/lustre_lfsck.h
lustre/include/obd_support.h
lustre/lfsck/lfsck_layout.c
lustre/lfsck/lfsck_lib.c
lustre/lod/lod_object.c
lustre/ofd/ofd_io.c
lustre/tests/sanity-lfsck.sh

index 0b7ed0f..cff332c 100644 (file)
@@ -3496,8 +3496,6 @@ struct lfsck_request {
        __u16           lr_param;
        __u16           lr_async_windows;
        __u32           lr_padding_1;
-       /* lr_fid is used on server-side only, and can be
-        * reused as others by client in the future. */
        struct lu_fid   lr_fid;
        __u64           lr_padding_2;
        __u64           lr_padding_3;
index 1209bc2..b23540e 100644 (file)
@@ -126,6 +126,7 @@ enum lfsck_events {
        LE_QUERY                = 7,
        LE_FID_ACCESSED         = 8,
        LE_PEER_EXIT            = 9,
+       LE_CONDITIONAL_DESTROY  = 10,
 };
 
 enum lfsck_event_flags {
index 6f4cbc5..6e1cd54 100644 (file)
@@ -507,6 +507,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_LFSCK_BAD_OWNER       0x1613
 #define OBD_FAIL_LFSCK_MULTIPLE_REF    0x1614
 #define OBD_FAIL_LFSCK_LOST_STRIPE     0x1615
+#define OBD_FAIL_LFSCK_LOST_MDTOBJ     0x1616
+#define OBD_FAIL_LFSCK_NOPFID          0x1617
+#define OBD_FAIL_LFSCK_CHANGE_STRIPE   0x1618
 
 #define OBD_FAIL_LFSCK_NOTIFY_NET      0x16f0
 #define OBD_FAIL_LFSCK_QUERY_NET       0x16f1
index 2fe033e..01e3151 100644 (file)
@@ -1828,11 +1828,355 @@ static int lfsck_layout_recreate_parent(const struct lu_env *env,
                                        const char *postfix,
                                        __u32 ea_off)
 {
-       /* XXX: To be extended in other patch. */
-       return 0;
+       struct lfsck_thread_info        *info   = lfsck_env_info(env);
+       char                            *name   = info->lti_key;
+       struct lu_attr                  *la     = &info->lti_la;
+       struct dt_object_format         *dof    = &info->lti_dof;
+       struct lfsck_instance           *lfsck  = com->lc_lfsck;
+       struct lu_fid                   *pfid   = &rec->lor_fid;
+       struct lu_fid                   *tfid   = &info->lti_fid3;
+       struct dt_device                *next   = lfsck->li_next;
+       struct dt_object                *pobj   = NULL;
+       struct dt_object                *cobj   = NULL;
+       struct thandle                  *th     = NULL;
+       struct lu_buf                   *pbuf   = NULL;
+       struct lu_buf                   *ea_buf = &info->lti_big_buf;
+       int                              buflen = ea_buf->lb_len;
+       int                              rc     = 0;
+       ENTRY;
+
+       /* Create .lustre/lost+found/MDTxxxx when needed. */
+       if (unlikely(lfsck->li_lpf_obj == NULL)) {
+               rc = lfsck_create_lpf(env, lfsck);
+               if (rc != 0)
+                       RETURN(rc);
+       }
+
+       if (fid_is_zero(pfid)) {
+               struct filter_fid *ff = &info->lti_new_pfid;
+
+               rc = lfsck_fid_alloc(env, lfsck, pfid, false);
+               if (rc != 0)
+                       RETURN(rc);
+
+               ff->ff_parent.f_seq = cpu_to_le64(pfid->f_seq);
+               ff->ff_parent.f_oid = cpu_to_le32(pfid->f_oid);
+               /* In fact, the ff_parent::f_ver is not the real parent FID::f_ver,
+                * instead, it is the OST-object index in its parent MDT-object
+                * layout EA. */
+               ff->ff_parent.f_ver = cpu_to_le32(ea_off);
+               pbuf = lfsck_buf_get(env, ff, sizeof(struct filter_fid));
+               cobj = lfsck_object_find_by_dev(env, ltd->ltd_tgt, cfid);
+               if (IS_ERR(cobj))
+                       RETURN(PTR_ERR(cobj));
+       }
+
+       CDEBUG(D_LFSCK, "Re-create the lost MDT-object: parent "
+              DFID", child "DFID", OST-index %u, stripe-index %u, "
+              "prefix %s, postfix %s\n",
+              PFID(pfid), PFID(cfid), ltd->ltd_index, ea_off, prefix, postfix);
+
+       pobj = lfsck_object_find_by_dev(env, lfsck->li_bottom, pfid);
+       if (IS_ERR(pobj))
+               GOTO(put, rc = PTR_ERR(pobj));
+
+       LASSERT(prefix != NULL);
+       LASSERT(postfix != NULL);
+
+       /** name rules:
+        *
+        *  1. Use the MDT-object's FID as the name with prefix and postfix.
+        *
+        *  1.1 prefix "C-":    More than one OST-object claims the same
+        *                      MDT-object and the same slot in the layout EA.
+        *                      It may be created for dangling referenced MDT
+        *                      object or may be not.
+        *  1.2 prefix "N-":    The orphan OST-object does not know which one
+        *                      is the real parent, so the LFSCK assign a new
+        *                      FID as its parent.
+        *  1.3 prefix "R-":    The orphan OST-object know its parent FID but
+        *                      does not know the position in the namespace.
+        *
+        *  2. If there is name conflict, increase FID::f_ver for new name. */
+       sprintf(name, "%s"DFID"%s", prefix, PFID(pfid), postfix);
+       do {
+               rc = dt_lookup(env, lfsck->li_lpf_obj, (struct dt_rec *)tfid,
+                              (const struct dt_key *)name, BYPASS_CAPA);
+               if (rc != 0 && rc != -ENOENT)
+                       GOTO(put, rc);
+
+               if (unlikely(rc == 0)) {
+                       CWARN("%s: The name %s under lost+found has been used "
+                             "by the "DFID". Try to increase the FID version "
+                             "for the new file name.\n",
+                             lfsck_lfsck2name(lfsck), name, PFID(tfid));
+                       *tfid = *pfid;
+                       tfid->f_ver++;
+                       sprintf(name, "%s"DFID"%s", prefix, PFID(tfid), postfix);
+               }
+       } while (rc == 0);
+
+       memset(la, 0, sizeof(*la));
+       la->la_uid = rec->lor_uid;
+       la->la_gid = rec->lor_gid;
+       la->la_mode = S_IFREG | S_IRUSR | S_IWUSR;
+       la->la_valid = LA_MODE | LA_UID | LA_GID;
+
+       memset(dof, 0, sizeof(*dof));
+       dof->dof_type = dt_mode_to_dft(S_IFREG);
+
+       rc = lov_mds_md_size(ea_off + 1, LOV_MAGIC_V1);
+       if (buflen < rc) {
+               lu_buf_realloc(ea_buf, rc);
+               buflen = ea_buf->lb_len;
+               if (ea_buf->lb_buf == NULL)
+                       GOTO(put, rc = -ENOMEM);
+       } else {
+               ea_buf->lb_len = rc;
+       }
+
+       th = dt_trans_create(env, next);
+       if (IS_ERR(th))
+               GOTO(put, rc = PTR_ERR(th));
+
+       /* 1a. Update OST-object's parent information remotely.
+        *
+        * If other subsequent modifications failed, then next LFSCK scanning
+        * will process the OST-object as orphan again with known parent FID. */
+       if (cobj != NULL) {
+               rc = dt_declare_xattr_set(env, cobj, pbuf, XATTR_NAME_FID, 0, th);
+               if (rc != 0)
+                       GOTO(stop, rc);
+       }
+
+       /* 2a. Create the MDT-object locally. */
+       rc = dt_declare_create(env, pobj, la, NULL, dof, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       /* 3a. Add layout EA for the MDT-object. */
+       rc = dt_declare_xattr_set(env, pobj, ea_buf, XATTR_NAME_LOV,
+                                 LU_XATTR_CREATE, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       /* 4a. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */
+       rc = dt_declare_insert(env, lfsck->li_lpf_obj,
+                              (const struct dt_rec *)pfid,
+                              (const struct dt_key *)name, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       rc = dt_trans_start(env, next, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       /* 1b. Update OST-object's parent information remotely. */
+       if (cobj != NULL) {
+               rc = dt_xattr_set(env, cobj, pbuf, XATTR_NAME_FID, 0, th,
+                                 BYPASS_CAPA);
+               if (rc != 0)
+                       GOTO(stop, rc);
+       }
+
+       dt_write_lock(env, pobj, 0);
+       /* 2b. Create the MDT-object locally. */
+       rc = dt_create(env, pobj, la, NULL, dof, th);
+       if (rc == 0)
+               /* 3b. Add layout EA for the MDT-object. */
+               rc = lfsck_layout_extend_lovea(env, th, pobj, cfid, ea_buf,
+                                              LU_XATTR_CREATE, ltd->ltd_index,
+                                              ea_off);
+       dt_write_unlock(env, pobj);
+       if (rc < 0)
+               GOTO(stop, rc);
+
+       /* 4b. Insert the MDT-object to .lustre/lost+found/MDTxxxx/ */
+       rc = dt_insert(env, lfsck->li_lpf_obj,
+                      (const struct dt_rec *)pfid,
+                      (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+
+       GOTO(stop, rc);
+
+stop:
+       dt_trans_stop(env, next, th);
+put:
+       if (cobj != NULL && !IS_ERR(cobj))
+               lu_object_put(env, &cobj->do_lu);
+       if (pobj != NULL && !IS_ERR(pobj))
+               lu_object_put(env, &pobj->do_lu);
+       ea_buf->lb_len = buflen;
+
+       return rc >= 0 ? 1 : rc;
+}
+
+static int lfsck_layout_master_conditional_destroy(const struct lu_env *env,
+                                                  struct lfsck_component *com,
+                                                  const struct lu_fid *fid,
+                                                  __u32 index)
+{      /* Master side of the "conditional destroy" request: build an
+        * LFSCK_NOTIFY RPC whose lfsck_request carries the event
+        * LE_CONDITIONAL_DESTROY, lr_active = LT_LAYOUT and lr_fid = *fid,
+        * send it through the export of the OST target descriptor found
+        * at \a index in lfsck->li_ost_descs, and wait for completion.
+        *
+        * \param[in] env       execution environment
+        * \param[in] com       the layout LFSCK component
+        * \param[in] fid       FID of the OST-object to be destroyed
+        * \param[in] index     index of the OST target to be notified
+        *
+        * \retval -ENODEV      no target descriptor for \a index
+        * \retval -EOPNOTSUPP  the export lacks OBD_CONNECT_LFSCK
+        * \retval -ENOMEM      the request cannot be allocated
+        * \retval other        result of ptlrpc_request_pack() on failure,
+        *                      otherwise the value of ptlrpc_queue_wait();
+        *                      presumably that carries the OST-side result
+        *                      (e.g. -ETXTBSY, -ENOENT) -- TODO confirm.
+        */
+       struct lfsck_thread_info *info  = lfsck_env_info(env);
+       struct lfsck_request     *lr    = &info->lti_lr;
+       struct lfsck_instance    *lfsck = com->lc_lfsck;
+       struct lfsck_tgt_desc    *ltd;
+       struct ptlrpc_request    *req;
+       struct lfsck_request     *tmp;
+       struct obd_export        *exp;
+       int                       rc    = 0;
+       ENTRY;
+
+       ltd = lfsck_tgt_get(&lfsck->li_ost_descs, index);
+       if (unlikely(ltd == NULL))
+               RETURN(-ENODEV);
+
+       exp = ltd->ltd_exp;
+       if (!(exp_connect_flags(exp) & OBD_CONNECT_LFSCK))
+               GOTO(put, rc = -EOPNOTSUPP);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LFSCK_NOTIFY);
+       if (req == NULL)
+               GOTO(put, rc = -ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OBD_VERSION, LFSCK_NOTIFY);
+       if (rc != 0) {
+               ptlrpc_request_free(req); /* allocated but not packed */
+
+               GOTO(put, rc);
+       }
+
+       memset(lr, 0, sizeof(*lr));
+       lr->lr_event = LE_CONDITIONAL_DESTROY;
+       lr->lr_active = LT_LAYOUT;
+       lr->lr_fid = *fid;
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_LFSCK_REQUEST);
+       *tmp = *lr; /* copy the per-thread scratch request into the RPC */
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       ptlrpc_req_finished(req);
+
+       GOTO(put, rc);
+
+put:
+       lfsck_tgt_put(ltd); /* pairs with lfsck_tgt_get() above */
+
+       return rc;
+}
+
+static int lfsck_layout_slave_conditional_destroy(const struct lu_env *env,
+                                                 struct lfsck_component *com,
+                                                 struct lfsck_request *lr)
+{      /* Slave side handler for LE_CONDITIONAL_DESTROY: destroy the
+        * object lr->lr_fid on lfsck->li_bottom, but only if its attributes
+        * show la_ctime == 0 and no S_ISUID bit in la_mode.
+        *
+        * Sequence:
+        * 1) find the object and pre-check its attributes under the
+        *    object read lock;
+        * 2) take a local LDLM LCK_EX extent lock on [0, OBD_OBJECT_EOF];
+        * 3) in a local transaction, under the object write lock, read
+        *    the attributes again, then dt_ref_del() and dt_destroy().
+        * Resources are released in reverse order through the labels
+        * unlock2 -> stop -> unlock1 -> put.
+        *
+        * \param[in] env       execution environment
+        * \param[in] com       the layout LFSCK component
+        * \param[in] lr        the received request; lr_fid names the object
+        *
+        * \retval 0            the object has been destroyed
+        * \retval -ENOENT      the object does not exist
+        * \retval -ETXTBSY     la_ctime != 0 or S_ISUID set: not destroyed
+        * \retval -EIO         the extent lock enqueue did not return
+        *                      ELDLM_OK (any failure is mapped to -EIO)
+        * \retval other -ve    failure from lookup, attr_get or transaction
+        */
+       struct lfsck_thread_info        *info   = lfsck_env_info(env);
+       struct lu_attr                  *la     = &info->lti_la;
+       ldlm_policy_data_t              *policy = &info->lti_policy;
+       struct ldlm_res_id              *resid  = &info->lti_resid;
+       struct lfsck_instance           *lfsck  = com->lc_lfsck;
+       struct dt_device                *dev    = lfsck->li_bottom;
+       struct lu_fid                   *fid    = &lr->lr_fid;
+       struct dt_object                *obj;
+       struct thandle                  *th     = NULL;
+       struct lustre_handle             lh     = { 0 };
+       __u64                            flags  = 0;
+       int                              rc     = 0;
+       ENTRY;
+
+       obj = lfsck_object_find_by_dev(env, dev, fid);
+       if (IS_ERR(obj))
+               RETURN(PTR_ERR(obj));
+
+       dt_read_lock(env, obj, 0);
+       if (dt_object_exists(obj) == 0) {
+               dt_read_unlock(env, obj);
+
+               GOTO(put, rc = -ENOENT);
+       }
+
+       /* Get obj's attr firstly, holding only the object read lock: the
+        * LDLM extent lock and the transaction are not taken yet, so the
+        * common "already modified" case is rejected before them. */
+       rc = dt_attr_get(env, obj, la, BYPASS_CAPA);
+       dt_read_unlock(env, obj);
+       if (rc != 0)
+               GOTO(put, rc);
+
+       if (likely(la->la_ctime != 0 || la->la_mode & S_ISUID))
+               GOTO(put, rc = -ETXTBSY);
+
+       /* Acquire an EX extent lock on [0, EOF] to synchronize with all
+        * possible writers of this object. */
+       LASSERT(lfsck->li_namespace != NULL);
+
+       memset(policy, 0, sizeof(*policy));
+       policy->l_extent.end = OBD_OBJECT_EOF;
+       ost_fid_build_resid(fid, resid);
+       rc = ldlm_cli_enqueue_local(lfsck->li_namespace, resid, LDLM_EXTENT,
+                                   policy, LCK_EX, &flags, ldlm_blocking_ast,
+                                   ldlm_completion_ast, NULL, NULL, 0,
+                                   LVB_T_NONE, NULL, &lh);
+       if (rc != ELDLM_OK)
+               GOTO(put, rc = -EIO);
+
+       th = dt_trans_create(env, dev);
+       if (IS_ERR(th))
+               GOTO(unlock1, rc = PTR_ERR(th));
+
+       rc = dt_declare_ref_del(env, obj, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       rc = dt_declare_destroy(env, obj, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       rc = dt_trans_start_local(env, dev, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       dt_write_lock(env, obj, 0);
+       /* Get obj's attr again, now within the object write lock, the
+        * extent lock and the started transaction.
+        *
+        * NOTE(review): only la_ctime is re-tested below; the S_ISUID bit
+        * tested by the first check is not. Confirm this is intentional. */
+       rc = dt_attr_get(env, obj, la, BYPASS_CAPA);
+       if (rc != 0)
+               GOTO(unlock2, rc);
+
+       if (la->la_ctime != 0)
+               GOTO(unlock2, rc = -ETXTBSY);
+
+       rc = dt_ref_del(env, obj, th);
+       if (rc != 0)
+               GOTO(unlock2, rc);
+
+       rc = dt_destroy(env, obj, th);
+       if (rc == 0)
+               CDEBUG(D_LFSCK, "Destroy the empty OST-object "DFID" which "
+                      "was created for reparing dangling referenced case. "
+                      "But the original missed OST-object is found now.\n",
+                      PFID(fid));
+
+       GOTO(unlock2, rc);
+
+unlock2:
+       dt_write_unlock(env, obj);
+
+stop:
+       dt_trans_stop(env, dev, th);
+
+unlock1:
+       ldlm_lock_decref(&lh, LCK_EX);
+
+put:
+       lu_object_put(env, &obj->do_lu);
+
+       return rc;
 }
 
 /**
+ * Some OST-object has occupied the specified layout EA slot.
+ * Such an OST-object may have been generated by the LFSCK when
+ * repairing a dangling reference from the MDT-object, which is
+ * indicated by attr::la_ctime == 0 and no S_ISUID in la_mode.
+ * If so, and such OST-object has not been modified yet, we
+ * will replace it with the orphan OST-object; otherwise the
+ * LFSCK will create a new MDT-object to reference the orphan.
+ *
  * \retval      +1: repaired
  * \retval       0: did nothing
  * \retval     -ve: on error
@@ -1847,8 +2191,87 @@ static int lfsck_layout_conflict_create(const struct lu_env *env,
                                        struct lov_ost_data_v1 *slot,
                                        __u32 ea_off, __u32 ori_len)
 {
-       /* XXX: To be extended in other patch. */
-       return 0;
+       struct lfsck_thread_info *info          = lfsck_env_info(env);
+       struct lu_fid            *cfid2         = &info->lti_fid2;
+       struct ost_id            *oi            = &info->lti_oi;
+       struct lov_mds_md_v1     *lmm           = ea_buf->lb_buf;
+       struct dt_device         *dev           = com->lc_lfsck->li_bottom;
+       struct thandle           *th            = NULL;
+       struct lustre_handle      lh            = { 0 };
+       char                      postfix[64];
+       __u32                     ost_idx2      = le32_to_cpu(slot->l_ost_idx);
+       int                       rc            = 0;
+       ENTRY;
+
+       ostid_le_to_cpu(&slot->l_ost_oi, oi);
+       ostid_to_fid(cfid2, oi, ost_idx2);
+
+       CDEBUG(D_LFSCK, "Handle layout EA conflict: parent "DFID
+              ", cur-child "DFID" on the OST %u, orphan-child "
+              DFID" on the OST %u, stripe-index %u\n",
+              PFID(lfsck_dto2fid(parent)), PFID(cfid2), ost_idx2,
+              PFID(cfid), ltd->ltd_index, ea_off);
+
+       /* Hold layout lock on the parent to prevent others to access. */
+       rc = lfsck_layout_lock(env, com, parent, &lh,
+                              MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       rc = lfsck_layout_master_conditional_destroy(env, com, cfid2, ost_idx2);
+
+       /* If the conflict OST-object is not created for fixing dangling
+        * referenced MDT-object in former LFSCK check/repair, or it has
+        * been modified by others, then we cannot destroy it. Re-create
+        * a new MDT-object for the orphan OST-object. */
+       if (rc == -ETXTBSY) {
+               /* No need the layout lock on the original parent. */
+               lfsck_layout_unlock(&lh);
+               ea_buf->lb_len = ori_len;
+
+               fid_zero(&rec->lor_fid);
+               snprintf(postfix, 64, "-"DFID"-%x",
+                        PFID(lu_object_fid(&parent->do_lu)), ea_off);
+               rc = lfsck_layout_recreate_parent(env, com, ltd, rec, cfid,
+                                                 "C-", postfix, ea_off);
+
+               RETURN(rc);
+       }
+
+       if (rc != 0 && rc != -ENOENT)
+               GOTO(unlock, rc);
+
+       th = dt_trans_create(env, dev);
+       if (IS_ERR(th))
+               GOTO(unlock, rc = PTR_ERR(th));
+
+       rc = dt_declare_xattr_set(env, parent, ea_buf, XATTR_NAME_LOV,
+                                 LU_XATTR_REPLACE, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       rc = dt_trans_start_local(env, dev, th);
+       if (rc != 0)
+               GOTO(stop, rc);
+
+       dt_write_lock(env, parent, 0);
+       lmm->lmm_layout_gen = cpu_to_le16(le16_to_cpu(lmm->lmm_layout_gen) + 1);
+       rc = lfsck_layout_refill_lovea(env, th, parent, cfid, ea_buf, slot,
+                                      LU_XATTR_REPLACE, ltd->ltd_index);
+       dt_write_unlock(env, parent);
+
+       GOTO(stop, rc);
+
+stop:
+       dt_trans_stop(env, dev, th);
+
+unlock:
+       lfsck_layout_unlock(&lh);
+
+out:
+       ea_buf->lb_len = ori_len;
+
+       return rc >= 0 ? 1 : rc;
 }
 
 /**
@@ -2240,6 +2663,18 @@ static int lfsck_layout_scan_orphan(const struct lu_env *env,
                struct dt_key           *key;
                struct lu_orphan_rec    *rec = &info->lti_rec;
 
+               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DELAY3) &&
+                   cfs_fail_val > 0) {
+                       struct ptlrpc_thread    *thread = &lfsck->li_thread;
+                       struct l_wait_info       lwi;
+
+                       lwi = LWI_TIMEOUT(cfs_time_seconds(cfs_fail_val),
+                                         NULL, NULL);
+                       l_wait_event(thread->t_ctl_waitq,
+                                    !thread_is_running(thread),
+                                    &lwi);
+               }
+
                key = iops->key(env, di);
                com->lc_fid_latest_scanned_phase2 = *(struct lu_fid *)key;
                rc = iops->rec(env, di, (struct dt_rec *)rec, 0);
@@ -4674,6 +5109,14 @@ static int lfsck_layout_slave_in_notify(const struct lu_env *env,
                RETURN(0);
        }
 
+       if (lr->lr_event == LE_CONDITIONAL_DESTROY) {
+               int rc;
+
+               rc = lfsck_layout_slave_conditional_destroy(env, com, lr);
+
+               RETURN(rc);
+       }
+
        if (lr->lr_event != LE_PHASE2_DONE && lr->lr_event != LE_PEER_EXIT)
                RETURN(-EINVAL);
 
index 6ea74b8..4f05c19 100644 (file)
@@ -2338,7 +2338,8 @@ int lfsck_in_notify(const struct lu_env *env, struct dt_device *key,
        case LE_PHASE1_DONE:
        case LE_PHASE2_DONE:
        case LE_FID_ACCESSED:
-       case LE_PEER_EXIT: {
+       case LE_PEER_EXIT:
+       case LE_CONDITIONAL_DESTROY: {
                struct lfsck_instance  *lfsck;
                struct lfsck_component *com;
 
index e4583d8..ef9781d 100644 (file)
@@ -364,6 +364,18 @@ static int lod_declare_attr_set(const struct lu_env *env,
            dt_object_remote(next) == 0)
                dt_declare_xattr_del(env, next, XATTR_NAME_LOV, handle);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) &&
+           dt_object_exists(next) &&
+           dt_object_remote(next) == 0 && S_ISREG(attr->la_mode)) {
+               struct lod_thread_info *info = lod_env_info(env);
+               struct lu_buf *buf = &info->lti_buf;
+
+               buf->lb_buf = info->lti_ea_store;
+               buf->lb_len = info->lti_ea_store_size;
+               dt_declare_xattr_set(env, next, buf, XATTR_NAME_LOV,
+                                    LU_XATTR_REPLACE, handle);
+       }
+
        RETURN(rc);
 }
 
@@ -441,6 +453,39 @@ static int lod_attr_set(const struct lu_env *env,
            dt_object_remote(next) == 0)
                dt_xattr_del(env, next, XATTR_NAME_LOV, handle, BYPASS_CAPA);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_CHANGE_STRIPE) &&
+           dt_object_exists(next) &&
+           dt_object_remote(next) == 0 && S_ISREG(attr->la_mode)) {
+               struct lod_thread_info *info = lod_env_info(env);
+               struct lu_buf *buf = &info->lti_buf;
+               struct ost_id *oi = &info->lti_ostid;
+               struct lu_fid *fid = &info->lti_fid;
+               struct lov_mds_md_v1 *lmm;
+               struct lov_ost_data_v1 *objs;
+               __u32 magic;
+               int rc1;
+
+               rc1 = lod_get_lov_ea(env, lo);
+               if (rc1  <= 0)
+                       RETURN(rc);
+
+               buf->lb_buf = info->lti_ea_store;
+               buf->lb_len = info->lti_ea_store_size;
+               lmm = info->lti_ea_store;
+               magic = le32_to_cpu(lmm->lmm_magic);
+               if (magic == LOV_MAGIC_V1)
+                       objs = &(lmm->lmm_objects[0]);
+               else
+                       objs = &((struct lov_mds_md_v3 *)lmm)->lmm_objects[0];
+               ostid_le_to_cpu(&objs->l_ost_oi, oi);
+               ostid_to_fid(fid, oi, le32_to_cpu(objs->l_ost_idx));
+               fid->f_oid--;
+               fid_to_ostid(fid, oi);
+               ostid_cpu_to_le(oi, &objs->l_ost_oi);
+               dt_xattr_set(env, next, buf, XATTR_NAME_LOV,
+                            LU_XATTR_REPLACE, handle, BYPASS_CAPA);
+       }
+
        RETURN(rc);
 }
 
@@ -1916,6 +1961,9 @@ static int lod_declare_object_destroy(const struct lu_env *env,
        if (rc)
                RETURN(rc);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ))
+               RETURN(0);
+
        /*
         * load striping information, notice we don't do this when object
         * is being initialized as we don't need this information till
@@ -1950,6 +1998,9 @@ static int lod_object_destroy(const struct lu_env *env,
        if (rc)
                RETURN(rc);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ))
+               RETURN(0);
+
        /* destroy all underlying objects */
        for (i = 0; i < lo->ldo_stripenr; i++) {
                LASSERT(lo->ldo_stripe[i]);
index b524054..7d33fc1 100644 (file)
@@ -384,6 +384,9 @@ ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd,
 
        /* set filter fid EA */
        if (ff_needed) {
+               if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
+                       GOTO(out_tx, rc);
+
                rc = dt_xattr_set(env, dt_obj, &info->fti_buf, XATTR_NAME_FID,
                                  0, th, BYPASS_CAPA);
                if (rc)
index e75e49d..f980246 100644 (file)
@@ -1675,6 +1675,418 @@ test_18a() {
 }
 run_test 18a "Find out orphan OST-object and repair it (1)"
 
+test_18b() {
+       [ $MDSCOUNT -lt 2 ] &&
+               skip "We need at least 2 MDSes for test_18b" && exit 0
+
+       [ $OSTCOUNT -lt 2 ] &&
+               skip "We need at least 2 OSTs for test_18b" && exit 0
+
+       echo "#####"
+       echo "The target MDT-object is lost. The LFSCK should re-create the"
+       echo "MDT-object under .lustre/lost+found/MDTxxxx. The admin should"
+       echo "can move it back to normal namespace manually."
+       echo "#####"
+
+       echo "stopall"
+       stopall > /dev/null
+       echo "formatall"
+       formatall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       mkdir -p $DIR/$tdir
+       $LFS mkdir -i 0 $DIR/$tdir/a1
+       $LFS mkdir -i 1 $DIR/$tdir/a2
+       $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
+       $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
+       dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
+       dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
+       local saved_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
+       local fid1=$($LFS path2fid $DIR/$tdir/a1/f1)
+       echo ${fid1}
+       $LFS getstripe $DIR/$tdir/a1/f1
+       local fid2=$($LFS path2fid $DIR/$tdir/a2/f2)
+       echo ${fid2}
+       $LFS getstripe $DIR/$tdir/a2/f2
+       sync
+       cancel_lru_locks osc
+
+       echo "Inject failure, to simulate the case of missing the MDT-object"
+       #define OBD_FAIL_LFSCK_LOST_MDTOBJ      0x1616
+       do_facet mds1 $LCTL set_param fail_loc=0x1616
+       rm -f $DIR/$tdir/a1/f1
+       do_facet mds2 $LCTL set_param fail_loc=0x1616
+       rm -f $DIR/$tdir/a2/f2
+       sync
+       sleep 2
+       do_facet mds1 $LCTL set_param fail_loc=0
+       do_facet mds2 $LCTL set_param fail_loc=0
+
+       echo "stopall to cleanup object cache"
+       stopall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
+       $START_LAYOUT -o || error "(1) Fail to start LFSCK for layout!"
+
+       for k in $(seq $MDSCOUNT); do
+               # The LFSCK status query internal is 30 seconds. For the case
+               # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
+               # time to guarantee the status sync up.
+               wait_update_facet mds${k} "$LCTL get_param -n \
+                       mdd.$(facet_svc mds${k}).lfsck_layout |
+                       awk '/^status/ { print \\\$2 }'" "completed" 32 ||
+                       error "(2) MDS${k} is not the expected 'completed'"
+       done
+
+       for k in $(seq $OSTCOUNT); do
+               local cur_status=$(do_facet ost${k} $LCTL get_param -n \
+                               obdfilter.$(facet_svc ost${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$cur_status" == "completed" ] ||
+               error "(3) OST${k} Expect 'completed', but got '$cur_status'"
+       done
+
+       for k in 1 2; do
+               local repaired=$(do_facet mds${k} $LCTL get_param -n \
+                                mdd.$(facet_svc mds${k}).lfsck_layout |
+                                awk '/^repaired_orphan/ { print $2 }')
+               [ $repaired -eq ${k} ] ||
+               error "(4) Expect ${k} fixed on mds${k}, but got: $repaired"
+       done
+
+       echo "Move the files from ./lustre/lost+found/MDTxxxx to namespace"
+       mv $MOUNT/.lustre/lost+found/MDT0000/R-${fid1} $DIR/$tdir/a1/f1 ||
+       error "(5) Fail to move $MOUNT/.lustre/lost+found/MDT0000/R-${fid1}"
+
+       mv $MOUNT/.lustre/lost+found/MDT0001/R-${fid2} $DIR/$tdir/a2/f2 ||
+       error "(6) Fail to move $MOUNT/.lustre/lost+found/MDT0001/R-${fid2}"
+
+       $LFS path2fid $DIR/$tdir/a1/f1
+       $LFS getstripe $DIR/$tdir/a1/f1
+       $LFS path2fid $DIR/$tdir/a2/f2
+       $LFS getstripe $DIR/$tdir/a2/f2
+
+       echo "The file size should be correct after layout LFSCK scanning"
+       local cur_size=$(ls -il $DIR/$tdir/a1/f1 | awk '{ print $6 }')
+       [ "$cur_size" == "$saved_size" ] ||
+               error "(7) Expect file1 size $saved_size, but got $cur_size"
+
+       cur_size=$(ls -il $DIR/$tdir/a2/f2 | awk '{ print $6 }')
+       [ "$cur_size" == "$saved_size" ] ||
+               error "(8) Expect file2 size $saved_size, but got $cur_size"
+}
+run_test 18b "Find out orphan OST-object and repair it (2)"
+
+test_18c() {
+       [ $MDSCOUNT -lt 2 ] &&
+               skip "We need at least 2 MDSes for test_18c" && exit 0
+
+       [ $OSTCOUNT -lt 2 ] &&
+               skip "We need at least 2 OSTs for test_18c" && exit 0
+
+       echo "#####"
+       echo "The target MDT-object is lost, and the OST-object FID is missing."
+       echo "The LFSCK should re-create the MDT-object with new FID under the "
+       echo "directory .lustre/lost+found/MDTxxxx."
+       echo "#####"
+
+       echo "stopall"
+       stopall > /dev/null
+       echo "formatall"
+       formatall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       mkdir -p $DIR/$tdir
+       $LFS mkdir -i 0 $DIR/$tdir/a1
+       $LFS mkdir -i 1 $DIR/$tdir/a2
+       $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
+       $LFS setstripe -c 2 -i 1 -s 1M $DIR/$tdir/a2
+
+       echo "Inject failure, to simulate the case of missing parent FID"
+       #define OBD_FAIL_LFSCK_NOPFID           0x1617
+       do_facet ost1 $LCTL set_param fail_loc=0x1617
+       do_facet ost2 $LCTL set_param fail_loc=0x1617
+
+       dd if=/dev/zero of=$DIR/$tdir/a1/f1 bs=1M count=2
+       dd if=/dev/zero of=$DIR/$tdir/a2/f2 bs=1M count=2
+       $LFS getstripe $DIR/$tdir/a1/f1
+       $LFS getstripe $DIR/$tdir/a2/f2
+       sync
+       cancel_lru_locks osc
+
+       echo "Inject failure, to simulate the case of missing the MDT-object"
+       #define OBD_FAIL_LFSCK_LOST_MDTOBJ      0x1616
+       do_facet mds1 $LCTL set_param fail_loc=0x1616
+       rm -f $DIR/$tdir/a1/f1
+       do_facet mds2 $LCTL set_param fail_loc=0x1616
+       rm -f $DIR/$tdir/a2/f2
+       sync
+       sleep 2
+       do_facet mds1 $LCTL set_param fail_loc=0
+       do_facet mds2 $LCTL set_param fail_loc=0
+
+       echo "stopall to cleanup object cache"
+       stopall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
+       $START_LAYOUT -o || error "(1) Fail to start LFSCK for layout!"
+
+       for k in $(seq $MDSCOUNT); do
+               # The LFSCK status query internal is 30 seconds. For the case
+               # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
+               # time to guarantee the status sync up.
+               wait_update_facet mds${k} "$LCTL get_param -n \
+                       mdd.$(facet_svc mds${k}).lfsck_layout |
+                       awk '/^status/ { print \\\$2 }'" "completed" 32 ||
+                       error "(2) MDS${k} is not the expected 'completed'"
+       done
+
+       for k in $(seq $OSTCOUNT); do
+               local cur_status=$(do_facet ost${k} $LCTL get_param -n \
+                               obdfilter.$(facet_svc ost${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$cur_status" == "completed" ] ||
+               error "(3) OST${k} Expect 'completed', but got '$cur_status'"
+       done
+
+       local repaired=$(do_facet mds1 $LCTL get_param -n \
+                        mdd.$(facet_svc mds1).lfsck_layout |
+                        awk '/^repaired_orphan/ { print $2 }')
+       [ $repaired -eq 3 ] ||
+               error "(4) Expect 3 fixed on mds1, but got: $repaired"
+
+       repaired=$(do_facet mds2 $LCTL get_param -n \
+                  mdd.$(facet_svc mds2).lfsck_layout |
+                  awk '/^repaired_orphan/ { print $2 }')
+       [ $repaired -eq 0 ] ||
+               error "(5) Expect 0 fixed on mds2, but got: $repaired"
+
+       echo "There should be some stub under .lustre/lost+found/MDT0001/"
+       ls -ail $MOUNT/.lustre/lost+found/MDT0001/N-* &&
+               error "(6) .lustre/lost+found/MDT0001/ should be empty"
+
+       echo "There should be some stub under .lustre/lost+found/MDT0000/"
+       ls -ail $MOUNT/.lustre/lost+found/MDT0000/N-* ||
+               error "(7) .lustre/lost+found/MDT0000/ should not be empty"
+}
+run_test 18c "Find out orphan OST-object and repair it (3)"
+
+test_18d() {
+       echo "#####"
+       echo "The target MDT-object layout EA slot is occpuied by some new"
+       echo "created OST-object when repair dangling reference case. Such"
+       echo "conflict OST-object has never been modified. Then when found"
+       echo "the orphan OST-object, LFSCK will replace it with the orphan"
+       echo "OST-object."
+       echo "#####"
+
+       echo "stopall"
+       stopall > /dev/null
+       echo "formatall"
+       formatall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       mkdir -p $DIR/$tdir/a1
+       $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
+       echo "guard" > $DIR/$tdir/a1/f1
+       echo "foo" > $DIR/$tdir/a1/f2
+       local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
+       $LFS path2fid $DIR/$tdir/a1/f1
+       $LFS getstripe $DIR/$tdir/a1/f1
+       $LFS path2fid $DIR/$tdir/a1/f2
+       $LFS getstripe $DIR/$tdir/a1/f2
+       sync
+       cancel_lru_locks osc
+
+       echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
+       echo "to reference the same OST-object (which is f1's OST-obejct)."
+       echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
+       echo "dangling reference case, but f2's old OST-object is there."
+       echo
+
+       #define OBD_FAIL_LFSCK_CHANGE_STRIPE    0x1618
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
+       chown 1.1 $DIR/$tdir/a1/f2
+       rm -f $DIR/$tdir/a1/f1
+       sync
+       sleep 2
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+
+       echo "stopall to cleanup object cache"
+       stopall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       echo "The file size should be incorrect since dangling referenced"
+       local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
+       [ "$cur_size" != "$saved_size" ] ||
+               error "(1) Expect incorrect file2 size"
+
+       echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
+       $START_LAYOUT -o || error "(2) Fail to start LFSCK for layout!"
+
+       for k in $(seq $MDSCOUNT); do
+               # The LFSCK status query internal is 30 seconds. For the case
+               # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
+               # time to guarantee the status sync up.
+               wait_update_facet mds${k} "$LCTL get_param -n \
+                       mdd.$(facet_svc mds${k}).lfsck_layout |
+                       awk '/^status/ { print \\\$2 }'" "completed" 32 ||
+                       error "(3) MDS${k} is not the expected 'completed'"
+       done
+
+       for k in $(seq $OSTCOUNT); do
+               local cur_status=$(do_facet ost${k} $LCTL get_param -n \
+                               obdfilter.$(facet_svc ost${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$cur_status" == "completed" ] ||
+               error "(4) OST${k} Expect 'completed', but got '$cur_status'"
+       done
+
+       local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
+                        mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
+                        awk '/^repaired_orphan/ { print $2 }')
+       [ $repaired -eq 1 ] ||
+               error "(5) Expect 1 orphan has been fixed, but got: $repaired"
+
+       echo "The file size should be correct after layout LFSCK scanning"
+       cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
+       [ "$cur_size" == "$saved_size" ] ||
+               error "(6) Expect file2 size $saved_size, but got $cur_size"
+
+       echo "There should be some stub under .lustre/lost+found/MDT0000/"
+       ls -ail $MOUNT/.lustre/lost+found/MDT0000/ &&
+               error "(7) .lustre/lost+found/MDT0000/ should be empty"
+
+       echo "The LFSCK should find back the original data."
+       cat $DIR/$tdir/a1/f2
+       $LFS path2fid $DIR/$tdir/a1/f2
+       $LFS getstripe $DIR/$tdir/a1/f2
+}
+run_test 18d "Find out orphan OST-object and repair it (4)"
+
+test_18e() {
+       echo "#####"
+       echo "The target MDT-object layout EA slot is occpuied by some new"
+       echo "created OST-object when repair dangling reference case. Such"
+       echo "conflict OST-object has been modified by others. To keep the"
+       echo "new data, the LFSCK will create a new file to refernece this"
+       echo "old orphan OST-object."
+       echo "#####"
+
+       echo "stopall"
+       stopall > /dev/null
+       echo "formatall"
+       formatall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       mkdir -p $DIR/$tdir/a1
+       $LFS setstripe -c 1 -i 0 -s 1M $DIR/$tdir/a1
+       echo "guard" > $DIR/$tdir/a1/f1
+       echo "foo" > $DIR/$tdir/a1/f2
+       local saved_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
+       $LFS path2fid $DIR/$tdir/a1/f1
+       $LFS getstripe $DIR/$tdir/a1/f1
+       $LFS path2fid $DIR/$tdir/a1/f2
+       $LFS getstripe $DIR/$tdir/a1/f2
+       sync
+       cancel_lru_locks osc
+
+       echo "Inject failure to make $DIR/$tdir/a1/f1 and $DIR/$tdir/a1/f2"
+       echo "to reference the same OST-object (which is f1's OST-obejct)."
+       echo "Then drop $DIR/$tdir/a1/f1 and its OST-object, so f2 becomes"
+       echo "dangling reference case, but f2's old OST-object is there."
+       echo
+
+       #define OBD_FAIL_LFSCK_CHANGE_STRIPE    0x1618
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1618
+       chown 1.1 $DIR/$tdir/a1/f2
+       rm -f $DIR/$tdir/a1/f1
+       sync
+       sleep 2
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+
+       echo "stopall to cleanup object cache"
+       stopall > /dev/null
+       echo "setupall"
+       setupall > /dev/null
+
+       echo "The file size should be incorrect since dangling referenced"
+       local cur_size=$(ls -il $DIR/$tdir/a1/f2 | awk '{ print $6 }')
+       [ "$cur_size" != "$saved_size" ] ||
+               error "(1) Expect incorrect file2 size"
+
+       #define OBD_FAIL_LFSCK_DELAY3           0x1602
+       do_facet $SINGLEMDS $LCTL set_param fail_val=10
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1602
+
+       echo "Trigger layout LFSCK on all devices to find out orphan OST-object"
+       $START_LAYOUT -o || error "(2) Fail to start LFSCK for layout!"
+
+       wait_update_facet mds1 "$LCTL get_param -n \
+               mdd.$(facet_svc mds1).lfsck_layout |
+               awk '/^status/ { print \\\$2 }'" "scanning-phase2" 6 ||
+               error "(3) MDS1 is not the expected 'scanning-phase2'"
+
+       echo "Write new data to f2 to modify the new created OST-object."
+       echo "dummy" >> $DIR/$tdir/a1/f2
+
+       do_facet $SINGLEMDS $LCTL set_param fail_val=0
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+
+       for k in $(seq $MDSCOUNT); do
+               # The LFSCK status query internal is 30 seconds. For the case
+               # of some LFSCK_NOTIFY RPCs failure/lost, we will wait enough
+               # time to guarantee the status sync up.
+               wait_update_facet mds${k} "$LCTL get_param -n \
+                       mdd.$(facet_svc mds${k}).lfsck_layout |
+                       awk '/^status/ { print \\\$2 }'" "completed" 32 ||
+                       error "(4) MDS${k} is not the expected 'completed'"
+       done
+
+       for k in $(seq $OSTCOUNT); do
+               local cur_status=$(do_facet ost${k} $LCTL get_param -n \
+                               obdfilter.$(facet_svc ost${k}).lfsck_layout |
+                               awk '/^status/ { print $2 }')
+               [ "$cur_status" == "completed" ] ||
+               error "(5) OST${k} Expect 'completed', but got '$cur_status'"
+       done
+
+       local repaired=$(do_facet $SINGLEMDS $LCTL get_param -n \
+                        mdd.$(facet_svc $SINGLEMDS).lfsck_layout |
+                        awk '/^repaired_orphan/ { print $2 }')
+       [ $repaired -eq 1 ] ||
+               error "(6) Expect 1 orphan has been fixed, but got: $repaired"
+
+       echo "There should be stub file under .lustre/lost+found/MDT0000/"
+       local cname=$(ls $MOUNT/.lustre/lost+found/MDT0000/C-*)
+       [ ! -z $name ] ||
+               error "(7) .lustre/lost+found/MDT0000/ should not be empty"
+
+       echo "The stub file should keep the original f2 data"
+       cur_size=$(ls -il $cname | awk '{ print $6 }')
+       [ "$cur_size" == "$saved_size" ] ||
+               error "(8) Expect file2 size $saved_size, but got $cur_size"
+
+       cat $cname
+       $LFS path2fid $cname
+       $LFS getstripe $cname
+
+       echo "The f2 should contains new data."
+       cat $DIR/$tdir/a1/f2
+       $LFS path2fid $DIR/$tdir/a1/f2
+       $LFS getstripe $DIR/$tdir/a1/f2
+}
+run_test 18e "Find out orphan OST-object and repair it (5)"
+
 $LCTL set_param debug=-lfsck > /dev/null || true
 
 # restore MDS/OST size