X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_scrub.c;h=6be982cdb39638ac647645c172c66dbc785e1efa;hb=refs%2Fchanges%2F57%2F6857%2F8;hp=bf6a8b9675e9d74531c5e203b98117c40505efdd;hpb=14325071012bfdf9723376123079ce66c79133d2;p=fs%2Flustre-release.git diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c index bf6a8b9..6be982c 100644 --- a/lustre/osd-ldiskfs/osd_scrub.c +++ b/lustre/osd-ldiskfs/osd_scrub.c @@ -35,9 +35,6 @@ * Author: Fan Yong */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_MDS #include @@ -60,6 +57,9 @@ #define SCRUB_NEXT_FATAL 6 /* simulate failure during OI scrub */ #define SCRUB_NEXT_NOSCRUB 7 /* new created object, no scrub on it */ #define SCRUB_NEXT_NOLMA 8 /* the inode has no FID-in-LMA */ +#define SCRUB_NEXT_OSTOBJ 9 /* for OST-object */ +#define SCRUB_NEXT_OSTOBJ_OLD 10 /* old OST-object, no LMA or no FID-on-OST + * flags in LMA */ /* misc functions */ @@ -89,48 +89,44 @@ static inline int osd_scrub_has_window(struct osd_scrub *scrub, static int osd_scrub_refresh_mapping(struct osd_thread_info *info, struct osd_device *dev, const struct lu_fid *fid, - const struct osd_inode_id *id, int ops) + const struct osd_inode_id *id, + int ops, enum oi_check_flags flags) { - struct lu_fid *oi_fid = &info->oti_fid2; - struct osd_inode_id *oi_id = &info->oti_id2; - struct iam_container *bag; - struct iam_path_descr *ipd; - handle_t *jh; - int rc; + const struct lu_env *env = info->oti_env; + struct thandle *th; + struct osd_thandle *oh; + int rc; ENTRY; - fid_cpu_to_be(oi_fid, fid); - osd_id_pack(oi_id, id); - jh = ldiskfs_journal_start_sb(osd_sb(dev), - osd_dto_credits_noquota[ops]); - if (IS_ERR(jh)) { - rc = PTR_ERR(jh); - CERROR("%.16s: fail to start trans for scrub store: rc = %d\n", - LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc); - RETURN(rc); - } + th = dt_trans_create(env, &dev->od_dt_dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); - bag = &osd_fid2oi(dev, fid)->oi_dir.od_container; - ipd = osd_idx_ipd_get(info->oti_env, bag); - if (unlikely(ipd == NULL)) { - ldiskfs_journal_stop(jh); - CERROR("%.16s: fail to get ipd for scrub store\n", - LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name); - RETURN(-ENOMEM); - } + oh = container_of0(th, struct osd_thandle, ot_super); + LASSERT(oh->ot_handle == NULL); switch (ops) { case DTO_INDEX_UPDATE: - rc = iam_update(jh, bag, (const struct iam_key *)oi_fid, - (struct iam_rec *)oi_id, ipd); + osd_trans_declare_op(env, oh, OSD_OT_UPDATE, + osd_dto_credits_noquota[DTO_INDEX_UPDATE]); + rc = dt_trans_start_local(env, &dev->od_dt_dev, th); + if (rc != 0) + GOTO(stop, rc); + + rc = osd_oi_update(info, dev, fid, id, th, flags); if (unlikely(rc == -ENOENT)) { /* Some unlink thread may removed the OI mapping. */ rc = 1; } break; case DTO_INDEX_INSERT: - rc = iam_insert(jh, bag, (const struct iam_key *)oi_fid, - (struct iam_rec *)oi_id, ipd); + osd_trans_declare_op(env, oh, OSD_OT_INSERT, + osd_dto_credits_noquota[DTO_INDEX_INSERT]); + rc = dt_trans_start_local(env, &dev->od_dt_dev, th); + if (rc != 0) + GOTO(stop, rc); + + rc = osd_oi_insert(info, dev, fid, id, th, flags); if (unlikely(rc == -EEXIST)) { rc = 1; /* XXX: There are trouble things when adding OI @@ -167,7 +163,13 @@ static int osd_scrub_refresh_mapping(struct osd_thread_info *info, } break; case DTO_INDEX_DELETE: - rc = iam_delete(jh, bag, (const struct iam_key *)oi_fid, ipd); + osd_trans_declare_op(env, oh, OSD_OT_DELETE, + osd_dto_credits_noquota[DTO_INDEX_DELETE]); + rc = dt_trans_start_local(env, &dev->od_dt_dev, th); + if (rc != 0) + GOTO(stop, rc); + + rc = osd_oi_delete(info, dev, fid, th, flags); if (rc == -ENOENT) { /* It is normal that the unlink thread has removed the * OI mapping already. */ @@ -178,9 +180,12 @@ static int osd_scrub_refresh_mapping(struct osd_thread_info *info, LASSERTF(0, "Unexpected ops %d\n", ops); break; } - osd_ipd_put(info->oti_env, bag, ipd); - ldiskfs_journal_stop(jh); - RETURN(rc); + + GOTO(stop, rc); + +stop: + dt_trans_stop(env, &dev->od_dt_dev, th); + return rc; } /* OI_scrub file ops */ @@ -442,6 +447,23 @@ osd_scrub_check_update(struct osd_thread_info *info, struct osd_device *dev, if (fid_is_igif(fid)) sf->sf_items_igif++; + if (val == SCRUB_NEXT_OSTOBJ_OLD) { + inode = osd_iget(info, dev, lid); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + /* Someone removed the inode. */ + if (rc == -ENOENT || rc == -ESTALE) + rc = 0; + GOTO(out, rc); + } + + sf->sf_flags |= SF_UPGRADE; + rc = osd_ea_fid_set(info, inode, fid, + LMAC_FID_ON_OST, 0); + if (rc != 0) + GOTO(out, rc); + } + if ((val == SCRUB_NEXT_NOLMA) && (!dev->od_handle_nolma || OBD_FAIL_CHECK(OBD_FAIL_FID_NOLMA))) GOTO(out, rc = 0); @@ -449,54 +471,53 @@ osd_scrub_check_update(struct osd_thread_info *info, struct osd_device *dev, if ((oii != NULL && oii->oii_insert) || (val == SCRUB_NEXT_NOLMA)) goto iget; - /* XXX: Currently, no FID-in-LMA for OST object, so osd_oi_lookup() - * without checking FLD is enough. - * - * It should be updated if FID-in-LMA for OSD object introduced - * in the future. */ - rc = osd_oi_lookup(info, dev, fid, lid2, false); + rc = osd_oi_lookup(info, dev, fid, lid2, + (val == SCRUB_NEXT_OSTOBJ || + val == SCRUB_NEXT_OSTOBJ_OLD) ? OI_KNOWN_ON_OST : 0); if (rc != 0) { if (rc != -ENOENT) GOTO(out, rc); iget: - inode = osd_iget(info, dev, lid); - if (IS_ERR(inode)) { - rc = PTR_ERR(inode); - /* Someone removed the inode. */ - if (rc == -ENOENT || rc == -ESTALE) - rc = 0; - GOTO(out, rc); - } - - /* Check whether the inode to be unlinked during OI scrub. */ - if (unlikely(inode->i_nlink == 0)) { - iput(inode); - GOTO(out, rc = 0); + if (inode == NULL) { + inode = osd_iget(info, dev, lid); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + /* Someone removed the inode. */ + if (rc == -ENOENT || rc == -ESTALE) + rc = 0; + GOTO(out, rc); + } } + scrub->os_full_speed = 1; ops = DTO_INDEX_INSERT; idx = osd_oi_fid2idx(dev, fid); - if (val == SCRUB_NEXT_NOLMA) { + switch (val) { + case SCRUB_NEXT_NOLMA: sf->sf_flags |= SF_UPGRADE; - scrub->os_full_speed = 1; - rc = osd_ea_fid_set(info, inode, fid, 0); + rc = osd_ea_fid_set(info, inode, fid, 0, 0); if (rc != 0) GOTO(out, rc); if (!(sf->sf_flags & SF_INCONSISTENT)) dev->od_igif_inoi = 0; - } else { + break; + case SCRUB_NEXT_OSTOBJ: + sf->sf_flags |= SF_INCONSISTENT; + case SCRUB_NEXT_OSTOBJ_OLD: + break; + default: sf->sf_flags |= SF_RECREATED; - scrub->os_full_speed = 1; if (unlikely(!ldiskfs_test_bit(idx, sf->sf_oi_bitmap))) ldiskfs_set_bit(idx, sf->sf_oi_bitmap); + break; } } else if (osd_id_eq(lid, lid2)) { GOTO(out, rc = 0); } else { - sf->sf_flags |= SF_INCONSISTENT; scrub->os_full_speed = 1; + sf->sf_flags |= SF_INCONSISTENT; /* XXX: If the device is restored from file-level backup, then * some IGIFs may have been already in OI files, and some @@ -513,12 +534,17 @@ iget: dev->od_igif_inoi = 1; } - rc = osd_scrub_refresh_mapping(info, dev, fid, lid, ops); + rc = osd_scrub_refresh_mapping(info, dev, fid, lid, ops, + (val == SCRUB_NEXT_OSTOBJ || + val == SCRUB_NEXT_OSTOBJ_OLD) ? OI_KNOWN_ON_OST : 0); if (rc == 0) { if (scrub->os_in_prior) sf->sf_items_updated_prior++; else sf->sf_items_updated++; + + /* The target has been changed, need to be re-loaded. */ + lu_object_purge(info->oti_env, osd2lu_dev(dev), fid); } GOTO(out, rc); @@ -533,16 +559,20 @@ out: rc = 0; } - if (ops == DTO_INDEX_INSERT) { - /* There may be conflict unlink during the OI scrub, - * if happend, then remove the new added OI mapping. */ - if (unlikely(inode->i_nlink == 0)) - osd_scrub_refresh_mapping(info, dev, fid, lid, - DTO_INDEX_DELETE); - iput(inode); - } + /* There may be conflict unlink during the OI scrub, + * if happend, then remove the new added OI mapping. */ + if (ops == DTO_INDEX_INSERT && inode != NULL && !IS_ERR(inode) && + unlikely(inode->i_nlink == 0)) + osd_scrub_refresh_mapping(info, dev, fid, lid, + DTO_INDEX_DELETE, + (val == SCRUB_NEXT_OSTOBJ || + val == SCRUB_NEXT_OSTOBJ_OLD) ? + OI_KNOWN_ON_OST : 0); up_write(&scrub->os_rwsem); + if (inode != NULL && !IS_ERR(inode)) + iput(inode); + if (oii != NULL) { LASSERT(!cfs_list_empty(&oii->oii_list)); @@ -658,13 +688,120 @@ static int osd_iit_next(struct osd_iit_param *param, __u32 *pos) } } +/** + * \retval SCRUB_NEXT_OSTOBJ_OLD: FID-on-OST + * \retval 0: FID-on-MDT + */ +static int osd_scrub_check_local_fldb(struct osd_thread_info *info, + struct osd_device *dev, + struct lu_fid *fid) +{ + /* XXX: The initial OI scrub will scan the top level /O to generate + * a small local FLDB according to the . If the given FID + * is in the local FLDB, then it is FID-on-OST; otherwise it's + * quite possible for FID-on-MDT. */ + return 0; +} + +static int osd_scrub_get_fid(struct osd_thread_info *info, + struct osd_device *dev, struct inode *inode, + struct lu_fid *fid, bool scrub) +{ + struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; + int rc; + bool has_lma = false; + + rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); + if (rc == 0) { + has_lma = true; + if (lma->lma_compat & LMAC_NOT_IN_OI) { + ldiskfs_set_inode_state(inode, + LDISKFS_STATE_LUSTRE_NO_OI); + return SCRUB_NEXT_CONTINUE; + } + + *fid = lma->lma_self_fid; + if (fid_is_internal(&lma->lma_self_fid)) { + if (!scrub) + rc = SCRUB_NEXT_CONTINUE; + return rc; + } + + if (!scrub) + return 0; + + if (fid_is_namespace_visible(fid) && !fid_is_norm(fid)) + return 0; + + if (lma->lma_compat & LMAC_FID_ON_OST) + return SCRUB_NEXT_OSTOBJ; + + if (fid_is_idif(fid) || fid_is_last_id(fid)) + return SCRUB_NEXT_OSTOBJ_OLD; + + if (lma->lma_incompat & LMAI_AGENT) + return SCRUB_NEXT_CONTINUE; + + /* Here, it may be MDT-object, or may be 2.4 OST-object. + * Fall through. */ + } + + if (rc == -ENODATA || rc == 0) { + rc = osd_get_idif(info, inode, &info->oti_obj_dentry, fid); + if (rc == 0) { + if (scrub) + /* It is old 2.x (x <= 3) or 1.8 OST-object. */ + rc = SCRUB_NEXT_OSTOBJ_OLD; + return rc; + } + + if (rc > 0) { + if (!has_lma) + /* It is FID-on-OST, but we do not know how + * to generate its FID, ignore it directly. */ + rc = SCRUB_NEXT_CONTINUE; + else + /* It is 2.4 OST-object. */ + rc = SCRUB_NEXT_OSTOBJ_OLD; + return rc; + } + + if (rc != -ENODATA) + return rc; + + if (!has_lma) { + if (dev->od_handle_nolma) { + lu_igif_build(fid, inode->i_ino, + inode->i_generation); + if (scrub) + rc = SCRUB_NEXT_NOLMA; + else + rc = 0; + } else { + /* It may be FID-on-OST, or may be FID for + * non-MDT0, anyway, we do not know how to + * generate its FID, ignore it directly. */ + rc = SCRUB_NEXT_CONTINUE; + } + return rc; + } + + /* For OI scrub case only: the object has LMA but has no ff + * (or ff crashed). It may be MDT-object, may be OST-object + * with crashed ff. The last check is local FLDB. */ + rc = osd_scrub_check_local_fldb(info, dev, fid); + } + + return rc; +} + static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, struct lu_fid *fid, struct osd_inode_id *lid, __u32 pos, struct super_block *sb, bool scrub) { - struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; - struct inode *inode; - int rc; + struct inode *inode; + int rc; + ENTRY; osd_id_gen(lid, pos, OSD_OII_NOGEN); inode = osd_iget(info, dev, lid); @@ -673,42 +810,30 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, /* The inode may be removed after bitmap searching, or the * file is new created without inode initialized yet. */ if (rc == -ENOENT || rc == -ESTALE) - return SCRUB_NEXT_CONTINUE; + RETURN(SCRUB_NEXT_CONTINUE); CERROR("%.16s: fail to read inode, ino# = %u, rc = %d\n", LDISKFS_SB(sb)->s_es->s_volume_name, pos, rc); - return rc; + RETURN(rc); } /* If the inode has no OI mapping, then it is special locally used, * should be invisible to OI scrub or up layer LFSCK. */ - if (ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI)) { - iput(inode); - return SCRUB_NEXT_CONTINUE; - } + if (ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI)) + GOTO(put, rc = SCRUB_NEXT_CONTINUE); if (scrub && ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB)) { /* Only skip it for the first OI scrub accessing. */ ldiskfs_clear_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB); - iput(inode); - return SCRUB_NEXT_NOSCRUB; + GOTO(put, rc = SCRUB_NEXT_NOSCRUB); } - rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); - if (rc == 0) { - if (fid_is_llog(&lma->lma_self_fid) || - (!scrub && fid_is_internal(&lma->lma_self_fid))) - rc = SCRUB_NEXT_CONTINUE; - else - *fid = lma->lma_self_fid; - } else if (rc == -ENODATA) { - lu_igif_build(fid, inode->i_ino, inode->i_generation); - if (scrub) - rc = SCRUB_NEXT_NOLMA; - else - rc = 0; - } + rc = osd_scrub_get_fid(info, dev, inode, fid, scrub); + + GOTO(put, rc); + +put: iput(inode); return rc; } @@ -972,18 +1097,10 @@ static int osd_inode_iteration(struct osd_thread_info *info, brelse(param.bitmap); RETURN(rc); } - - if (preload && dev->od_otable_it->ooi_stopping) { - brelse(param.bitmap); - RETURN(0); - } } next_group: brelse(param.bitmap); - - if (preload && dev->od_otable_it->ooi_stopping) - RETURN(0); } if (*pos > limit) @@ -1027,8 +1144,7 @@ static int osd_scrub_main(void *args) int rc; ENTRY; - cfs_daemonize("OI_scrub"); - rc = lu_env_init(&env, LCT_DT_THREAD); + rc = lu_env_init(&env, LCT_LOCAL); if (rc != 0) { CERROR("%.16s: OI scrub, fail to init env, rc = %d\n", LDISKFS_SB(sb)->s_es->s_volume_name, rc); @@ -1095,6 +1211,8 @@ typedef int (*scandir_t)(struct osd_thread_info *, struct osd_device *, static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type); +static int osd_ios_lf_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type); static int osd_ios_general_scan(struct osd_thread_info *info, struct osd_device *dev, @@ -1111,6 +1229,7 @@ enum osd_lf_flags { OLF_SCAN_SUBITEMS = 0x0001, OLF_HIDE_FID = 0x0002, OLF_SHOW_NAME = 0x0004, + OLF_NO_OI = 0x0008, }; struct osd_lf_map { @@ -1204,6 +1323,10 @@ static const struct osd_lf_map osd_lf_maps[] = { { "LAST_GROUP", { FID_SEQ_LOCAL_FILE, OFD_LAST_GROUP_OID, 0 }, OLF_SHOW_NAME, NULL, NULL }, + /* lost+found */ + { "lost+found", { 0, 0, 0 }, OLF_SCAN_SUBITEMS | OLF_NO_OI, + osd_ios_general_scan, osd_ios_lf_fill }, + { NULL, { 0, 0, 0 }, 0, NULL, NULL } }; @@ -1308,20 +1431,26 @@ osd_ios_scan_one(struct osd_thread_info *info, struct osd_device *dev, lu_igif_build(&tfid, inode->i_ino, inode->i_generation); else tfid = *fid; - rc = osd_ea_fid_set(info, inode, &tfid, 0); + rc = osd_ea_fid_set(info, inode, &tfid, 0, 0); if (rc != 0) RETURN(rc); } else { + if (lma->lma_compat & LMAC_NOT_IN_OI) + RETURN(0); + tfid = lma->lma_self_fid; } - rc = __osd_oi_lookup(info, dev, &tfid, id2); + rc = osd_oi_lookup(info, dev, &tfid, id2, 0); if (rc != 0) { if (rc != -ENOENT) RETURN(rc); rc = osd_scrub_refresh_mapping(info, dev, &tfid, id, - DTO_INDEX_INSERT); + DTO_INDEX_INSERT, 0); + if (rc > 0) + rc = 0; + RETURN(rc); } @@ -1337,11 +1466,85 @@ osd_ios_scan_one(struct osd_thread_info *info, struct osd_device *dev, RETURN(rc); } - rc = osd_scrub_refresh_mapping(info, dev, &tfid, id, DTO_INDEX_UPDATE); + rc = osd_scrub_refresh_mapping(info, dev, &tfid, id, + DTO_INDEX_UPDATE, 0); + if (rc > 0) + rc = 0; RETURN(rc); } +/** + * It scans the /lost+found, and for the OST-object (with filter_fid + * or filter_fid_old), move them back to its proper /O//d. + */ +static int osd_ios_lf_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type) +{ + struct osd_ios_filldir_buf *fill_buf = buf; + struct osd_thread_info *info = fill_buf->oifb_info; + struct osd_device *dev = fill_buf->oifb_dev; + struct lu_fid *fid = &info->oti_fid; + struct osd_scrub *scrub = &dev->od_scrub; + struct dentry *parent = fill_buf->oifb_dentry; + struct dentry *child; + struct inode *dir = parent->d_inode; + struct inode *inode; + int rc; + ENTRY; + + /* skip any '.' started names */ + if (name[0] == '.') + RETURN(0); + + scrub->os_lf_scanned++; + child = osd_ios_lookup_one_len(name, parent, namelen); + if (IS_ERR(child)) { + CWARN("%s: cannot lookup child '%.*s': rc = %d\n", + osd_name(dev), namelen, name, (int)PTR_ERR(child)); + RETURN(0); + } + + inode = child->d_inode; + if (S_ISDIR(inode->i_mode)) { + rc = osd_ios_new_item(dev, child, osd_ios_general_scan, + osd_ios_lf_fill); + if (rc != 0) + CWARN("%s: cannot add child '%.*s': rc = %d\n", + osd_name(dev), namelen, name, rc); + GOTO(put, rc); + } + + if (!S_ISREG(inode->i_mode)) + GOTO(put, rc = 0); + + rc = osd_scrub_get_fid(info, dev, inode, fid, true); + if (rc == SCRUB_NEXT_OSTOBJ || rc == SCRUB_NEXT_OSTOBJ_OLD) { + rc = osd_obj_map_recover(info, dev, dir, child, fid); + if (rc == 0) { + CDEBUG(D_LFSCK, "recovered '%.*s' ["DFID"] from " + "/lost+found.\n", namelen, name, PFID(fid)); + scrub->os_lf_repaired++; + } else { + CWARN("%s: cannot rename for '%.*s' "DFID": rc = %d\n", + osd_name(dev), namelen, name, PFID(fid), rc); + } + } + + /* XXX: For MDT-objects, we can move them from /lost+found to namespace + * visible place, such as the /ROOT/.lustre/lost+found, then LFSCK + * can process them in furtuer. */ + + GOTO(put, rc); + +put: + if (rc < 0) + scrub->os_lf_failed++; + dput(child); + /* skip the failure to make the scanning to continue. */ + return 0; +} + static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { @@ -1398,8 +1601,9 @@ static int osd_ios_root_fill(void *buf, const char *name, int namelen, if (IS_ERR(child)) RETURN(PTR_ERR(child)); - rc = osd_ios_scan_one(fill_buf->oifb_info, dev, child->d_inode, - &map->olm_fid, map->olm_flags); + if (!(map->olm_flags & OLF_NO_OI)) + rc = osd_ios_scan_one(fill_buf->oifb_info, dev, child->d_inode, + &map->olm_fid, map->olm_flags); if (rc == 0 && map->olm_flags & OLF_SCAN_SUBITEMS) rc = osd_ios_new_item(dev, child, map->olm_scandir, map->olm_filldir); @@ -1444,7 +1648,19 @@ osd_ios_ROOT_scan(struct osd_thread_info *info, struct osd_device *dev, int rc; ENTRY; - /* It is existing MDT device. */ + /* It is existing MDT0 device. We only allow the case of object without + * LMA to happen on the MDT0, which is usually for old 1.8 MDT. Then we + * can generate IGIF mode FID for the object and related OI mapping. If + * it is on other MDTs, then becuase file-level backup/restore, related + * OI mapping may be invalid already, we do not know which is the right + * FID for the object. We only allow IGIF objects to reside on the MDT0. + * + * XXX: For the case of object on non-MDT0 device with neither LMA nor + * "fid" xattr, then something crashed. We cannot re-generate the + * FID directly, instead, the OI scrub will scan the OI structure + * and try to re-generate the LMA from the OI mapping. But if the + * OI mapping crashed or lost also, then we have to give up under + * double failure cases. */ dev->od_handle_nolma = 1; child = osd_ios_lookup_one_len(dot_lustre_name, dentry, strlen(dot_lustre_name)); @@ -1529,11 +1745,12 @@ osd_ios_OBJECTS_scan(struct osd_thread_info *info, struct osd_device *dev, static int osd_initial_OI_scrub(struct osd_thread_info *info, struct osd_device *dev) { - struct osd_ios_item *item = NULL; - scandir_t scandir = osd_ios_general_scan; - filldir_t filldir = osd_ios_root_fill; - struct dentry *dentry = osd_sb(dev)->s_root; - int rc; + struct osd_ios_item *item = NULL; + scandir_t scandir = osd_ios_general_scan; + filldir_t filldir = osd_ios_root_fill; + struct dentry *dentry = osd_sb(dev)->s_root; + const struct osd_lf_map *map = osd_lf_maps; + int rc; ENTRY; while (1) { @@ -1567,7 +1784,32 @@ static int osd_initial_OI_scrub(struct osd_thread_info *info, OBD_FREE_PTR(item); } - RETURN(rc); + if (rc != 0) + RETURN(rc); + + /* There maybe the case that the object has been removed, but its OI + * mapping is still in the OI file, such as the "CATALOGS" after MDT + * file-level backup/restore. So here cleanup the stale OI mappings. */ + while (map->olm_name != NULL) { + struct dentry *child; + + if (fid_is_zero(&map->olm_fid)) { + map++; + continue; + } + + child = osd_ios_lookup_one_len(map->olm_name, + osd_sb(dev)->s_root, + strlen(map->olm_name)); + if (!IS_ERR(child)) + dput(child); + else if (PTR_ERR(child) == -ENOENT) + osd_scrub_refresh_mapping(info, dev, &map->olm_fid, + NULL, DTO_INDEX_DELETE, 0); + map++; + } + + RETURN(0); } char *osd_lf_fid2name(const struct lu_fid *fid) @@ -1619,8 +1861,8 @@ again: scrub->os_start_flags = flags; thread_set_flags(thread, 0); - rc = cfs_create_thread(osd_scrub_main, dev, 0); - if (rc < 0) { + rc = PTR_ERR(kthread_run(osd_scrub_main, dev, "OI_scrub")); + if (IS_ERR_VALUE(rc)) { CERROR("%.16s: cannot start iteration thread, rc = %d\n", LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc); RETURN(rc); @@ -1690,6 +1932,8 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) struct ldiskfs_super_block *es = LDISKFS_SB(sb)->s_es; struct lvfs_run_ctxt saved; struct file *filp; + struct inode *inode; + struct lu_fid *fid = &info->oti_fid; int dirty = 0; int rc = 0; ENTRY; @@ -1707,14 +1951,26 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) push_ctxt(&saved, ctxt, NULL); filp = filp_open(osd_scrub_name, O_RDWR | O_CREAT, 0644); - if (IS_ERR(filp)) + if (IS_ERR(filp)) { + pop_ctxt(&saved, ctxt, NULL); RETURN(PTR_ERR(filp)); + } + + inode = filp->f_dentry->d_inode; + ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI); + /* 'What the @fid is' is not imporatant, because the object + * has no OI mapping, and only is visible inside the OSD.*/ + lu_igif_build(fid, inode->i_ino, inode->i_generation); + rc = osd_ea_fid_set(info, inode, fid, LMAC_NOT_IN_OI, 0); + if (rc != 0) { + filp_close(filp, 0); + pop_ctxt(&saved, ctxt, NULL); + RETURN(rc); + } - scrub->os_inode = igrab(filp->f_dentry->d_inode); + scrub->os_inode = igrab(inode); filp_close(filp, 0); pop_ctxt(&saved, ctxt, NULL); - ldiskfs_set_inode_state(scrub->os_inode, - LDISKFS_STATE_LUSTRE_NO_OI); rc = osd_scrub_file_load(scrub); if (rc == -ENOENT) { @@ -1777,6 +2033,13 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) rc = osd_scrub_start(dev); } + /* it is possible that dcache entries may keep objects after they are + * deleted by OSD. While it looks safe this can cause object data to + * stay until umount causing failures in tests calculating free space, + * e.g. replay-ost-single. Since those dcache entries are not used + * anymore let's just free them after use here */ + shrink_dcache_sb(sb); + RETURN(rc); } @@ -1821,7 +2084,6 @@ static struct dt_it *osd_otable_it_init(const struct lu_env *env, dev->od_otable_it = it; it->ooi_dev = dev; - it->ooi_pid = cfs_curproc_pid(); it->ooi_cache.ooc_consumer_idx = -1; if (flags & DOIF_OUTUSED) it->ooi_used_outside = 1; @@ -1873,30 +2135,8 @@ static int osd_otable_it_get(const struct lu_env *env, return 0; } -/** - * It is hack here: - * - * Sometimes the otable-based iteration driver (LFSCK) may be blocked in OSD - * layer when someone wants to stop/pause the iteration. Under such case, we - * need some mechanism to notify the event and wakeup the blocker. - */ static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di) { - struct osd_otable_it *it = (struct osd_otable_it *)di; - struct osd_device *dev = it->ooi_dev; - - /* od_otable_mutex: prevent curcurrent init/fini */ - mutex_lock(&dev->od_otable_mutex); - if (it->ooi_pid == cfs_curproc_pid()) { - dev->od_scrub.os_paused = 1; - } else { - struct ptlrpc_thread *thread = &dev->od_scrub.os_thread; - - it->ooi_stopping = 1; - if (it->ooi_waiting) - cfs_waitq_broadcast(&thread->t_ctl_waitq); - } - mutex_unlock(&dev->od_otable_mutex); } static inline int @@ -1904,7 +2144,7 @@ osd_otable_it_wakeup(struct osd_scrub *scrub, struct osd_otable_it *it) { spin_lock(&scrub->os_lock); if (it->ooi_cache.ooc_pos_preload < scrub->os_pos_current || - scrub->os_waiting || it->ooi_stopping || + scrub->os_waiting || !thread_is_running(&scrub->os_thread)) it->ooi_waiting = 0; else @@ -1960,9 +2200,6 @@ again: if (!thread_is_running(thread) && !it->ooi_used_outside) RETURN(1); - if (it->ooi_stopping) - RETURN(0); - rc = osd_otable_it_preload(env, it); if (rc >= 0) goto again; @@ -2310,8 +2547,13 @@ int osd_scrub_dump(struct osd_device *dev, char *buf, int len) "run_time: %u seconds\n" "average_speed: "LPU64" objects/sec\n" "real-time_speed: "LPU64" objects/sec\n" - "current_position: %u\n", - rtime, speed, new_checked, scrub->os_pos_current); + "current_position: %u\n" + "lf_scanned: "LPU64"\n" + "lf_reparied: "LPU64"\n" + "lf_failed: "LPU64"\n", + rtime, speed, new_checked, scrub->os_pos_current, + scrub->os_lf_scanned, scrub->os_lf_repaired, + scrub->os_lf_failed); } else { if (sf->sf_run_time != 0) do_div(speed, sf->sf_run_time); @@ -2319,8 +2561,12 @@ int osd_scrub_dump(struct osd_device *dev, char *buf, int len) "run_time: %u seconds\n" "average_speed: "LPU64" objects/sec\n" "real-time_speed: N/A\n" - "current_position: N/A\n", - sf->sf_run_time, speed); + "current_position: N/A\n" + "lf_scanned: "LPU64"\n" + "lf_reparied: "LPU64"\n" + "lf_failed: "LPU64"\n", + sf->sf_run_time, speed, scrub->os_lf_scanned, + scrub->os_lf_repaired, scrub->os_lf_failed); } if (rc <= 0) goto out;