From: Fan Yong Date: Tue, 10 May 2016 23:10:39 +0000 (+0800) Subject: LU-7782 scrub: handle slave obj of striped directory X-Git-Tag: 2.8.56~72 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=80fe81c5b14835bbd5d751e878edbd00fe90f797 LU-7782 scrub: handle slave obj of striped directory When looking up an item under a striped directory, the client first locates the master MDT-object of the striped directory, and then sends a lookup (getattr_by_name) RPC to the MDT with the FID of one of the slave MDT-objects and the item's name. If the system has been restored from an MDT file-level backup, then until the OI scrub has completely rebuilt the OI files, the OI mappings of the master MDT-object and the slave MDT-object may be invalid. Usually this is not a problem for the master MDT-object: locating it involves a name-based lookup (for the striped directory itself), and during that lookup we can set up the correct OI mapping for the master MDT-object. But it is a problem for the slave MDT-object: the client does not trigger a name-based lookup on the MDT to locate the slave MDT-object before locating an item under the striped directory, so osd_fid_lookup() finds that the OI mapping for the slave MDT-object is invalid and does not know what the right OI mapping is. The MDT then has to return -EINPROGRESS to the client, to notify it that the OI scrub is rebuilding the OI file, that the related OI mapping is not yet known, and that it should try again later. The client then retries the RPC again and again until the related OI mapping has been updated. That is quite inefficient. To resolve this problem, we handle it as the following two cases: 1) The slave MDT-object and the master MDT-object are on different MDTs. This case is relatively easy. Like any other remote MDT-object, the slave MDT-object is linked under /REMOTE_PARENT_DIR with its FID string as the name, so we can locate the slave MDT-object by looking it up in /REMOTE_PARENT_DIR directly. 
2) The slave MDT-object and the master MDT-object reside on the same MDT. In this case, while looking up the master MDT-object, we look up the slave MDT-object via readdir on the master MDT-object, because the slave MDT-objects' information is stored as sub-directories with the name "${FID}:${index}". When the local slave MDT-object is found, its OI mapping is recorded, so a subsequent osd_fid_lookup() will know the correct OI mapping for the slave MDT-object. The patch also enhances sanity-scrub to avoid DNE in sanity-scrub on one MDT. Signed-off-by: Andreas Dilger Signed-off-by: Fan Yong Change-Id: I0bf12ac981017245e4c2da08176422a993086c18 Reviewed-on: http://review.whamcloud.com/18801 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- diff --git a/lustre/osd-ldiskfs/osd_compat.c b/lustre/osd-ldiskfs/osd_compat.c index cec04db..5e7bb50 100644 --- a/lustre/osd-ldiskfs/osd_compat.c +++ b/lustre/osd-ldiskfs/osd_compat.c @@ -332,9 +332,19 @@ int osd_lookup_in_remote_parent(struct osd_thread_info *oti, if (bh == NULL) { rc = -ENOENT; } else { - rc = 0; + struct inode *inode; + osd_id_gen(id, le32_to_cpu(de->inode), OSD_OII_NOGEN); brelse(bh); + inode = osd_iget(oti, osd, id); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + if (rc == -ESTALE) + rc = -ENOENT; + } else { + iput(inode); + rc = 0; + } } mutex_unlock(&parent->d_inode->i_mutex); if (rc == 0) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 21bbdfc..bf1a02d 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -767,6 +767,176 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj) RETURN(rc); } +struct osd_check_lmv_buf { +#ifdef HAVE_DIR_CONTEXT + /* please keep it as first member */ + struct dir_context ctx; +#endif + struct osd_thread_info *oclb_info; + struct osd_device *oclb_dev; + struct osd_idmap_cache *oclb_oic; +}; + +/** + * It is called internally by ->readdir() to 
filter out the + * local slave object's FID of the striped directory. + * + * \retval 1 found the local slave's FID + * \retval 0 continue to check next item + * \retval -ve for failure + */ +#ifdef HAVE_FILLDIR_USE_CTX +static int osd_stripe_dir_filldir(struct dir_context *buf, +#else +static int osd_stripe_dir_filldir(void *buf, +#endif + const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type) +{ + struct osd_check_lmv_buf *oclb = (struct osd_check_lmv_buf *)buf; + struct osd_thread_info *oti = oclb->oclb_info; + struct lu_fid *fid = &oti->oti_fid3; + struct osd_inode_id *id = &oti->oti_id3; + struct osd_device *dev = oclb->oclb_dev; + struct osd_idmap_cache *oic = oclb->oclb_oic; + struct inode *inode; + int rc; + + if (name[0] == '.') + return 0; + + fid_zero(fid); + sscanf(name + 1, SFID, RFID(fid)); + if (!fid_is_sane(fid)) + return 0; + + if (osd_remote_fid(oti->oti_env, dev, fid)) + return 0; + + osd_id_gen(id, ino, OSD_OII_NOGEN); + inode = osd_iget(oti, dev, id); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iput(inode); + osd_add_oi_cache(oti, dev, id, fid); + oic->oic_fid = *fid; + oic->oic_lid = *id; + oic->oic_dev = dev; + rc = osd_oii_insert(dev, oic, true); + + return rc == 0 ? 1 : rc; +} + +/* When lookup item under striped directory, we need to locate the master + * MDT-object of the striped directory firstly, then the client will send + * lookup (getattr_by_name) RPC to the MDT with some slave MDT-object's FID + * and the item's name. If the system is restored from MDT file level backup, + * then before the OI scrub completely built the OI files, the OI mappings of + * the master MDT-object and slave MDT-object may be invalid. Usually, it is + * not a problem for the master MDT-object. Because when locate the master + * MDT-object, we will do name based lookup (for the striped directory itself) + * firstly, during such process we can setup the correct OI mapping for the + * master MDT-object. 
But it will be trouble for the slave MDT-object. Because + * the client will not trigger name based lookup on the MDT to locate the slave + * MDT-object before locating item under the striped directory, then when + * osd_fid_lookup(), it will find that the OI mapping for the slave MDT-object + * is invalid and does not know what the right OI mapping is, then the MDT has + * to return -EINPROGRESS to the client to notify that the OI scrub is rebuilding + * the OI file, related OI mapping is unknown yet, please try again later. And + * then client will re-try the RPC again and again until related OI mapping has + * been updated. That is quite inefficient. + * + * To resolve above trouble, we will handle it as the following two cases: + * + * 1) The slave MDT-object and the master MDT-object are on different MDTs. + * It is relatively easy. Be as one of remote MDT-objects, the slave MDT-object + * is linked under /REMOTE_PARENT_DIR with the name of its FID string. + * We can locate the slave MDT-object via lookup the /REMOTE_PARENT_DIR + * directly. Please check osd_fid_lookup(). + * + * 2) The slave MDT-object and the master MDT-object reside on the same MDT. + * Under such case, during lookup the master MDT-object, we will lookup the + * slave MDT-object via readdir against the master MDT-object, because the + * slave MDT-objects information are stored as sub-directories with the name + * "${FID}:${index}". Then when find the local slave MDT-object, its OI + * mapping will be recorded. Then subsequent osd_fid_lookup() will know + * the correct OI mapping for the slave MDT-object. 
*/ +static int osd_check_lmv(struct osd_thread_info *oti, struct osd_device *dev, + struct inode *inode, struct osd_idmap_cache *oic) +{ + struct lu_buf *buf = &oti->oti_big_buf; + struct dentry *dentry = &oti->oti_obj_dentry; + struct file *filp = &oti->oti_file; + const struct file_operations *fops; + struct lmv_mds_md_v1 *lmv1; + struct osd_check_lmv_buf oclb = { +#ifdef HAVE_DIR_CONTEXT + .ctx.actor = osd_stripe_dir_filldir, +#endif + .oclb_info = oti, + .oclb_dev = dev, + .oclb_oic = oic + }; + int rc = 0; + ENTRY; + +again: + rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMV, buf->lb_buf, + buf->lb_len); + if (rc == -ERANGE) { + rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMV, NULL, 0); + if (rc > 0) { + lu_buf_realloc(buf, rc); + if (buf->lb_buf == NULL) + GOTO(out, rc = -ENOMEM); + + goto again; + } + } + + if (unlikely(rc == 0 || rc == -ENODATA)) + GOTO(out, rc = 0); + + if (rc < 0) + GOTO(out, rc); + + if (unlikely(buf->lb_buf == NULL)) { + lu_buf_realloc(buf, rc); + if (buf->lb_buf == NULL) + GOTO(out, rc = -ENOMEM); + + goto again; + } + + lmv1 = buf->lb_buf; + if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1) + GOTO(out, rc = 0); + + fops = inode->i_fop; + dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; + filp->f_pos = 0; + filp->f_path.dentry = dentry; + filp->f_mode = FMODE_64BITHASH; + filp->f_mapping = inode->i_mapping; + filp->f_op = fops; + filp->private_data = NULL; + set_file_inode(filp, inode); + +#ifdef HAVE_DIR_CONTEXT + oclb.ctx.pos = filp->f_pos; + rc = fops->iterate(filp, &oclb.ctx); + filp->f_pos = oclb.ctx.pos; +#else + rc = fops->readdir(filp, &oclb, osd_stripe_dir_filldir); +#endif + fops->release(inode, filp); + +out: + RETURN(rc >= 0 ? 
0 : rc); +} + static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, const struct lu_fid *fid, const struct lu_object_conf *conf) @@ -990,7 +1160,13 @@ found: obj->oo_compat_dot_created = 1; obj->oo_compat_dotdot_created = 1; - if (!S_ISDIR(inode->i_mode) || !ldiskfs_pdo) /* done */ + if (!S_ISDIR(inode->i_mode)) + GOTO(out, result = 0); + + if (flags & SS_AUTO_PARTIAL) + osd_check_lmv(info, dev, inode, oic); + + if (!ldiskfs_pdo) GOTO(out, result = 0); LASSERT(obj->oo_hl_head == NULL); @@ -4627,7 +4803,8 @@ osd_consistency_check(struct osd_thread_info *oti, struct osd_device *dev, { struct osd_scrub *scrub = &dev->od_scrub; struct lu_fid *fid = &oic->oic_fid; - struct osd_inode_id *id = &oti->oti_id; + struct osd_inode_id *id = &oic->oic_lid; + struct inode *inode = NULL; int once = 0; int rc; ENTRY; @@ -4639,13 +4816,14 @@ osd_consistency_check(struct osd_thread_info *oti, struct osd_device *dev, RETURN(0); again: - rc = osd_oi_lookup(oti, dev, fid, id, 0); + rc = osd_oi_lookup(oti, dev, fid, &oti->oti_id, 0); if (rc == -ENOENT) { - struct inode *inode; + __u32 gen = id->oii_gen; - *id = oic->oic_lid; - inode = osd_iget(oti, dev, &oic->oic_lid); + if (inode != NULL) + goto trigger; + inode = osd_iget(oti, dev, id); /* The inode has been removed (by race maybe). */ if (IS_ERR(inode)) { rc = PTR_ERR(inode); @@ -4653,15 +4831,15 @@ again: RETURN(rc == -ESTALE ? -ENOENT : rc); } - iput(inode); /* The OI mapping is lost. */ - if (id->oii_gen != OSD_OII_NOGEN) + if (gen != OSD_OII_NOGEN) goto trigger; + iput(inode); /* The inode may has been reused by others, we do not know, * leave it to be handled by subsequent osd_fid_lookup(). 
*/ RETURN(0); - } else if (rc != 0 || osd_id_eq(id, &oic->oic_lid)) { + } else if (rc != 0 || osd_id_eq(id, &oti->oti_id)) { RETURN(rc); } @@ -4675,21 +4853,41 @@ trigger: if (unlikely(rc == -EAGAIN)) goto again; - RETURN(0); + if (inode == NULL) { + inode = osd_iget(oti, dev, id); + /* The inode has been removed (by race maybe). */ + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + + RETURN(rc == -ESTALE ? -ENOENT : rc); + } + } + + if (!S_ISDIR(inode->i_mode)) + rc = 0; + else + rc = osd_check_lmv(oti, dev, inode, oic); + + iput(inode); + RETURN(rc); } if (!dev->od_noscrub && ++once == 1) { rc = osd_scrub_start(dev, SS_AUTO_PARTIAL | SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT); - CDEBUG(D_LFSCK | D_CONSOLE, "%.16s: trigger OI scrub by RPC " - "for "DFID", rc = %d [2]\n", + CDEBUG(D_LFSCK | D_CONSOLE | D_WARNING, + "%.16s: trigger partial OI scrub for RPC inconsistency " + "checking FID "DFID": rc = %d\n", LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, PFID(fid), rc); if (rc == 0 || rc == -EALREADY) goto again; } - RETURN(0); + if (inode != NULL) + iput(inode); + + RETURN(rc); } static int osd_fail_fid_lookup(struct osd_thread_info *oti, @@ -5485,13 +5683,12 @@ struct osd_filldir_cbs { * \retval 1 on buffer full */ #ifdef HAVE_FILLDIR_USE_CTX -static int osd_ldiskfs_filldir(struct dir_context *buf, - const char *name, int namelen, +static int osd_ldiskfs_filldir(struct dir_context *buf, #else -static int osd_ldiskfs_filldir(void *buf, const char *name, int namelen, +static int osd_ldiskfs_filldir(void *buf, #endif - loff_t offset, __u64 ino, - unsigned d_type) + const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type) { struct osd_it_ea *it = ((struct osd_filldir_cbs *)buf)->it; diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c index 3cd4e8c..f5c7f13 100644 --- a/lustre/osd-ldiskfs/osd_scrub.c +++ b/lustre/osd-ldiskfs/osd_scrub.c @@ -1223,16 +1223,16 @@ static void osd_scrub_join(struct osd_device *dev, __u32 flags, sf->sf_status 
= SS_SCANNING; } - if (flags & SS_AUTO_FULL) { - sf->sf_flags |= SF_AUTO; - scrub->os_full_speed = 1; - } - if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_UPGRADE)) scrub->os_full_speed = 1; else scrub->os_full_speed = 0; + if (flags & SS_AUTO_FULL) { + sf->sf_flags |= SF_AUTO; + scrub->os_full_speed = 1; + } + scrub->os_new_checked = 0; if (sf->sf_pos_last_checkpoint != 0) sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1; @@ -1899,11 +1899,11 @@ osd_ios_scan_one(struct osd_thread_info *info, struct osd_device *dev, * or filter_fid_old), move them back to its proper /O//d. */ #ifdef HAVE_FILLDIR_USE_CTX -static int osd_ios_lf_fill(struct dir_context *buf, const char *name, - int namelen, +static int osd_ios_lf_fill(struct dir_context *buf, #else -static int osd_ios_lf_fill(void *buf, const char *name, int namelen, +static int osd_ios_lf_fill(void *buf, #endif + const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { struct osd_ios_filldir_buf *fill_buf = @@ -1973,11 +1973,11 @@ put: } #ifdef HAVE_FILLDIR_USE_CTX -static int osd_ios_varfid_fill(struct dir_context *buf, const char *name, - int namelen, +static int osd_ios_varfid_fill(struct dir_context *buf, #else -static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, +static int osd_ios_varfid_fill(void *buf, #endif + const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { struct osd_ios_filldir_buf *fill_buf = @@ -2006,11 +2006,11 @@ static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, } #ifdef HAVE_FILLDIR_USE_CTX -static int osd_ios_dl_fill(struct dir_context *buf, const char *name, - int namelen, +static int osd_ios_dl_fill(struct dir_context *buf, #else -static int osd_ios_dl_fill(void *buf, const char *name, int namelen, +static int osd_ios_dl_fill(void *buf, #endif + const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { struct osd_ios_filldir_buf *fill_buf = @@ -2048,11 +2048,11 @@ static int 
osd_ios_dl_fill(void *buf, const char *name, int namelen, } #ifdef HAVE_FILLDIR_USE_CTX -static int osd_ios_uld_fill(struct dir_context *buf, const char *name, - int namelen, +static int osd_ios_uld_fill(struct dir_context *buf, #else -static int osd_ios_uld_fill(void *buf, const char *name, int namelen, +static int osd_ios_uld_fill(void *buf, #endif + const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { struct osd_ios_filldir_buf *fill_buf = @@ -2083,11 +2083,11 @@ static int osd_ios_uld_fill(void *buf, const char *name, int namelen, } #ifdef HAVE_FILLDIR_USE_CTX -static int osd_ios_root_fill(struct dir_context *buf, const char *name, - int namelen, +static int osd_ios_root_fill(struct dir_context *buf, #else -static int osd_ios_root_fill(void *buf, const char *name, int namelen, +static int osd_ios_root_fill(void *buf, #endif + const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { struct osd_ios_filldir_buf *fill_buf = diff --git a/lustre/tests/sanity-scrub.sh b/lustre/tests/sanity-scrub.sh index 2000ed8..9e3e216 100644 --- a/lustre/tests/sanity-scrub.sh +++ b/lustre/tests/sanity-scrub.sh @@ -128,13 +128,8 @@ scrub_prep() { echo "preparing... 
$(date)" for n in $(seq $MDSCOUNT); do echo "creating $nfiles files on mds$n" - if [ $n -eq 1 ]; then - mkdir $DIR/$tdir/mds$n || - error "Failed to create directory mds$n" - else - $LFS mkdir -i $((n - 1)) $DIR/$tdir/mds$n || - error "Failed to create remote directory mds$n" - fi + test_mkdir -i $((n - 1)) $DIR/$tdir/mds$n || + error "Failed to create directory mds$n" cp $LUSTRE/tests/*.sh $DIR/$tdir/mds$n || error "Failed to copy files to mds$n" mkdir -p $DIR/$tdir/mds$n/d_$tfile || @@ -287,7 +282,7 @@ scrub_backup_restore() { for n in $(seq $MDSCOUNT); do mds_backup_restore mds$n $igif || - error "(error_id) Backup/restore on mds$n failed" + error "($error_id) Backup/restore on mds$n failed" done } @@ -590,12 +585,12 @@ test_5() { scrub_check_flags 4 recreated,inconsistent mount_client $MOUNT || error "(5) Fail to start client!" scrub_enable_auto + full_scrub_ratio 0 #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 do_nodes $(comma_list $(mdts_nodes)) \ $LCTL set_param fail_val=3 fail_loc=0x190 - full_scrub_ratio 0 scrub_check_data 6 umount_client $MOUNT || error "(7) Fail to stop client!" scrub_check_status 8 scanning @@ -657,12 +652,12 @@ test_6() { scrub_check_flags 4 recreated,inconsistent mount_client $MOUNT || error "(5) Fail to start client!" scrub_enable_auto + full_scrub_ratio 0 #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 do_nodes $(comma_list $(mdts_nodes)) \ $LCTL set_param fail_val=2 fail_loc=0x190 - full_scrub_ratio 0 scrub_check_data 6 # Sleep 5 sec to guarantee at least one object processed by OI scrub @@ -735,12 +730,12 @@ test_7() { scrub_check_flags 4 recreated,inconsistent mount_client $MOUNT || error "(5) Fail to start client!" 
scrub_enable_auto + full_scrub_ratio 0 #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 do_nodes $(comma_list $(mdts_nodes)) \ $LCTL set_param fail_val=3 fail_loc=0x190 - full_scrub_ratio 0 scrub_check_data 6 local n @@ -874,12 +869,12 @@ test_10a() { scrub_check_flags 4 recreated,inconsistent mount_client $MOUNT || error "(5) Fail to start client!" scrub_enable_auto + full_scrub_ratio 0 #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 do_nodes $(comma_list $(mdts_nodes)) \ $LCTL set_param fail_val=1 fail_loc=0x190 - full_scrub_ratio 0 scrub_check_data 6 scrub_check_status 7 scanning umount_client $MOUNT || error "(8) Fail to stop client!" @@ -938,7 +933,7 @@ test_11() { check_mount_and_prep for n in $(seq $MDSCOUNT); do - $LFS mkdir -i $((n - 1)) $DIR/$tdir/mds$n || + test_mkdir -i $((n - 1)) $DIR/$tdir/mds$n || error "(1) Fail to mkdir $DIR/$tdir/mds$n" createmany -o $DIR/$tdir/mds$n/f $CREATED ||