From 0c1ef56346b1df6eddfca761bb422186db27a575 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 27 Jun 2013 17:14:39 +0800 Subject: [PATCH] LU-3335 scrub: recover OST-objects from /lost+found Originally, Lustre used the tool ll_recover_lost_found_objs to recover orphans under /lost+found under offline mode. With OI scrub introduced it can be done by initial OI scrub during the OST mount automatically. Test-Parameters: testlist=sanity-scrub Signed-off-by: Fan Yong Change-Id: Ibf7e51cc0de10def47a117217b84c2afcd1209ca Reviewed-on: http://review.whamcloud.com/6857 Tested-by: Hudson Reviewed-by: Alex Zhuravlev Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/osd-ldiskfs/osd_compat.c | 110 ++++++++++++++++++++ lustre/osd-ldiskfs/osd_internal.h | 3 + lustre/osd-ldiskfs/osd_scrub.c | 208 ++++++++++++++++++++++++++++---------- lustre/osd-ldiskfs/osd_scrub.h | 10 ++ lustre/tests/sanity-scrub.sh | 52 ++++++++++ 5 files changed, 327 insertions(+), 56 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_compat.c b/lustre/osd-ldiskfs/osd_compat.c index f8d65bf..2f22f5a 100644 --- a/lustre/osd-ldiskfs/osd_compat.c +++ b/lustre/osd-ldiskfs/osd_compat.c @@ -1028,6 +1028,116 @@ int osd_obj_map_update(struct osd_thread_info *info, RETURN(rc); } +int osd_obj_map_recover(struct osd_thread_info *info, + struct osd_device *osd, + struct inode *src_parent, + struct dentry *src_child, + const struct lu_fid *fid) +{ + struct osd_obj_seq *osd_seq; + struct dentry *tgt_parent; + struct dentry *tgt_child = &info->oti_child_dentry; + struct inode *dir; + struct inode *inode = src_child->d_inode; + struct ost_id *ostid = &info->oti_ostid; + handle_t *jh; + struct ldiskfs_dir_entry_2 *de; + struct buffer_head *bh; + char name[32]; + int dirn; + int rc = 0; + ENTRY; + + if (fid_is_last_id(fid)) { + osd_seq = osd_seq_load(info, osd, fid_seq(fid)); + if (IS_ERR(osd_seq)) + RETURN(PTR_ERR(osd_seq)); + + tgt_parent = osd_seq->oos_root; + tgt_child->d_name.name = "LAST_ID"; 
+ tgt_child->d_name.len = strlen("LAST_ID"); + } else { + fid_to_ostid(fid, ostid); + osd_seq = osd_seq_load(info, osd, ostid_seq(ostid)); + if (IS_ERR(osd_seq)) + RETURN(PTR_ERR(osd_seq)); + + dirn = ostid_id(ostid) & (osd_seq->oos_subdir_count - 1); + tgt_parent = osd_seq->oos_dirs[dirn]; + osd_oid_name(name, sizeof(name), fid, ostid_id(ostid)); + tgt_child->d_name.name = name; + tgt_child->d_name.len = strlen(name); + } + LASSERT(tgt_parent != NULL); + + dir = tgt_parent->d_inode; + tgt_child->d_name.hash = 0; + tgt_child->d_parent = tgt_parent; + tgt_child->d_inode = inode; + + /* The non-initialized src_child may be destroyed. */ + jh = ldiskfs_journal_start_sb(osd_sb(osd), + osd_dto_credits_noquota[DTO_INDEX_DELETE] + + osd_dto_credits_noquota[DTO_INDEX_INSERT] + + osd_dto_credits_noquota[DTO_OBJECT_DELETE]); + if (IS_ERR(jh)) + RETURN(PTR_ERR(jh)); + + ll_vfs_dq_init(src_parent); + ll_vfs_dq_init(dir); + + mutex_lock(&src_parent->i_mutex); + mutex_lock(&dir->i_mutex); + bh = osd_ldiskfs_find_entry(dir, tgt_child, &de, NULL); + if (bh != NULL) { + /* XXX: If some other object occupied the same slot. And if such + * inode is zero-sized and with SUID+SGID, then means it is + * a new created one. Maybe we can remove it and insert the + * original one back to the /O//d. But there are + * something to be considered: + * + * 1) The OST-object under /lost+found has crashed LMA. + * So it should not conflict with the current one. + * + * 2) There are race conditions that: someone may just want + * to modify the current one. Even if the OI scrub takes + * the object lock when remove the current one, it still + * cause the modification to be lost because the target + * has been removed when the RPC service thread waiting + * for the lock. + * + * So keep it there before we have suitable solution. */ + brelse(bh); + + /* If the src object has never been modified, then remove it. 
*/ + if (inode->i_size == 0 && inode->i_mode & S_ISUID && + inode->i_mode & S_ISGID) + ll_vfs_unlink(src_parent, src_child, osd->od_mnt); + GOTO(unlock_src, rc = 0); + } + + bh = osd_ldiskfs_find_entry(src_parent, src_child, &de, NULL); + if (unlikely(bh == NULL)) + GOTO(unlock, rc = -ENOENT); + + rc = ldiskfs_delete_entry(jh, src_parent, de, bh); + brelse(bh); + if (rc != 0) + GOTO(unlock, rc); + + rc = osd_ldiskfs_add_entry(jh, tgt_child, inode, NULL); + + GOTO(unlock, rc); + +unlock: + mutex_unlock(&dir->i_mutex); + +unlock_src: + mutex_unlock(&src_parent->i_mutex); + ldiskfs_journal_stop(jh); + return rc; +} + static struct dentry * osd_object_spec_find(struct osd_thread_info *info, struct osd_device *osd, const struct lu_fid *fid, char **name) diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 3680022..efe0313 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -655,6 +655,9 @@ int osd_obj_map_delete(struct osd_thread_info *info, struct osd_device *osd, int osd_obj_map_update(struct osd_thread_info *info, struct osd_device *osd, const struct lu_fid *fid, const struct osd_inode_id *id, struct thandle *th); +int osd_obj_map_recover(struct osd_thread_info *info, struct osd_device *osd, + struct inode *src_parent, struct dentry *src_child, + const struct lu_fid *fid); int osd_obj_spec_lookup(struct osd_thread_info *info, struct osd_device *osd, const struct lu_fid *fid, struct osd_inode_id *id); int osd_obj_spec_insert(struct osd_thread_info *info, struct osd_device *osd, diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c index 95b0caf..6be982c 100644 --- a/lustre/osd-ldiskfs/osd_scrub.c +++ b/lustre/osd-ldiskfs/osd_scrub.c @@ -703,41 +703,13 @@ static int osd_scrub_check_local_fldb(struct osd_thread_info *info, return 0; } -static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, - struct lu_fid *fid, struct osd_inode_id *lid, __u32 pos, - 
struct super_block *sb, bool scrub) +static int osd_scrub_get_fid(struct osd_thread_info *info, + struct osd_device *dev, struct inode *inode, + struct lu_fid *fid, bool scrub) { - struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; - struct inode *inode; - int rc = 0; - bool has_lma = false; - ENTRY; - - osd_id_gen(lid, pos, OSD_OII_NOGEN); - inode = osd_iget(info, dev, lid); - if (IS_ERR(inode)) { - rc = PTR_ERR(inode); - /* The inode may be removed after bitmap searching, or the - * file is new created without inode initialized yet. */ - if (rc == -ENOENT || rc == -ESTALE) - RETURN(SCRUB_NEXT_CONTINUE); - - CERROR("%.16s: fail to read inode, ino# = %u, rc = %d\n", - LDISKFS_SB(sb)->s_es->s_volume_name, pos, rc); - RETURN(rc); - } - - /* If the inode has no OI mapping, then it is special locally used, - * should be invisible to OI scrub or up layer LFSCK. */ - if (ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI)) - GOTO(put, rc = SCRUB_NEXT_CONTINUE); - - if (scrub && - ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB)) { - /* Only skip it for the first OI scrub accessing. 
*/ - ldiskfs_clear_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB); - GOTO(put, rc = SCRUB_NEXT_NOSCRUB); - } + struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; + int rc; + bool has_lma = false; rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); if (rc == 0) { @@ -745,33 +717,30 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, if (lma->lma_compat & LMAC_NOT_IN_OI) { ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI); - GOTO(put, rc = SCRUB_NEXT_CONTINUE); + return SCRUB_NEXT_CONTINUE; } - if (fid_is_llog(&lma->lma_self_fid)) - GOTO(put, rc = SCRUB_NEXT_CONTINUE); - *fid = lma->lma_self_fid; if (fid_is_internal(&lma->lma_self_fid)) { if (!scrub) rc = SCRUB_NEXT_CONTINUE; - GOTO(put, rc); + return rc; } if (!scrub) - GOTO(put, rc); + return 0; if (fid_is_namespace_visible(fid) && !fid_is_norm(fid)) - GOTO(put, rc); + return 0; - if (lma->lma_compat & LMAC_FID_ON_OST || fid_is_last_id(fid)) - GOTO(put, rc = SCRUB_NEXT_OSTOBJ); + if (lma->lma_compat & LMAC_FID_ON_OST) + return SCRUB_NEXT_OSTOBJ; - if (fid_is_idif(fid)) - GOTO(put, rc = SCRUB_NEXT_OSTOBJ_OLD); + if (fid_is_idif(fid) || fid_is_last_id(fid)) + return SCRUB_NEXT_OSTOBJ_OLD; if (lma->lma_incompat & LMAI_AGENT) - GOTO(put, rc = SCRUB_NEXT_CONTINUE); + return SCRUB_NEXT_CONTINUE; /* Here, it may be MDT-object, or may be 2.4 OST-object. * Fall through. */ @@ -783,7 +752,7 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, if (scrub) /* It is old 2.x (x <= 3) or 1.8 OST-object. */ rc = SCRUB_NEXT_OSTOBJ_OLD; - GOTO(put, rc); + return rc; } if (rc > 0) { @@ -794,11 +763,11 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, else /* It is 2.4 OST-object. 
*/ rc = SCRUB_NEXT_OSTOBJ_OLD; - GOTO(put, rc); + return rc; } if (rc != -ENODATA) - GOTO(put, rc); + return rc; if (!has_lma) { if (dev->od_handle_nolma) { @@ -814,7 +783,7 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, * generate its FID, ignore it directly. */ rc = SCRUB_NEXT_CONTINUE; } - GOTO(put, rc); + return rc; } /* For OI scrub case only: the object has LMA but has no ff @@ -823,6 +792,45 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, rc = osd_scrub_check_local_fldb(info, dev, fid); } + return rc; +} + +static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, + struct lu_fid *fid, struct osd_inode_id *lid, __u32 pos, + struct super_block *sb, bool scrub) +{ + struct inode *inode; + int rc; + ENTRY; + + osd_id_gen(lid, pos, OSD_OII_NOGEN); + inode = osd_iget(info, dev, lid); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + /* The inode may be removed after bitmap searching, or the + * file is new created without inode initialized yet. */ + if (rc == -ENOENT || rc == -ESTALE) + RETURN(SCRUB_NEXT_CONTINUE); + + CERROR("%.16s: fail to read inode, ino# = %u, rc = %d\n", + LDISKFS_SB(sb)->s_es->s_volume_name, pos, rc); + RETURN(rc); + } + + /* If the inode has no OI mapping, then it is special locally used, + * should be invisible to OI scrub or up layer LFSCK. */ + if (ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI)) + GOTO(put, rc = SCRUB_NEXT_CONTINUE); + + if (scrub && + ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB)) { + /* Only skip it for the first OI scrub accessing. 
*/ + ldiskfs_clear_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB); + GOTO(put, rc = SCRUB_NEXT_NOSCRUB); + } + + rc = osd_scrub_get_fid(info, dev, inode, fid, scrub); + GOTO(put, rc); put: @@ -1203,6 +1211,8 @@ typedef int (*scandir_t)(struct osd_thread_info *, struct osd_device *, static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type); +static int osd_ios_lf_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type); static int osd_ios_general_scan(struct osd_thread_info *info, struct osd_device *dev, @@ -1219,6 +1229,7 @@ enum osd_lf_flags { OLF_SCAN_SUBITEMS = 0x0001, OLF_HIDE_FID = 0x0002, OLF_SHOW_NAME = 0x0004, + OLF_NO_OI = 0x0008, }; struct osd_lf_map { @@ -1312,6 +1323,10 @@ static const struct osd_lf_map osd_lf_maps[] = { { "LAST_GROUP", { FID_SEQ_LOCAL_FILE, OFD_LAST_GROUP_OID, 0 }, OLF_SHOW_NAME, NULL, NULL }, + /* lost+found */ + { "lost+found", { 0, 0, 0 }, OLF_SCAN_SUBITEMS | OLF_NO_OI, + osd_ios_general_scan, osd_ios_lf_fill }, + { NULL, { 0, 0, 0 }, 0, NULL, NULL } }; @@ -1459,6 +1474,77 @@ osd_ios_scan_one(struct osd_thread_info *info, struct osd_device *dev, RETURN(rc); } +/** + * It scans the /lost+found, and for the OST-object (with filter_fid + * or filter_fid_old), move them back to its proper /O//d. + */ +static int osd_ios_lf_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type) +{ + struct osd_ios_filldir_buf *fill_buf = buf; + struct osd_thread_info *info = fill_buf->oifb_info; + struct osd_device *dev = fill_buf->oifb_dev; + struct lu_fid *fid = &info->oti_fid; + struct osd_scrub *scrub = &dev->od_scrub; + struct dentry *parent = fill_buf->oifb_dentry; + struct dentry *child; + struct inode *dir = parent->d_inode; + struct inode *inode; + int rc; + ENTRY; + + /* skip any '.' 
started names */ + if (name[0] == '.') + RETURN(0); + + scrub->os_lf_scanned++; + child = osd_ios_lookup_one_len(name, parent, namelen); + if (IS_ERR(child)) { + CWARN("%s: cannot lookup child '%.*s': rc = %d\n", + osd_name(dev), namelen, name, (int)PTR_ERR(child)); + RETURN(0); + } + + inode = child->d_inode; + if (S_ISDIR(inode->i_mode)) { + rc = osd_ios_new_item(dev, child, osd_ios_general_scan, + osd_ios_lf_fill); + if (rc != 0) + CWARN("%s: cannot add child '%.*s': rc = %d\n", + osd_name(dev), namelen, name, rc); + GOTO(put, rc); + } + + if (!S_ISREG(inode->i_mode)) + GOTO(put, rc = 0); + + rc = osd_scrub_get_fid(info, dev, inode, fid, true); + if (rc == SCRUB_NEXT_OSTOBJ || rc == SCRUB_NEXT_OSTOBJ_OLD) { + rc = osd_obj_map_recover(info, dev, dir, child, fid); + if (rc == 0) { + CDEBUG(D_LFSCK, "recovered '%.*s' ["DFID"] from " + "/lost+found.\n", namelen, name, PFID(fid)); + scrub->os_lf_repaired++; + } else { + CWARN("%s: cannot rename for '%.*s' "DFID": rc = %d\n", + osd_name(dev), namelen, name, PFID(fid), rc); + } + } + + /* XXX: For MDT-objects, we can move them from /lost+found to namespace + * visible place, such as the /ROOT/.lustre/lost+found, then LFSCK + * can process them further. */ + + GOTO(put, rc); + +put: + if (rc < 0) + scrub->os_lf_failed++; + dput(child); + /* skip the failure to make the scanning to continue. 
*/ + return 0; +} + static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, loff_t offset, __u64 ino, unsigned d_type) { @@ -1515,8 +1601,9 @@ static int osd_ios_root_fill(void *buf, const char *name, int namelen, if (IS_ERR(child)) RETURN(PTR_ERR(child)); - rc = osd_ios_scan_one(fill_buf->oifb_info, dev, child->d_inode, - &map->olm_fid, map->olm_flags); + if (!(map->olm_flags & OLF_NO_OI)) + rc = osd_ios_scan_one(fill_buf->oifb_info, dev, child->d_inode, + &map->olm_fid, map->olm_flags); if (rc == 0 && map->olm_flags & OLF_SCAN_SUBITEMS) rc = osd_ios_new_item(dev, child, map->olm_scandir, map->olm_filldir); @@ -2460,8 +2547,13 @@ int osd_scrub_dump(struct osd_device *dev, char *buf, int len) "run_time: %u seconds\n" "average_speed: "LPU64" objects/sec\n" "real-time_speed: "LPU64" objects/sec\n" - "current_position: %u\n", - rtime, speed, new_checked, scrub->os_pos_current); + "current_position: %u\n" + "lf_scanned: "LPU64"\n" + "lf_reparied: "LPU64"\n" + "lf_failed: "LPU64"\n", + rtime, speed, new_checked, scrub->os_pos_current, + scrub->os_lf_scanned, scrub->os_lf_repaired, + scrub->os_lf_failed); } else { if (sf->sf_run_time != 0) do_div(speed, sf->sf_run_time); @@ -2469,8 +2561,12 @@ int osd_scrub_dump(struct osd_device *dev, char *buf, int len) "run_time: %u seconds\n" "average_speed: "LPU64" objects/sec\n" "real-time_speed: N/A\n" - "current_position: N/A\n", - sf->sf_run_time, speed); + "current_position: N/A\n" + "lf_scanned: "LPU64"\n" + "lf_reparied: "LPU64"\n" + "lf_failed: "LPU64"\n", + sf->sf_run_time, speed, scrub->os_lf_scanned, + scrub->os_lf_repaired, scrub->os_lf_failed); } if (rc <= 0) goto out; diff --git a/lustre/osd-ldiskfs/osd_scrub.h b/lustre/osd-ldiskfs/osd_scrub.h index 60a21a0..f5eb375 100644 --- a/lustre/osd-ldiskfs/osd_scrub.h +++ b/lustre/osd-ldiskfs/osd_scrub.h @@ -193,6 +193,16 @@ struct osd_scrub { /* The time for next checkpoint, jiffies */ cfs_time_t os_time_next_checkpoint; + /* statistics for /lost+found are in 
ram only, it will be reset + * when each time the device remount. */ + + /* How many objects have been scanned during initial OI scrub. */ + __u64 os_lf_scanned; + /* How many objects have been repaired during initial OI scrub. */ + __u64 os_lf_repaired; + /* How many objects failed to be processed during initial OI scrub. */ + __u64 os_lf_failed; + /* How many objects have been checked since last checkpoint. */ __u32 os_new_checked; __u32 os_pos_current; diff --git a/lustre/tests/sanity-scrub.sh b/lustre/tests/sanity-scrub.sh index f8bc0d6..894ae9f 100644 --- a/lustre/tests/sanity-scrub.sh +++ b/lustre/tests/sanity-scrub.sh @@ -32,6 +32,9 @@ check_and_setup_lustre [ $(facet_fstype $SINGLEMDS) != ldiskfs ] && skip "test OI scrub only for ldiskfs" && check_and_cleanup_lustre && exit 0 +[ $(facet_fstype ost1) != ldiskfs ] && + skip "test OI scrub only for ldiskfs" && check_and_cleanup_lustre && + exit 0 [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.2.90) ]] && skip "Need MDS version at least 2.2.90" && check_and_cleanup_lustre && exit 0 @@ -39,6 +42,9 @@ check_and_setup_lustre [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.3.90) ]] && ALWAYS_EXCEPT="$ALWAYS_EXCEPT 1a" +[[ $(lustre_version_code ost1) -lt $(version_code 2.4.50) ]] && + ALWAYS_EXCEPT="$ALWAYS_EXCEPT 11 12 13 14" + build_test_filter MDT_DEV="${FSNAME}-MDT0000" @@ -876,6 +882,52 @@ test_13() { } run_test 13 "OI scrub can rebuild missed /O entries" +test_14() { + echo "stopall" + stopall > /dev/null + echo "formatall" + formatall > /dev/null + echo "setupall" + setupall > /dev/null + + mkdir -p $DIR/$tdir + $SETSTRIPE -c 1 -i 0 $DIR/$tdir + + #define OBD_FAIL_OSD_COMPAT_NO_ENTRY 0x196 + do_facet ost1 $LCTL set_param fail_loc=0x196 + createmany -o $DIR/$tdir/f 64 + do_facet ost1 $LCTL set_param fail_loc=0 + + echo "stopall" + stopall > /dev/null + echo "setupall" + setupall > /dev/null + + local STATUS=$($SHOW_SCRUB_ON_OST | awk '/^status/ { print $2 }') + [ "$STATUS" == "init" ] || + 
error "(1) Expect 'init', but got '$STATUS'" + + ls -ail $DIR/$tdir > /dev/null 2>&1 && error "(2) ls should fail" + + echo "stopall" + stopall > /dev/null + + echo "run e2fsck" + run_e2fsck $(facet_host ost1) $(ostdevname 1) "-y" || + error "(3) Fail to run e2fsck error" + + echo "setupall" + setupall > /dev/null + + local LF_REPAIRED=$($SHOW_SCRUB_ON_OST | + awk '/^lf_reparied/ { print $2 }') + [ $LF_REPAIRED -gt 0 ] || + error "(4) Some entry under /lost+found should be repaired" + + ls -ail $DIR/$tdir > /dev/null 2>&1 || error "(5) ls should succeed" +} +run_test 14 "OI scrub can repair objects under lost+found" + # restore MDS/OST size MDSSIZE=${SAVED_MDSSIZE} OSTSIZE=${SAVED_OSTSIZE} -- 1.8.3.1