From 113b7cfe87be903a2398550fb2cd32b77430bcda Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 11 Sep 2012 15:07:30 +0800 Subject: [PATCH] LU-1881 oi: not shrink the last entry in OI index node Shrinking an OI index node to recycle the idle leaf for the last entry will cause subsequent lookup/insert ops to access invalid space. So just keep the last entry there, which can be reused directly by the next new node. Other fixes: 1) The recycled empty OI blocks should be recorded on the device and be re-loaded after the device is remounted. Then they can be reused when new OI blocks are needed. 2) No need to check iam_container::ic_idle_failed in iam_new_node(). 3) Clear iam_frame::at_shifted in iam_path_release(). Signed-off-by: Fan Yong Change-Id: I46611c208563a943a0980110b2c416186e6d1249 Reviewed-on: http://review.whamcloud.com/3931 Tested-by: Hudson Tested-by: Maloo Tested-by: Oleg Drokin Reviewed-by: wangdi Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/osd-ldiskfs/osd_handler.c | 12 ++++- lustre/osd-ldiskfs/osd_iam.c | 16 ++++-- lustre/osd-ldiskfs/osd_iam_lfix.c | 27 +++++----- lustre/osd-ldiskfs/osd_iam_lvar.c | 11 ++--- lustre/tests/sanity.sh | 100 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 136 insertions(+), 30 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 1c37bd9..cd74171 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1966,6 +1966,9 @@ static int osd_declare_object_create(const struct lu_env *env, if (fid_is_norm(lu_object_fid(&dt->do_lu))) { OSD_DECLARE_OP(oh, insert); oh->ot_credits += osd_dto_credits_noquota[DTO_INDEX_INSERT]; + /* Reuse idle OI block may cause additional one OI block + * to be changed. */ + oh->ot_credits += 1; } /* If this is directory, then we expect . and .. to be inserted as * well.
The one directory block always needs to be created for the @@ -2041,7 +2044,14 @@ static int osd_declare_object_destroy(const struct lu_env *env, OSD_DECLARE_OP(oh, destroy); OSD_DECLARE_OP(oh, delete); oh->ot_credits += osd_dto_credits_noquota[DTO_OBJECT_DELETE]; - oh->ot_credits += osd_dto_credits_noquota[DTO_INDEX_DELETE]; + /* XXX: So far, only normal fid needs to be inserted into the OI, + * so only normal fid needs to be removed from the OI also. */ + if (fid_is_norm(lu_object_fid(&dt->do_lu))) { + oh->ot_credits += osd_dto_credits_noquota[DTO_INDEX_DELETE]; + /* Recycle idle OI leaf may cause additional three OI blocks + * to be changed. */ + oh->ot_credits += 3; + } osd_declare_qid(dt, oh, USRQUOTA, inode->i_uid, inode); osd_declare_qid(dt, oh, GRPQUOTA, inode->i_gid, inode); diff --git a/lustre/osd-ldiskfs/osd_iam.c b/lustre/osd-ldiskfs/osd_iam.c index 2d3e80e..16f6089 100644 --- a/lustre/osd-ldiskfs/osd_iam.c +++ b/lustre/osd-ldiskfs/osd_iam.c @@ -304,6 +304,7 @@ void iam_path_release(struct iam_path *path) for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) { if (path->ip_frames[i].bh != NULL) { + path->ip_frames[i].at_shifted = 0; brelse(path->ip_frames[i].bh); path->ip_frames[i].bh = NULL; } @@ -1671,7 +1672,7 @@ iam_new_node(handle_t *h, struct iam_container *c, iam_ptr_t *b, int *e) goto newblock; cfs_down(&c->ic_idle_sem); - if (unlikely(c->ic_idle_failed || c->ic_idle_bh == NULL)) { + if (unlikely(c->ic_idle_bh == NULL)) { cfs_up(&c->ic_idle_sem); goto newblock; } @@ -2280,6 +2281,15 @@ static iam_ptr_t iam_index_shrink(handle_t *h, struct iam_path *p, return 0; } + entries = frame->entries; + count = dx_get_count(entries); + /* NOT shrink the last entry in the index node, which can be reused + * directly by next new node. 
*/ + if (count == 2) { + iam_unlock_htree(c, lh); + return 0; + } + rc = iam_txn_add(h, p, frame->bh); if (rc != 0) { iam_unlock_htree(c, lh); @@ -2287,8 +2297,6 @@ static iam_ptr_t iam_index_shrink(handle_t *h, struct iam_path *p, } iam_lock_bh(frame->bh); - entries = frame->entries; - count = dx_get_count(entries); if (frame->at < iam_entry_shift(p, entries, count - 1)) { struct iam_entry *n = iam_entry_shift(p, frame->at, 1); @@ -2297,8 +2305,8 @@ static iam_ptr_t iam_index_shrink(handle_t *h, struct iam_path *p, frame->at_shifted = 1; } dx_set_count(entries, count - 1); - rc = iam_txn_dirty(h, p, frame->bh); iam_unlock_bh(frame->bh); + rc = iam_txn_dirty(h, p, frame->bh); iam_unlock_htree(c, lh); if (rc != 0) return 0; diff --git a/lustre/osd-ldiskfs/osd_iam_lfix.c b/lustre/osd-ldiskfs/osd_iam_lfix.c index efea0b3..b36683f 100644 --- a/lustre/osd-ldiskfs/osd_iam_lfix.c +++ b/lustre/osd-ldiskfs/osd_iam_lfix.c @@ -799,26 +799,25 @@ static void lfix_root(void *buf, LASSERT((keysize + ptrsize) >= (sizeof(struct dx_countlimit) + sizeof(__u32))); - entry = limit + 1; + entry = (void *)(limit + 1); /* Put "idle_blocks" just after the limit. There was padding after * the limit, the "idle_blocks" re-uses part of the padding, so no * compatibility issues with old layout. */ *(__u32 *)entry = 0; - entry = root + 1; - /* - * Skip over @limit. - */ - entry += keysize + ptrsize; - - /* - * Entry format is followed by . In the minimal tree - * consisting of a root and single node, is a minimal possible - * key. - * - * XXX: this key is hard-coded to be a sequence of 0's. - */ + /* + * Skip over @limit. + */ + entry = (void *)(root + 1) + keysize + ptrsize; + + /* + * Entry format is followed by . In the minimal tree + * consisting of a root and single node, is a minimal possible + * key. + * + * XXX: this key is hard-coded to be a sequence of 0's. 
+ */ memset(entry, 0, keysize); entry += keysize; diff --git a/lustre/osd-ldiskfs/osd_iam_lvar.c b/lustre/osd-ldiskfs/osd_iam_lvar.c index 9fc9fd7..4444f835 100644 --- a/lustre/osd-ldiskfs/osd_iam_lvar.c +++ b/lustre/osd-ldiskfs/osd_iam_lvar.c @@ -949,18 +949,17 @@ static void lvar_root(void *buf, LASSERT((keysize + ptrsize) >= (sizeof(struct dx_countlimit) + sizeof(__u32))); - entry = limit + 1; + entry = (void *)(limit + 1); /* Put "idle_blocks" just after the limit. There was padding after * the limit, the "idle_blocks" re-uses part of the padding, so no * compatibility issues with old layout. */ *(__u32 *)entry = 0; - entry = root + 1; - /* - * Skip over @limit. - */ - entry += isize; + /* + * Skip over @limit. + */ + entry = (void *)(root + 1) + isize; /* * Entry format is followed by . In the minimal tree diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 6bf9e08..da388b4 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -5855,9 +5855,9 @@ test_115() { cut -c11-20) # don't return an error - [ $OSTIO_post -eq $OSTIO_pre ] && echo \ - "WARNING: No new ll_ost_io threads were created ($OSTIO_pre)" &&\ - echo "This may be fine, depending on what ran before this test" &&\ + [ $OSTIO_post == $OSTIO_pre ] && echo \ + "WARNING: No new ll_ost_io threads were created ($OSTIO_pre)" && + echo "This may be fine, depending on what ran before this test" && echo "and how fast this system is." && return echo "Started with $OSTIO_pre threads, ended with $OSTIO_post" @@ -9609,7 +9609,7 @@ test_227() { run_test 227 "running truncated executable does not cause OOM" # LU-1512 try to reuse idle OI blocks -test_228() { +test_228a() { [ "$FSTYPE" != "ldiskfs" ] && skip "non-ldiskfs backend" && return local MDT_DEV=$(mdsdevname ${SINGLEMDS//mds/}) @@ -9646,7 +9646,97 @@ test_228() { [ $blk1 == $blk2 ] || error "old blk1=$blk1, new blk2=$blk2, unmatched!" 
} -run_test 228 "try to reuse idle OI blocks" +run_test 228a "try to reuse idle OI blocks" + +test_228b() { + [ "$FSTYPE" != "ldiskfs" ] && skip "non-ldiskfs backend" && return + + local MDT_DEV=$(mdsdevname ${SINGLEMDS//mds/}) + local myDIR=$DIR/$tdir + + mkdir -p $myDIR + #define OBD_FAIL_SEQ_EXHAUST 0x1002 + $LCTL set_param fail_loc=0x80001002 + createmany -o $myDIR/t- 10000 + $LCTL set_param fail_loc=0 + # The guard is current the largest FID holder + touch $myDIR/guard + local SEQ=$($LFS path2fid $myDIR/guard | awk -F ':' '{print $1}' | + tr -d '[') + local IDX=$(($SEQ % 64)) + + do_facet $SINGLEMDS sync + # Make sure journal flushed. + sleep 6 + local blk1=$(do_facet $SINGLEMDS \ + "$DEBUGFS -c -R \\\"stat oi.16.${IDX}\\\" $MDT_DEV" | + grep Blockcount | awk '{print $4}') + + # Remove old files, some OI blocks will become idle. + unlinkmany $myDIR/t- 10000 + + # stop the MDT + stop $SINGLEMDS || error "Fail to stop MDT." + # remount the MDT + start $SINGLEMDS $MDT_DEV $MDS_MOUNT_OPTS || error "Fail to start MDT." + + df $MOUNT || error "Fail to df." + # Create new files, idle OI blocks should be reused. + createmany -o $myDIR/t- 2000 + do_facet $SINGLEMDS sync + # Make sure journal flushed. + sleep 6 + local blk2=$(do_facet $SINGLEMDS \ + "$DEBUGFS -c -R \\\"stat oi.16.${IDX}\\\" $MDT_DEV" | + grep Blockcount | awk '{print $4}') + + [ $blk1 == $blk2 ] || error "old blk1=$blk1, new blk2=$blk2, unmatched!" 
+} +run_test 228b "idle OI blocks can be reused after MDT restart" + +#LU-1881 +test_228c() { + [ "$FSTYPE" != "ldiskfs" ] && skip "non-ldiskfs backend" && return + + local MDT_DEV=$(mdsdevname ${SINGLEMDS//mds/}) + local myDIR=$DIR/$tdir + + mkdir -p $myDIR + #define OBD_FAIL_SEQ_EXHAUST 0x1002 + $LCTL set_param fail_loc=0x80001002 + # 20000 files can guarantee there are index nodes in the OI file + createmany -o $myDIR/t- 20000 + $LCTL set_param fail_loc=0 + # The guard is current the largest FID holder + touch $myDIR/guard + local SEQ=$($LFS path2fid $myDIR/guard | awk -F ':' '{print $1}' | + tr -d '[') + local IDX=$(($SEQ % 64)) + + do_facet $SINGLEMDS sync + # Make sure journal flushed. + sleep 6 + local blk1=$(do_facet $SINGLEMDS \ + "$DEBUGFS -c -R \\\"stat oi.16.${IDX}\\\" $MDT_DEV" | + grep Blockcount | awk '{print $4}') + + # Remove old files, some OI blocks will become idle. + unlinkmany $myDIR/t- 20000 + rm -f $myDIR/guard + # The OI file should become empty now + + # Create new files, idle OI blocks should be reused. + createmany -o $myDIR/t- 2000 + do_facet $SINGLEMDS sync + # Make sure journal flushed. + sleep 6 + local blk2=$(do_facet $SINGLEMDS \ + "$DEBUGFS -c -R \\\"stat oi.16.${IDX}\\\" $MDT_DEV" | + grep Blockcount | awk '{print $4}') + + [ $blk1 == $blk2 ] || error "old blk1=$blk1, new blk2=$blk2, unmatched!" +} +run_test 228c "NOT shrink the last entry in OI index node to recycle idle leaf" # # tests that do cleanup/setup should be run at the end -- 1.8.3.1