From 8a259ad4cec27b64a4d5a95e681d8178513fd3ca Mon Sep 17 00:00:00 2001 From: wangdi Date: Fri, 6 Mar 2009 07:11:41 +0000 Subject: [PATCH] Branch: b1_8 b=17817 Make read-ahead stripe size aligned. i=Andreas,Johann --- lustre/ChangeLog | 4 ++++ lustre/include/obd.h | 11 +++++++-- lustre/llite/file.c | 12 +++------- lustre/llite/llite_internal.h | 3 ++- lustre/llite/rw.c | 52 ++++++++++++++++++++++++++++++----------- lustre/lov/lov_obd.c | 27 ++++++++++++++++++---- lustre/osc/osc_request.c | 8 ++++++- lustre/tests/sanity.sh | 54 ++++++++++++++++++++++++++++++++++++++++--- 8 files changed, 137 insertions(+), 34 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 3846142..b5ed7f8 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -32,6 +32,10 @@ tbd Sun Microsystems, Inc. more information, please refer to bugzilla 17630. Severity : enhancement +Bugzilla : 17817 +Description: Make read-ahead stripe size aligned. + +Severity : enhancement Bugzilla : 17536 Description: MDS create should not wait for statfs RPC while holding DLM lock. 
diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 7f31c6c..68c84e0 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1060,6 +1060,7 @@ enum obd_cleanup_stage { #define KEY_ASYNC "async" #define KEY_CAPA_KEY "capa_key" #define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_OFF_RPCSIZE "off_rpcsize" struct obd_ops { struct module *o_owner; @@ -1309,8 +1310,14 @@ static inline struct lsm_operations *lsm_op_find(int magic) int lvfs_check_io_health(struct obd_device *obd, struct file *file); /* Requests for obd_extent_calc() */ -#define OBD_CALC_STRIPE_START 1 -#define OBD_CALC_STRIPE_END 2 +#define OBD_CALC_STRIPE_START 0x0001 +#define OBD_CALC_STRIPE_END 0x0010 +#define OBD_CALC_STRIPE_RPC_ALIGN 0x0100 + +#define OBD_CALC_STRIPE_RPC_START_ALIGN (OBD_CALC_STRIPE_START | \ + OBD_CALC_STRIPE_RPC_ALIGN) +#define OBD_CALC_STRIPE_RPC_END_ALIGN (OBD_CALC_STRIPE_END | \ + OBD_CALC_STRIPE_RPC_ALIGN) static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, struct obd_export *exp, int error) diff --git a/lustre/llite/file.c b/lustre/llite/file.c index a315983..d9f1840 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1588,9 +1588,7 @@ repeat: /* initialize read-ahead window once per syscall */ if (ra == 0) { ra = 1; - bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; - bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; - ll_ra_read_in(file, &bead); + ll_ra_read_init(file, &bead, *ppos, count); } /* BUG: 5972 */ @@ -1946,9 +1944,7 @@ static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos, CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n", inode->i_ino, count, *ppos, i_size_read(inode)); - bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; - bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; - ll_ra_read_in(in_file, &bead); + ll_ra_read_init(in_file, &bead, *ppos, count); /* BUG: 5972 */ file_accessed(in_file); rc = generic_file_sendfile(in_file, ppos, count, actor, target); 
@@ -2047,9 +2043,7 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n", inode->i_ino, count, *ppos, i_size_read(inode)); - bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; - bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; - ll_ra_read_in(in_file, &bead); + ll_ra_read_init(in_file, &bead, *ppos, count); /* BUG: 5972 */ file_accessed(in_file); rc = generic_file_splice_read(in_file, ppos, pipe, count, flags); diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 60d5256..4e8748e 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -650,7 +650,8 @@ struct cache_definition { #define ll_unregister_cache(cache) do {} while (0) #endif -void ll_ra_read_in(struct file *f, struct ll_ra_read *rar); +void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, + loff_t offset, size_t count); void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar); struct ll_ra_read *ll_ra_read_get(struct file *f); diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 22e661e..87fecc5 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1327,12 +1327,16 @@ static struct ll_readahead_state *ll_ras_get(struct file *f) return &fd->fd_ras; } -void ll_ra_read_in(struct file *f, struct ll_ra_read *rar) +void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, + loff_t offset, size_t count) { struct ll_readahead_state *ras; ras = ll_ras_get(f); + rar->lrr_start = offset >> CFS_PAGE_SHIFT; + rar->lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; + spin_lock(&ras->ras_lock); ras->ras_requests++; ras->ras_request_index = 0; @@ -1462,7 +1466,7 @@ struct ra_io_arg { ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ ria->ria_pages) -#define RAS_INCREASE_STEP (1024 * 1024 >> CFS_PAGE_SHIFT) +#define INIT_RAS_WINDOW_PAGES PTLRPC_MAX_BRW_PAGES static inline int stride_io_mode(struct ll_readahead_state *ras) { @@ 
-1603,7 +1607,11 @@ static int ll_readahead(struct ll_readahead_state *ras, /* Enlarge the RA window to encompass the full read */ if (bead != NULL && ras->ras_window_start + ras->ras_window_len < bead->lrr_start + bead->lrr_count) { - ras->ras_window_len = bead->lrr_start + bead->lrr_count - + obd_off read_end = ((obd_off)(bead->lrr_start + bead->lrr_count)) << + CFS_PAGE_SHIFT; + obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, + &read_end); + ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - ras->ras_window_start; } /* Reserve a part of the read-ahead window that we'll be issuing */ @@ -1675,7 +1683,7 @@ static int ll_readahead(struct ll_readahead_state *ras, static void ras_set_start(struct ll_readahead_state *ras, unsigned long index) { - ras->ras_window_start = index & (~(RAS_INCREASE_STEP - 1)); + ras->ras_window_start = index & (~(INIT_RAS_WINDOW_PAGES - 1)); } /* called with the ras_lock held or from places where it doesn't matter */ @@ -1806,6 +1814,30 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras) RAS_CDEBUG(ras); } +static void ras_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, struct inode *inode) +{ + __u64 step; + __u32 size; + int rc; + + step = ((loff_t)(ras->ras_window_start + + ras->ras_window_len)) << CFS_PAGE_SHIFT; + size = sizeof(step); + /*Get rpc_size for this offset (step) */ + rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE), + KEY_OFF_RPCSIZE, &size, &step, + ll_i2info(inode)->lli_smd); + if (rc) + step = INIT_RAS_WINDOW_PAGES; + + if (stride_io_mode(ras)) + ras_stride_increase_window(ras, ra, (unsigned long)step); + else + ras->ras_window_len = min(ras->ras_window_len + (unsigned long)step, + ra->ra_max_pages); +} + static void ras_update(struct ll_sb_info *sbi, struct inode *inode, struct ll_readahead_state *ras, unsigned long index, unsigned hit) @@ -1912,7 +1944,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, /* Trigger RA in the mmap case 
where ras_consecutive_requests * is not incremented and thus can't be used to trigger RA */ if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) { - ras->ras_window_len = RAS_INCREASE_STEP; + ras->ras_window_len = INIT_RAS_WINDOW_PAGES; GOTO(out_unlock, 0); } @@ -1924,14 +1956,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, * uselessly reading and discarding pages for random IO the window is * only increased once per consecutive request received. */ if ((ras->ras_consecutive_requests > 1 && - !ras->ras_request_index) || stride_detect) { - if (stride_io_mode(ras)) - ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP); - else - ras->ras_window_len = min(ras->ras_window_len + - RAS_INCREASE_STEP, - ra->ra_max_pages); - } + !ras->ras_request_index) || stride_detect) + ras_increase_window(ras, ra, inode); EXIT; out_unlock: RAS_CDEBUG(ras); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 45ddf6e..276de7d 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -2914,7 +2914,21 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, } else if (KEY_IS(KEY_FIEMAP)) { rc = lov_fiemap(lov, keylen, key, vallen, val, lsm); GOTO(out, rc); - } + } else if (KEY_IS(KEY_OFF_RPCSIZE)) { + __u64 *offset = val; + struct lov_tgt_desc *tgt; + struct lov_oinfo *loi; + int stripe; + + LASSERT(*vallen == sizeof(__u64)); + stripe = lov_stripe_number(lsm, *offset); + loi = lsm->lsm_oinfo[stripe]; + tgt = lov->lov_tgts[loi->loi_ost_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out, rc = -ESRCH); + rc = obd_get_info(tgt->ltd_exp, keylen, key, vallen, val, NULL); + GOTO(out, rc); + } rc = -EINVAL; out: @@ -3035,19 +3049,22 @@ static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm, __u64 start; __u32 ssize = lsm->lsm_stripe_size; + if (cmd & OBD_CALC_STRIPE_RPC_ALIGN) + ssize = ssize > PTLRPC_MAX_BRW_SIZE ? 
+ PTLRPC_MAX_BRW_SIZE : ssize; + start = *offset; do_div(start, ssize); start = start * ssize; CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64 ", end "LPU64"\n", *offset, ssize, start, start + ssize - 1); - if (cmd == OBD_CALC_STRIPE_END) { + if (cmd & OBD_CALC_STRIPE_END) *offset = start + ssize - 1; - } else if (cmd == OBD_CALC_STRIPE_START) { + else if (cmd & OBD_CALC_STRIPE_START) *offset = start; - } else { + else LBUG(); - } RETURN(0); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 57a0586..16aca44 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3737,7 +3737,13 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen, *vallen = sizeof(*stripe); *stripe = 0; RETURN(0); - } else if (KEY_IS(KEY_LAST_ID)) { + } else if (KEY_IS(KEY_OFF_RPCSIZE)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + __u64 *rpcsize = val; + LASSERT(*vallen == sizeof(__u64)); + *rpcsize = (__u64)cli->cl_max_pages_per_rpc; + RETURN(0); + } else if (KEY_IS(KEY_LAST_ID)) { struct ptlrpc_request *req; obd_id *reply; char *bufs[2] = { NULL, key }; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index d963031..da3c9ef 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3730,7 +3730,7 @@ setup_101b() { STRIPE_COUNT=$OSTCOUNT STRIPE_OFFSET=0 - trap cleanup_101b EXIT + trap cleanup_101 EXIT # prepare the read-ahead file $SETSTRIPE $DIR/$tfile -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $OSTCOUNT @@ -3738,7 +3738,7 @@ setup_101b() { SETUP_TEST101b=yes } -cleanup_101b() { +cleanup_101() { trap 0 rm -rf $DIR/$tdir $DIR/$tfile SETUP_TEST101b=no @@ -3788,10 +3788,58 @@ test_101b() { cancel_lru_locks osc ra_check_101b $BSIZE done - cleanup_101b true } run_test 101b "check stride-io mode read-ahead =================" + +test_101c() { + local STRIPE_SIZE=1048576 + local FILE_LENGTH=$((STRIPE_SIZE*100)) + local nreads=10000 + + setup_test101 + + cancel_lru_locks osc + $LCTL set_param osc.*.rpc_stats 0 + 
$READS -f $DIR/$tfile -s$FILE_LENGTH -b65536 -n$nreads -t 180 + for OSC in `$LCTL get_param -N osc.*` + do + if [ "$OSC" == "osc.num_refs" ]; then + continue + fi + lines=`$LCTL get_param -n ${OSC}.rpc_stats | wc | awk '{print $1}'` + if [ $lines -le 20 ]; then + continue + fi + + rpc4k=$($LCTL get_param -n $OSC | awk '$1 == "1:" { print $2; exit; }') + rpc8k=$($LCTL get_param -n $OSC | awk '$1 == "2:" { print $2; exit; }') + rpc16k=$($LCTL get_param -n $OSC | awk '$1 == "4:" { print $2; exit; }') + rpc32k=$($LCTL get_param -n $OSC | awk '$1 == "8:" { print $2; exit; }') + + [ $rpc4k != 0 ] && error "Small 4k read IO ${rpc4k}!" + [ $rpc8k != 0 ] && error "Small 8k read IO ${rpc8k}!" + [ $rpc16k != 0 ] && error "Small 16k read IO ${rpc16k}!" + [ $rpc32k != 0 ] && error "Small 32k read IO ${rpc32k}!" + + echo "Small rpc check passed!" + rpc64k=$($LCTL get_param -n $OSC | awk '$1 == "16:" { print $2; exit; }') + rpc128k=$($LCTL get_param -n $OSC | awk '$1 == "32:" { print $2; exit; }') + rpc256k=$($LCTL get_param -n $OSC | awk '$1 == "64:" { print $2; exit; }') + rpc512k=$($LCTL get_param -n $OSC | awk '$1 == "128:" { print $2; exit; }') + rpc1024k=$($LCTL get_param -n $OSC | awk '$1 == "256:" { print $2; exit; }') + + [ $rpc64k == 0 ] && error "No 64k readahead IO ${rpc64k}" + [ $rpc128k == 0 ] && error "No 128k readahead IO ${rpc128k}" + [ $rpc256k == 0 ] && error "No 256k readahead IO ${rpc256k}" + [ $rpc512k == 0 ] && error "No 512k readahead IO ${rpc512k}" + [ $rpc1024k == 0 ] && error "No 1024k readahead IO ${rpc1024k}" + echo "Big rpc check passed!" + done + cleanup_101 + true +} +run_test 101c "check stripe_size aligned read-ahead =================" export SETUP_TEST102=no setup_test102() { -- 1.8.3.1