Whamcloud - gitweb
Branch: b1_8
author wangdi <wangdi>
Fri, 6 Mar 2009 07:11:41 +0000 (07:11 +0000)
committer wangdi <wangdi>
Fri, 6 Mar 2009 07:11:41 +0000 (07:11 +0000)
b=17817
Make read-ahead stripe size aligned.
i=Andreas,Johann

lustre/ChangeLog
lustre/include/obd.h
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/rw.c
lustre/lov/lov_obd.c
lustre/osc/osc_request.c
lustre/tests/sanity.sh

index 3846142..b5ed7f8 100644 (file)
@@ -32,6 +32,10 @@ tbd Sun Microsystems, Inc.
          more information, please refer to bugzilla 17630.
 
 Severity   : enhancement
+Bugzilla   : 17817 
+Description: Make read-ahead stripe size aligned. 
+
+Severity   : enhancement
 Bugzilla   : 17536
 Description: MDS create should not wait for statfs RPC while holding DLM lock.
 
index 7f31c6c..68c84e0 100644 (file)
@@ -1060,6 +1060,7 @@ enum obd_cleanup_stage {
 #define KEY_ASYNC               "async"
 #define KEY_CAPA_KEY            "capa_key"
 #define KEY_GRANT_SHRINK        "grant_shrink"
+#define KEY_OFF_RPCSIZE                "off_rpcsize"
 
 struct obd_ops {
         struct module *o_owner;
@@ -1309,8 +1310,14 @@ static inline struct lsm_operations *lsm_op_find(int magic)
 int lvfs_check_io_health(struct obd_device *obd, struct file *file);
 
 /* Requests for obd_extent_calc() */
-#define OBD_CALC_STRIPE_START   1
-#define OBD_CALC_STRIPE_END     2
+/*
+ * The request is a bit mask.  OBD_CALC_STRIPE_START / OBD_CALC_STRIPE_END
+ * select which boundary of the chunk containing the offset is returned
+ * (lov_extent_calc() tests the END bit first).  OBD_CALC_STRIPE_RPC_ALIGN
+ * may be OR-ed in to clamp the chunk size to PTLRPC_MAX_BRW_SIZE.
+ */
+#define OBD_CALC_STRIPE_START                 0x0001
+#define OBD_CALC_STRIPE_END                   0x0010
+#define OBD_CALC_STRIPE_RPC_ALIGN      0x0100
+
+#define OBD_CALC_STRIPE_RPC_START_ALIGN (OBD_CALC_STRIPE_START | \
+                                        OBD_CALC_STRIPE_RPC_ALIGN)
+/* Must carry the END bit: ll_readahead() uses this request to round the
+ * end of a read up to the last byte of its RPC-aligned chunk. */
+#define OBD_CALC_STRIPE_RPC_END_ALIGN (OBD_CALC_STRIPE_END | \
+                                      OBD_CALC_STRIPE_RPC_ALIGN)
 
 static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno,
                                          struct obd_export *exp, int error)
index a315983..d9f1840 100644 (file)
@@ -1588,9 +1588,7 @@ repeat:
                 /* initialize read-ahead window once per syscall */
                 if (ra == 0) {
                         ra = 1;
-                        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
-                        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-                        ll_ra_read_in(file, &bead);
+                        ll_ra_read_init(file, &bead, *ppos, count);
                 }
 
                 /* BUG: 5972 */
@@ -1946,9 +1944,7 @@ static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,
         CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
                inode->i_ino, count, *ppos, i_size_read(inode));
 
-        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
-        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-        ll_ra_read_in(in_file, &bead);
+        ll_ra_read_init(in_file, &bead, *ppos, count);
         /* BUG: 5972 */
         file_accessed(in_file);
         rc = generic_file_sendfile(in_file, ppos, count, actor, target);
@@ -2047,9 +2043,7 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
         CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
                inode->i_ino, count, *ppos, i_size_read(inode));
 
-        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
-        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-        ll_ra_read_in(in_file, &bead);
+        ll_ra_read_init(in_file, &bead, *ppos, count);
         /* BUG: 5972 */
         file_accessed(in_file);
         rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
index 60d5256..4e8748e 100644 (file)
@@ -650,7 +650,8 @@ struct cache_definition {
 #define ll_unregister_cache(cache) do {} while (0)
 #endif
 
-void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, 
+                     loff_t offset, size_t count);
 void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
 struct ll_ra_read *ll_ra_read_get(struct file *f);
 
index 22e661e..87fecc5 100644 (file)
@@ -1327,12 +1327,16 @@ static struct ll_readahead_state *ll_ras_get(struct file *f)
         return &fd->fd_ras;
 }
 
-void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, 
+                     loff_t offset, size_t count)
 {
         struct ll_readahead_state *ras;
 
         ras = ll_ras_get(f);
 
+        rar->lrr_start = offset >> CFS_PAGE_SHIFT;
+        rar->lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+
         spin_lock(&ras->ras_lock);
         ras->ras_requests++;
         ras->ras_request_index = 0;
@@ -1462,7 +1466,7 @@ struct ra_io_arg {
         ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
         ria->ria_pages)
 
-#define RAS_INCREASE_STEP (1024 * 1024 >> CFS_PAGE_SHIFT)
+#define INIT_RAS_WINDOW_PAGES PTLRPC_MAX_BRW_PAGES
 
 static inline int stride_io_mode(struct ll_readahead_state *ras)
 {
@@ -1603,7 +1607,11 @@ static int ll_readahead(struct ll_readahead_state *ras,
         /* Enlarge the RA window to encompass the full read */
         if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
             bead->lrr_start + bead->lrr_count) {
-                ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+                obd_off read_end = (bead->lrr_start + bead->lrr_count) << 
+                                    CFS_PAGE_SHIFT;
+                obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, 
+                                &read_end);
+                ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - 
                                       ras->ras_window_start;
         }
                /* Reserve a part of the read-ahead window that we'll be issuing */
@@ -1675,7 +1683,7 @@ static int ll_readahead(struct ll_readahead_state *ras,
 
 static void ras_set_start(struct ll_readahead_state *ras, unsigned long index)
 {
-        ras->ras_window_start = index & (~(RAS_INCREASE_STEP - 1));
+        ras->ras_window_start = index & (~(INIT_RAS_WINDOW_PAGES - 1));
 }
 
 /* called with the ras_lock held or from places where it doesn't matter */
@@ -1806,6 +1814,30 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras)
         RAS_CDEBUG(ras);
 }
 
+/*
+ * Grow the read-ahead window of @ras by one step.
+ *
+ * The step is obtained by querying KEY_OFF_RPCSIZE for the byte offset at
+ * the current end of the window; the OSC handler for that key answers with
+ * cl_max_pages_per_rpc.  If the query fails, INIT_RAS_WINDOW_PAGES is used
+ * instead.  In stride-IO mode the growth is delegated to
+ * ras_stride_increase_window(); otherwise the window length is increased
+ * by the step and capped at ra->ra_max_pages.
+ */
+static void ras_increase_window(struct ll_readahead_state *ras,
+                                struct ll_ra_info *ra, struct inode *inode)
+{
+        /* In: byte offset of the window end.  Out: step size in pages. */
+        __u64 val = ((loff_t)(ras->ras_window_start +
+                              ras->ras_window_len)) << CFS_PAGE_SHIFT;
+        __u32 vallen = sizeof(val);
+        unsigned long pages;
+
+        /* A non-zero return means the lookup failed: use the default step. */
+        if (obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE),
+                         KEY_OFF_RPCSIZE, &vallen, &val,
+                         ll_i2info(inode)->lli_smd))
+                val = INIT_RAS_WINDOW_PAGES;
+
+        pages = (unsigned long)val;
+
+        if (stride_io_mode(ras)) {
+                ras_stride_increase_window(ras, ra, pages);
+                return;
+        }
+
+        ras->ras_window_len = min(ras->ras_window_len + pages,
+                                  ra->ra_max_pages);
+}
+
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                        struct ll_readahead_state *ras, unsigned long index,
                        unsigned hit)
@@ -1912,7 +1944,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
         /* Trigger RA in the mmap case where ras_consecutive_requests
          * is not incremented and thus can't be used to trigger RA */
         if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
-                ras->ras_window_len = RAS_INCREASE_STEP;
+                ras->ras_window_len = INIT_RAS_WINDOW_PAGES;
                 GOTO(out_unlock, 0);
         }
 
@@ -1924,14 +1956,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
          * uselessly reading and discarding pages for random IO the window is
          * only increased once per consecutive request received. */
         if ((ras->ras_consecutive_requests > 1 &&
-            !ras->ras_request_index) || stride_detect) {
-                if (stride_io_mode(ras))
-                        ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP);
-                else
-                        ras->ras_window_len = min(ras->ras_window_len +
-                                                  RAS_INCREASE_STEP,
-                                                  ra->ra_max_pages);
-        }
+            !ras->ras_request_index) || stride_detect) 
+               ras_increase_window(ras, ra, inode); 
         EXIT;
 out_unlock:
         RAS_CDEBUG(ras);
index 45ddf6e..276de7d 100644 (file)
@@ -2914,7 +2914,21 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
         } else if (KEY_IS(KEY_FIEMAP)) {
                 rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
                 GOTO(out, rc);
-        }
+        } else if (KEY_IS(KEY_OFF_RPCSIZE)) {
+               __u64 *offset = val;
+                struct lov_tgt_desc *tgt;
+                struct lov_oinfo *loi;
+               int stripe;
+
+               LASSERT(*vallen == sizeof(__u64));
+               stripe = lov_stripe_number(lsm, *offset); 
+               loi = lsm->lsm_oinfo[stripe];
+               tgt = lov->lov_tgts[loi->loi_ost_idx];
+                if (!tgt || !tgt->ltd_active)
+                        GOTO(out, rc = -ESRCH);
+               rc = obd_get_info(tgt->ltd_exp, keylen, key, vallen, val, NULL);
+               GOTO(out, rc);  
+       }
 
         rc = -EINVAL;
 out:
@@ -3035,19 +3049,22 @@ static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm,
         __u64 start;
         __u32 ssize  = lsm->lsm_stripe_size;
 
+        if (cmd & OBD_CALC_STRIPE_RPC_ALIGN)
+                ssize = ssize > PTLRPC_MAX_BRW_SIZE ? 
+                        PTLRPC_MAX_BRW_SIZE : ssize;
+
         start = *offset;
         do_div(start, ssize);
         start = start * ssize;
 
         CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64
                ", end "LPU64"\n", *offset, ssize, start, start + ssize - 1);
-        if (cmd == OBD_CALC_STRIPE_END) {
+        if (cmd & OBD_CALC_STRIPE_END) 
                 *offset = start + ssize - 1;
-        } else if (cmd == OBD_CALC_STRIPE_START) {
+        else if (cmd & OBD_CALC_STRIPE_START)
                 *offset = start;
-        } else {
+        else 
                 LBUG();
-        }
 
         RETURN(0);
 }
index 57a0586..16aca44 100644 (file)
@@ -3737,7 +3737,13 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                 *vallen = sizeof(*stripe);
                 *stripe = 0;
                 RETURN(0);
-        } else if (KEY_IS(KEY_LAST_ID)) {
+        } else if (KEY_IS(KEY_OFF_RPCSIZE)) {
+               struct client_obd *cli = &exp->exp_obd->u.cli;
+               __u64 *rpcsize = val;
+               LASSERT(*vallen == sizeof(__u64));
+               *rpcsize = (__u64)cli->cl_max_pages_per_rpc;    
+               RETURN(0);
+       } else if (KEY_IS(KEY_LAST_ID)) {
                 struct ptlrpc_request *req;
                 obd_id *reply;
                 char *bufs[2] = { NULL, key };
index d963031..da3c9ef 100644 (file)
@@ -3730,7 +3730,7 @@ setup_101b() {
        STRIPE_COUNT=$OSTCOUNT
        STRIPE_OFFSET=0
 
-       trap cleanup_101b EXIT
+       trap cleanup_101 EXIT
        # prepare the read-ahead file
        $SETSTRIPE $DIR/$tfile -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $OSTCOUNT
 
@@ -3738,7 +3738,7 @@ setup_101b() {
        SETUP_TEST101b=yes
 }
 
-cleanup_101b() {
+cleanup_101() {
        trap 0
        rm -rf $DIR/$tdir $DIR/$tfile
        SETUP_TEST101b=no
@@ -3788,10 +3788,58 @@ test_101b() {
                cancel_lru_locks osc
                ra_check_101b $BSIZE
        done
-       cleanup_101b
        true
 }
 run_test 101b "check stride-io mode read-ahead ================="
+  
+# Check that sequential 64k reads are served by read-ahead RPCs of at
+# least 64k: after the read pass, every OSC with a populated rpc_stats
+# file must show no 4k-32k read RPCs and at least one RPC in each of the
+# 64k-1024k buckets.
+test_101c() {
+        local STRIPE_SIZE=1048576
+        local FILE_LENGTH=$((STRIPE_SIZE*100))
+        local nreads=10000
+
+        setup_test101
+
+        cancel_lru_locks osc
+        $LCTL set_param osc.*.rpc_stats 0
+        $READS -f $DIR/$tfile -s$FILE_LENGTH -b65536 -n$nreads -t 180
+        for OSC in `$LCTL  get_param -N osc.*`
+        do
+                if [ "$OSC" == "osc.num_refs" ]; then
+                        continue
+                fi
+                # skip OSCs whose rpc_stats holds 20 lines or fewer
+                lines=`$LCTL get_param -n ${OSC}.rpc_stats | wc | awk '{print $1}'`
+                if [ $lines -le 20 ]; then
+                        continue
+                fi
+
+                # NOTE(review): the lookups below query $OSC, while the line
+                # count above queries ${OSC}.rpc_stats - confirm that $OSC
+                # alone yields the histogram, otherwise the checks see
+                # empty values.
+                rpc4k=$($LCTL get_param -n $OSC | awk '$1 == "1:" { print $2; exit; }')
+                rpc8k=$($LCTL get_param -n $OSC | awk '$1 == "2:" { print $2; exit; }')
+                rpc16k=$($LCTL get_param -n $OSC | awk '$1 == "4:" { print $2; exit; }')
+                rpc32k=$($LCTL get_param -n $OSC | awk '$1 == "8:" { print $2; exit; }')
+
+                [ $rpc4k != 0 ]  && error "Small 4k read IO ${rpc4k}!"
+                [ $rpc8k != 0 ]  && error "Small 8k read IO ${rpc8k}!"
+                [ $rpc16k != 0 ] && error "Small 16k read IO ${rpc16k}!"
+                [ $rpc32k != 0 ] && error "Small 32k read IO ${rpc32k}!"
+
+                echo "Small rpc check passed!"
+                rpc64k=$($LCTL get_param -n $OSC | awk '$1 == "16:" { print $2; exit; }')
+                rpc128k=$($LCTL get_param -n $OSC | awk '$1 == "32:" { print $2; exit; }')
+                rpc256k=$($LCTL get_param -n $OSC | awk '$1 == "64:" { print $2; exit; }')
+                rpc512k=$($LCTL get_param -n $OSC | awk '$1 == "128:" { print $2; exit; }')
+                rpc1024k=$($LCTL get_param -n $OSC | awk '$1 == "256:" { print $2; exit; }')
+
+                [ $rpc64k == 0 ]   && error "No 64k readahead IO ${rpc64k}"
+                [ $rpc128k == 0 ]  && error "No 128k readahead IO ${rpc128k}"
+                [ $rpc256k == 0 ]  && error "No 256k readahead IO ${rpc256k}"
+                [ $rpc512k == 0 ]  && error "No 512k readahead IO ${rpc512k}"
+                [ $rpc1024k == 0 ] && error "No 1024k readahead IO ${rpc1024k}"
+                echo "Big rpc check passed!"
+        done
+        cleanup_101
+        true
+}
+run_test 101c "check stripe_size aligned read-ahead ================="
 
 export SETUP_TEST102=no
 setup_test102() {