Whamcloud - gitweb
Land changes to the read-ahead algorithm improving its behavior for random reads:
author: nikita <nikita>
Tue, 20 Sep 2005 13:24:54 +0000 (13:24 +0000)
committer: nikita <nikita>
Tue, 20 Sep 2005 13:24:54 +0000 (13:24 +0000)

 - always try to read-ahead at least the file region that will be read by the
   read(2) call.

 - try to detect random reads, and avoid excessive read-ahead in that case.

b=6252
r=adilger

lustre/ChangeLog
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/rw.c

index 0f88382..d9ba5d0 100644 (file)
@@ -2,6 +2,15 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.4.6
        * bug fixes
 
+Severity   : enhancement
+Frequency  : rare
+Bugzilla   : 6252
+Description: Improve read-ahead algorithm to avoid excessive IO for random reads
+Details    : Existing read-ahead algorithm is tuned for the case of streamlined
+            sequential reads and behaves badly with applications doing random
+            reads. Improve it by 1. reading ahead at least the read region, and
+            2. avoiding excessively large RPCs for small reads.
+
 Severity   : major
 Frequency  : rare
 Bugzilla   : 7407
index 10ebaf1..d7b3d08 100644 (file)
@@ -785,6 +785,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
         struct lov_stripe_md *lsm = lli->lli_smd;
         struct ll_lock_tree tree;
         struct ll_lock_tree_node *node;
+        struct ll_ra_read bead;
         int rc;
         ssize_t retval;
         __u64 kms;
@@ -856,7 +857,11 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
 #else
         file->f_ra.ra_pages = 0;
 #endif
+        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
+        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+        ll_ra_read_in(file, &bead);
         retval = generic_file_read(file, buf, count, ppos);
+        ll_ra_read_ex(file, &bead);
 
  out:
         ll_tree_unlock(&tree);
@@ -887,7 +892,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
 
         /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
          * called on the file, don't fail the below assertion (bug 2388). */
-        if (file->f_flags & O_LOV_DELAY_CREATE && 
+        if (file->f_flags & O_LOV_DELAY_CREATE &&
             ll_i2info(inode)->lli_smd == NULL)
                 RETURN(-EBADF);
 
@@ -896,7 +901,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
         if (file->f_flags & O_APPEND)
                 node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW);
         else
-                node = ll_node_from_inode(inode, *ppos, *ppos  + count - 1, 
+                node = ll_node_from_inode(inode, *ppos, *ppos  + count - 1,
                                           LCK_PW);
 
         if (IS_ERR(node))
index 47e33f3..6ade334 100644 (file)
@@ -152,6 +152,13 @@ struct ll_sb_info {
         struct file_operations   *ll_fop;
 };
 
+struct ll_ra_read {
+        pgoff_t             lrr_start;
+        pgoff_t             lrr_count;
+        struct task_struct *lrr_reader;
+        struct list_head    lrr_linkage;
+};
+
 /*
  * per file-descriptor read-ahead data.
  */
@@ -192,7 +199,12 @@ struct ll_readahead_state {
          * not covered by DLM lock.
          */
         unsigned long   ras_next_readahead;
-
+        /*
+         * list of struct ll_ra_read's, one per read(2) call currently in
+         * progress against this file descriptor. Used by read-ahead code,
+         * protected by ->ras_lock.
+         */
+        struct list_head ras_read_beads;
 };
 
 extern kmem_cache_t *ll_file_data_slab;
@@ -269,6 +281,10 @@ extern char *llap_origins[];
 #define ll_unregister_cache(cache) do {} while (0)
 #endif
 
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
 /* llite/lproc_llite.c */
 #ifdef LPROCFS
 int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
index b8f2b2c..cc34c88 100644 (file)
@@ -949,6 +949,61 @@ static int index_in_window(unsigned long index, unsigned long point,
         return start <= index && index <= end;
 }
 
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+        struct ll_file_data       *fd;
+
+        fd = LUSTRE_FPRIVATE(f);
+        return &fd->fd_ras;
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+        struct ll_readahead_state *ras;
+
+        ras = ll_ras_get(f);
+        rar->lrr_reader = current;
+
+        spin_lock(&ras->ras_lock);
+        list_add(&rar->lrr_linkage, &ras->ras_read_beads);
+        spin_unlock(&ras->ras_lock);
+}
+
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+        struct ll_readahead_state *ras;
+
+        ras = ll_ras_get(f);
+
+        spin_lock(&ras->ras_lock);
+        list_del_init(&rar->lrr_linkage);
+        spin_unlock(&ras->ras_lock);
+}
+
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+        struct ll_ra_read *scan;
+
+        list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
+                if (scan->lrr_reader == current)
+                        return scan;
+        }
+        return NULL;
+}
+
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+        struct ll_readahead_state *ras;
+        struct ll_ra_read         *bead;
+
+        ras = ll_ras_get(f);
+
+        spin_lock(&ras->ras_lock);
+        bead = ll_ra_read_get_locked(ras);
+        spin_unlock(&ras->ras_lock);
+        return bead;
+}
+
 static int ll_readahead(struct ll_readahead_state *ras,
                          struct obd_export *exp, struct address_space *mapping,
                          struct obd_io_group *oig, int flags)
@@ -959,6 +1014,7 @@ static int ll_readahead(struct ll_readahead_state *ras,
         int rc, ret = 0, match_failed = 0;
         __u64 kms;
         unsigned int gfp_mask;
+        struct ll_ra_read *bead;
         ENTRY;
 
         kms = lov_merge_size(ll_i2info(mapping->host)->lli_smd, 1);
@@ -968,13 +1024,37 @@ static int ll_readahead(struct ll_readahead_state *ras,
         }
 
         spin_lock(&ras->ras_lock);
+        bead = ll_ra_read_get_locked(ras);
         /* reserve a part of the read-ahead window that we'll be issuing */
         if (ras->ras_window_len) {
                 start = ras->ras_next_readahead;
                 end = ras->ras_window_start + ras->ras_window_len - 1;
+        }
+        if (bead != NULL) {
+                pgoff_t read_end;
+
+                start = max(start, bead->lrr_start);
+                read_end = bead->lrr_start + bead->lrr_count - 1;
+                if (ras->ras_consecutive > start - bead->lrr_start + 1)
+                        /*
+                         * if current read(2) is a part of larger sequential
+                         * read, make sure read-ahead is at least to the end
+                         * of the read region.
+                         *
+                         * XXX nikita: This doesn't work when some pages in
+                         * [lrr_start, start] were cached (and, as a result,
+                         * weren't counted in ->ras_consecutive).
+                         */
+                        end = max(end, read_end);
+                else
+                        /*
+                         * otherwise, clip read-ahead at the read boundary.
+                         */
+                        end = read_end;
+        }
+        if (end != 0) {
                 end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
                 ras->ras_next_readahead = max(end, end + 1);
-
                 RAS_CDEBUG(ras);
         }
         spin_unlock(&ras->ras_lock);
@@ -1084,6 +1164,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
         spin_lock_init(&ras->ras_lock);
         ras_reset(ras, 0);
+        INIT_LIST_HEAD(&ras->ras_read_beads);
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras,