+
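+ /* walk the pages in [start, end], issuing reads until the reserved
+ * quota is used up or a page is no longer covered by our locks */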
+ for (i = start; reserved > 0 && !match_failed && i <= end; i++) {
+ /* skip locked pages from previous readpage calls */
+ page = grab_cache_page_nowait_gfp(mapping, i, gfp_mask);
+ if (page == NULL) {
+ ll_ra_stats_inc(mapping, RA_STAT_FAILED_GRAB_PAGE);
+ CDEBUG(D_READA, "g_c_p_n failed\n");
+ continue;
+ }
+
+ /* Check if page was truncated or reclaimed */
+ if (page->mapping != mapping) {
+ ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
+ CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
+ goto next_page;
+ }
+
+ /* we do this first so that we can see the page in the /proc
+ * accounting */
+ llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD);
+ if (IS_ERR(llap) || llap->llap_defer_uptodate)
+ goto next_page;
+
+ /* skip completed pages */
+ if (Page_Uptodate(page))
+ goto next_page;
+
+ /* bail out once we run past the end of the covering lock extent */
+ if ((rc = ll_page_matches(page, flags|LL_FILE_READAHEAD)) <= 0) {
+ LL_CDEBUG_PAGE(D_READA | D_PAGE, page,
+ "lock match failed: rc %d\n", rc);
+ ll_ra_stats_inc(mapping, RA_STAT_FAILED_MATCH);
+ match_failed = 1;
+ goto next_page;
+ }
+
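+ /* the page is locked, lock-covered, and not yet up to date:
+ * start the actual read */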
+ rc = ll_issue_page_read(exp, llap, oig, 1);
+ if (rc == 0) {
+ reserved--;
+ ret++;
+ LL_CDEBUG_PAGE(D_READA | D_PAGE, page,
+ "started read-ahead\n");
+ } else {
+ next_page:
+ LL_CDEBUG_PAGE(D_READA | D_PAGE, page,
+ "skipping read-ahead\n");
+
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ }
+
+ LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
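+ /* give back any read-ahead page quota we reserved but did not use */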
+ if (reserved != 0)
+ ll_ra_count_put(ll_i2sbi(inode), reserved);
+ if (i == end + 1 && end == (kms >> CFS_PAGE_SHIFT))
+ ll_ra_stats_inc(mapping, RA_STAT_EOF);
+
+ /* if we didn't get to the end of the region we reserved from
+ * the ras we need to go back and update the ras so that the
+ * next read-ahead tries from where we left off. we only do so
+ * if the region we failed to issue read-ahead on is still ahead
+ * of the app and behind the next index to start read-ahead from */
+ if (i != end + 1) {
+ spin_lock(&ras->ras_lock);
+ if (i < ras->ras_next_readahead &&
+ index_in_window(i, ras->ras_window_start, 0,
+ ras->ras_window_len)) {
+ ras->ras_next_readahead = i;
+ RAS_CDEBUG(ras);
+ }
+ spin_unlock(&ras->ras_lock);
+ }
+
+ RETURN(ret);
+}
+
+static void ras_set_start(struct ll_readahead_state *ras, unsigned long index)
+{
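+ /* align the window start down to a 1 MiB boundary; e.g. with 4 KiB
+ * pages (CFS_PAGE_SHIFT == 12) that is a 256-page boundary */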
+ ras->ras_window_start = index & (~((1024 * 1024 >> CFS_PAGE_SHIFT) - 1));
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_reset(struct ll_readahead_state *ras, unsigned long index)
+{
+ ras->ras_last_readpage = index;
+ ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_pages = 0;
+ ras->ras_window_len = 0;
+ ras_set_start(ras, index);
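+ /* ras_window_start was just aligned down from index, so the
+ * max() below resolves to index */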
+ ras->ras_next_readahead = max(ras->ras_window_start, index);
+
+ RAS_CDEBUG(ras);
+}
+
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+ spin_lock_init(&ras->ras_lock);
+ ras_reset(ras, 0);
+ ras->ras_requests = 0;
+ INIT_LIST_HEAD(&ras->ras_read_beads);
+}
+
+static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+ struct ll_readahead_state *ras, unsigned long index,
+ unsigned hit)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ int zero = 0;
+ ENTRY;
+
+ spin_lock(&sbi->ll_lock);
+ spin_lock(&ras->ras_lock);
+
+ ll_ra_stats_inc_unlocked(ra, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+ /* reset the read-ahead window in two cases. First, when the app seeks
+ * or reads to some other part of the file. Second, when we get a
+ * read-ahead miss on a page we think we've previously issued. This can
+ * be a symptom of there being so many read-ahead pages that the VM is
+ * reclaiming them before we get to them. */
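+ /* index_in_window(idx, point, before, after) tests whether idx lies
+ * in [point - before, point + after]; here, within 8 pages either
+ * side of the last read page */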
+ if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
+ zero = 1;
+ ll_ra_stats_inc_unlocked(ra, RA_STAT_DISTANT_READPAGE);
+ } else if (!hit && ras->ras_window_len &&
+ index < ras->ras_next_readahead &&
+ index_in_window(index, ras->ras_window_start, 0,
+ ras->ras_window_len)) {
+ zero = 1;
+ ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
+ }
+
+ /* On the second access to a file smaller than the tunable
+ * ra_max_read_ahead_whole_pages, trigger read-ahead on all pages in
+ * the file, up to ra_max_pages. This is simply a best effort and
+ * only happens once per open file. Normal read-ahead behavior resumes
+ * for subsequent IO. The mmap case does not increment ras_requests
+ * and thus can never trigger this behavior. */
+ if (ras->ras_requests == 2 && !ras->ras_request_index) {
+ __u64 kms_pages;
+
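+ /* file size rounded up to a whole number of pages */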
+ kms_pages = (i_size_read(inode) + CFS_PAGE_SIZE - 1) >>
+ CFS_PAGE_SHIFT;
+
+ CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
+ ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+
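+ /* the file is small enough: set the window to cover all of it */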
+ if (kms_pages &&
+ kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+ ras->ras_window_start = 0;
+ ras->ras_last_readpage = 0;
+ ras->ras_next_readahead = 0;
+ ras->ras_window_len = min(ra->ra_max_pages,
+ ra->ra_max_read_ahead_whole_pages);
+ GOTO(out_unlock, 0);
+ }
+ }
+
+ if (zero) {
+ ras_reset(ras, index);
+ GOTO(out_unlock, 0);
+ }
+
+ ras->ras_last_readpage = index;
+ ras->ras_consecutive_pages++;
+ ras_set_start(ras, index);
+ ras->ras_next_readahead = max(ras->ras_window_start,
+ ras->ras_next_readahead);
+
+ /* Trigger read-ahead in the mmap case, where ras_consecutive_requests
+ * is not incremented and thus can't be used to trigger the window. */
+ if (!ras->ras_window_len && ras->ras_consecutive_pages == 3) {
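+ /* start with a 1 MiB initial window, expressed in pages */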
+ ras->ras_window_len = 1024 * 1024 >> CFS_PAGE_SHIFT;
+ GOTO(out_unlock, 0);
+ }
+
+ /* The initial ras_window_len is set to the request size. To avoid
+ * uselessly reading and discarding pages for random IO, the window is
+ * only increased once per consecutive request received. */
+ if (ras->ras_consecutive_requests > 1 && !ras->ras_request_index) {
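+ /* grow the window by 1 MiB worth of pages, capped at the
+ * ra_max_pages tunable */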
+ ras->ras_window_len = min(ras->ras_window_len +
+ (1024 * 1024 >> CFS_PAGE_SHIFT),
+ ra->ra_max_pages);
+ }
+
+ EXIT;
+out_unlock:
+ RAS_CDEBUG(ras);
+ ras->ras_request_index++;
+ spin_unlock(&ras->ras_lock);
+ spin_unlock(&sbi->ll_lock);
+ return;
+}
+
+int ll_writepage(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_export *exp;
+ struct ll_async_page *llap;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(!PageDirty(page));
+ LASSERT(PageLocked(page));
+
+ exp = ll_i2dtexp(inode);
+ if (exp == NULL)
+ GOTO(out, rc = -EINVAL);
+
+ llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
+ if (IS_ERR(llap))
+ GOTO(out, rc = PTR_ERR(llap));
+
+ LASSERT(!PageWriteback(page));
+ set_page_writeback(page);
+
+ page_cache_get(page);
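+ /* if the page is already queued for async write, just escalate the
+ * existing request to urgent; otherwise queue it now (or write it
+ * synchronously) */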
+ if (llap->llap_write_queued) {
+ LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
+ rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
+ llap->llap_cookie,
+ ASYNC_READY | ASYNC_URGENT);
+ } else {
+ rc = queue_or_sync_write(exp, inode, llap, CFS_PAGE_SIZE,
+ ASYNC_READY | ASYNC_URGENT);
+ }
+ if (rc)
+ page_cache_release(page);
+out:
+ if (rc) {
+ if (!lli->lli_async_rc)
+ lli->lli_async_rc = rc;
+ /* re-dirty the page on error so the write is retried */
+ if (PageWriteback(page)) {
+ end_page_writeback(page);
+ }
+ /* only re-send the page if its IO was never started */
+ if (!PageError(page))
+ ll_redirty_page(page);
+ unlock_page(page);
+ }
+ RETURN(rc);
+}
+
+/*
+ * For now we do our readpage the same on both 2.4 and 2.6. The kernel's
+ * read-ahead assumes it is valid to issue readpage all the way up to
+ * i_size, but our dlm locks make that not the case. We disable the
+ * kernel's read-ahead and do our own by walking ahead in the page cache
+ * checking for dlm lock coverage. The main difference between 2.4 and
+ * 2.6 is how read-ahead gets batched and issued, but we're using our
+ * own, so they look the same.
+ */
+int ll_readpage(struct file *filp, struct page *page)
+{
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
+ struct inode *inode = page->mapping->host;
+ struct obd_export *exp;
+ struct ll_async_page *llap;
+ struct obd_io_group *oig = NULL;
+ int rc;
+ ENTRY;
+
+ LASSERT(PageLocked(page));
+ LASSERT(!PageUptodate(page));
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset=%Lu=%#Lx\n",
+ inode->i_ino, inode->i_generation, inode,
+ (((loff_t)page->index) << CFS_PAGE_SHIFT),
+ (((loff_t)page->index) << CFS_PAGE_SHIFT));
+ LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0);
+
+ if (!ll_i2info(inode)->lli_smd) {
+ /* File with no objects - one big hole */
+ /* We use ll_truncate_complete_page() here only because
+ * remove_from_page_cache() is not exported; below we bring the
+ * page back up to date ourselves. */
+ ll_truncate_complete_page(page);
+ clear_page(kmap(page));
+ kunmap(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ RETURN(0);
+ }
+
+ rc = oig_init(&oig);
+ if (rc < 0)
+ GOTO(out, rc);
+
+ exp = ll_i2dtexp(inode);
+ if (exp == NULL)
+ GOTO(out, rc = -EINVAL);
+
+ llap = llap_from_page(page, LLAP_ORIGIN_READPAGE);
+ if (IS_ERR(llap))
+ GOTO(out, rc = PTR_ERR(llap));
+
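+ /* a page that read-ahead already brought in (defer_uptodate) counts
+ * as a read-ahead hit */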
+ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+ ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
+ llap->llap_defer_uptodate);
+
+ if (llap->llap_defer_uptodate) {
+ llap->llap_ra_used = 1;
+ rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
+ fd->fd_flags);
+ if (rc > 0)
+ obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd,
+ NULL, oig);
+ LL_CDEBUG_PAGE(D_PAGE, page, "marking uptodate from defer\n");
+ SetPageUptodate(page);
+ unlock_page(page);
+ GOTO(out_oig, rc = 0);
+ }
+
+ if (likely((fd->fd_flags & LL_FILE_IGNORE_LOCK) == 0)) {
+ rc = ll_page_matches(page, fd->fd_flags);
+ if (rc < 0) {
+ LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc);
+ GOTO(out, rc);
+ }
+
+ if (rc == 0) {
+ CWARN("ino %lu page %lu (%llu) not covered by "
+ "a lock (mmap?). check debug logs.\n",
+ inode->i_ino, page->index,
+ (long long)page->index << CFS_PAGE_SHIFT);
+ }
+ }
+
+ rc = ll_issue_page_read(exp, llap, oig, 0);
+ if (rc)
+ GOTO(out, rc);
+
+ LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
+ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+ ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
+ fd->fd_flags);
+
+ rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
+
+out:
+ if (rc)
+ unlock_page(page);
+out_oig:
+ if (oig != NULL)
+ oig_release(oig);
+ RETURN(rc);
+}