X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fllite%2Frw.c;h=3d69fecf9a7c57f255a0e26bf42c4e04fc61b320;hp=da9e04c8792f8953ec8ec5f6c6af95cdc61616b6;hb=1e4d10af3909452b0eee1f99010d80aeb01d42a7;hpb=7e8efb339b0958146eb294fc9d961688f5d16079

diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index da9e04c..3d69fec 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -27,7 +27,6 @@
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lustre/llite/rw.c
  *
@@ -85,17 +84,34 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
 				     unsigned long pages,
 				     unsigned long pages_min)
 {
-        struct ll_ra_info *ra = &sbi->ll_ra_info;
-        long ret;
+	struct ll_ra_info *ra = &sbi->ll_ra_info;
+	long ret;
+
         ENTRY;
 
-        /* If read-ahead pages left are less than 1M, do not do read-ahead,
-         * otherwise it will form small read RPC(< 1M), which hurt server
-         * performance a lot. */
+	WARN_ON_ONCE(pages_min > pages);
+	/**
+	 * Don't try readahead aggressively if we are limited by
+	 * LRU pages, otherwise it could cause deadlock.
+	 */
+	pages = min(sbi->ll_cache->ccc_lru_max >> 2, pages);
+	/**
+	 * If this happens, we reserve more pages than needed;
+	 * this will make us leak @ra_cur_pages, because
+	 * ll_ra_count_put() actually frees @pages.
+	 */
+	if (unlikely(pages_min > pages))
+		pages_min = pages;
+
+	/*
+	 * If read-ahead pages left are less than 1M, do not do read-ahead,
+	 * otherwise it will form small read RPC(< 1M), which hurt server
+	 * performance a lot.
+	 */
 	ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages),
 		  pages);
-        if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
-                GOTO(out, ret = 0);
+	if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+		GOTO(out, ret = 0);
 
 	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
 		atomic_sub(ret, &ra->ra_cur_pages);
@@ -227,9 +243,11 @@ static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
 	cl_page_assume(env, io, page);
 	vpg = cl2vvp_page(cl_object_page_slice(clob, page));
 	if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) {
-		vpg->vpg_defer_uptodate = 1;
-		vpg->vpg_ra_used = 0;
-		cl_page_list_add(queue, page);
+		if (hint == MAYNEED) {
+			vpg->vpg_defer_uptodate = 1;
+			vpg->vpg_ra_used = 0;
+		}
+		cl_page_list_add(queue, page, true);
 	} else {
 		/* skip completed pages */
 		cl_page_unassume(env, io, page);
@@ -348,7 +366,11 @@ static unsigned long ria_page_count(struct ra_io_arg *ria)
 
 static pgoff_t ras_align(struct ll_readahead_state *ras, pgoff_t index)
 {
-	return index - (index % ras->ras_rpc_pages);
+	unsigned opt_size = min(ras->ras_window_pages, ras->ras_rpc_pages);
+
+	if (opt_size == 0)
+		opt_size = 1;
+	return index - (index % opt_size);
 }
 
 /* Check whether the index is in the defined ra-window */
@@ -382,7 +404,7 @@ static bool ras_inside_ra_window(pgoff_t idx, struct ra_io_arg *ria)
 static unsigned long
 ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page_list *queue, struct ll_readahead_state *ras,
-		    struct ra_io_arg *ria, pgoff_t *ra_end)
+		    struct ra_io_arg *ria, pgoff_t *ra_end, pgoff_t skip_index)
 {
 	struct cl_read_ahead ra = { 0 };
 	/* busy page count is per stride */
@@ -395,6 +417,8 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 	for (page_idx = ria->ria_start_idx;
 	     page_idx <= ria->ria_end_idx && ria->ria_reserved > 0;
 	     page_idx++) {
+		if (skip_index && page_idx == skip_index)
+			continue;
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (ra.cra_end_idx == 0 || ra.cra_end_idx < page_idx) {
 				pgoff_t end_idx;
@@ -438,10 +462,12 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 				if (ras->ras_rpc_pages != ra.cra_rpc_pages &&
 				    ra.cra_rpc_pages > 0)
 					ras->ras_rpc_pages = ra.cra_rpc_pages;
-				/* trim it to align with optimal RPC size */
-				end_idx = ras_align(ras, ria->ria_end_idx + 1);
-				if (end_idx > 0 && !ria->ria_eof)
-					ria->ria_end_idx = end_idx - 1;
+				if (!skip_index) {
+					/* trim it to align with optimal RPC size */
+					end_idx = ras_align(ras, ria->ria_end_idx + 1);
+					if (end_idx > 0 && !ria->ria_eof)
+						ria->ria_end_idx = end_idx - 1;
+				}
 				if (ria->ria_end_idx < ria->ria_end_idx_min)
 					ria->ria_end_idx = ria->ria_end_idx_min;
 			}
@@ -617,7 +643,12 @@ static void ll_readahead_handle_work(struct work_struct *wq)
 	if (rc)
 		GOTO(out_put_env, rc);
 
-	vvp_env_io(env)->vui_io_subtype = IO_NORMAL;
+	/* overwrite jobid inited in vvp_io_init() */
+	if (strncmp(ll_i2info(inode)->lli_jobid, work->lrw_jobid,
+		    sizeof(work->lrw_jobid)))
+		memcpy(ll_i2info(inode)->lli_jobid, work->lrw_jobid,
+		       sizeof(work->lrw_jobid));
+
 	vvp_env_io(env)->vui_fd = fd;
 	io->ci_state = CIS_LOCKED;
 	io->ci_async_readahead = true;
@@ -629,7 +660,7 @@ static void ll_readahead_handle_work(struct work_struct *wq)
 	cl_2queue_init(queue);
 
 	rc = ll_read_ahead_pages(env, io, &queue->c2_qin, ras, ria,
-				 &ra_end_idx);
+				 &ra_end_idx, 0);
 	if (ria->ria_reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 	if (queue->c2_qin.pl_nr > 0) {
@@ -667,7 +698,7 @@ out_free_work:
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			struct cl_page_list *queue,
 			struct ll_readahead_state *ras, bool hit,
-			struct file *file)
+			struct file *file, pgoff_t skip_index)
 {
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
@@ -678,10 +709,26 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	struct cl_object *clob;
 	int ret = 0;
 	__u64 kms;
+	struct ll_sb_info *sbi;
+	struct ll_ra_info *ra;
+
 	ENTRY;
 
+        ENTRY;
+
 	clob = io->ci_obj;
 	inode = vvp_object_inode(clob);
+	sbi = ll_i2sbi(inode);
+	ra = &sbi->ll_ra_info;
+
+	/**
+	 * In case we have a limited max_cached_mb, readahead
+	 * should be stopped if it has run out of all LRU slots.
+	 */
+	if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+		RETURN(0);
+	}
 
 	memset(ria, 0, sizeof(*ria));
 	ret = ll_readahead_file_kms(env, io, &kms);
@@ -711,6 +758,9 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	if (ras->ras_window_pages > 0)
 		end_idx = ras->ras_window_start_idx + ras->ras_window_pages - 1;
 
+	if (skip_index)
+		end_idx = start_idx + ras->ras_window_pages - 1;
+
 	/* Enlarge the RA window to encompass the full read */
 	if (vio->vui_ra_valid &&
 	    end_idx < vio->vui_ra_start_idx + vio->vui_ra_pages - 1)
@@ -761,8 +811,21 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			vio->vui_ra_start_idx + vio->vui_ra_pages - 1;
 		pages_min = vio->vui_ra_start_idx + vio->vui_ra_pages -
 				ria->ria_start_idx;
+		 /**
+		  * For performance reasons, exceeding @ra_max_pages
+		  * is allowed, but this should be limited to the RPC
+		  * size in case a large block size read is issued. Trim
+		  * to RPC boundary.
+		  */
+		pages_min = min(pages_min, ras->ras_rpc_pages -
+				(ria->ria_start_idx % ras->ras_rpc_pages));
 	}
 
+	/* don't over-reserve for mmap range read */
+	if (skip_index)
+		pages_min = 0;
+	if (pages_min > pages)
+		pages = pages_min;
 	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, pages,
 					    pages_min);
 	if (ria->ria_reserved < pages)
@@ -773,8 +836,8 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx);
-
+	ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx,
+				  skip_index);
 	if (ria->ria_reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
@@ -870,6 +933,10 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 	ras_reset(ras, 0);
 	ras->ras_last_read_end_bytes = 0;
 	ras->ras_requests = 0;
+	ras->ras_range_min_start_idx = 0;
+	ras->ras_range_max_end_idx = 0;
+	ras->ras_range_requests = 0;
+	ras->ras_last_range_pages = 0;
 }
 
 /*
@@ -1018,6 +1085,73 @@ static inline bool is_loose_seq_read(struct ll_readahead_state *ras, loff_t pos)
 			     8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT);
 }
 
+static inline bool is_loose_mmap_read(struct ll_sb_info *sbi,
+				      struct ll_readahead_state *ras,
+				      unsigned long pos)
+{
+	unsigned long range_pages = sbi->ll_ra_info.ra_range_pages;
+
+	return pos_in_window(pos, ras->ras_last_read_end_bytes,
+			     range_pages << PAGE_SHIFT,
+			     range_pages << PAGE_SHIFT);
+}
+
+/**
+ * Track the page range covered by a cluster of nearby mmap reads,
+ * and remember the size of the previous range when a new one starts.
+ *
+ * We have observed slow mmap read performance for some
+ * applications. The problem is an access pattern that is neither
+ * sequential nor strided, but stays adjacent within a small
+ * range and then seeks to a random position, e.g.:
+ *
+ * [1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data]
+ *
+ * Every time an application reads mmap data, it may not only
+ * read a single 4KB page, but also a cluster of nearby pages in
+ * a range (e.g. 1MB) of the first page after a cache miss.
+ *
+ * The readahead engine is modified to track the range size of
+ * a cluster of mmap reads, so that after a seek and/or cache miss,
+ * the range size is used to efficiently prefetch multiple pages
+ * in a single RPC rather than many small RPCs.
+ */
+static void ras_detect_cluster_range(struct ll_readahead_state *ras,
+				     struct ll_sb_info *sbi,
+				     unsigned long pos, unsigned long count)
+{
+	pgoff_t last_pages, pages;
+	pgoff_t end_idx = (pos + count - 1) >> PAGE_SHIFT;
+
+	last_pages = ras->ras_range_max_end_idx -
+			ras->ras_range_min_start_idx + 1;
+	/* First time here (max end index is still 0): start a new range */
+	if (!ras->ras_range_max_end_idx)
+		goto out;
+
+	/* Outside the loose mmap window: random or stride read */
+	if (!is_loose_mmap_read(sbi, ras, pos))
+		goto out;
+
+	ras->ras_range_requests++;
+	if (ras->ras_range_max_end_idx < end_idx)
+		ras->ras_range_max_end_idx = end_idx;
+
+	if (ras->ras_range_min_start_idx > (pos >> PAGE_SHIFT))
+		ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+
+	/* Range grew past ra_range_pages: treat as random or stride */
+	pages = ras->ras_range_max_end_idx -
+			ras->ras_range_min_start_idx + 1;
+	if (pages <= sbi->ll_ra_info.ra_range_pages)
+		return;
+out:
+	ras->ras_last_range_pages = last_pages;
+	ras->ras_range_requests = 0;
+	ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+	ras->ras_range_max_end_idx = end_idx;
+}
+
 static void ras_detect_read_pattern(struct ll_readahead_state *ras,
 				    struct ll_sb_info *sbi,
 				    loff_t pos, size_t count, bool mmap)
@@ -1066,8 +1200,12 @@ static void ras_detect_read_pattern(struct ll_readahead_state *ras,
 	ras->ras_consecutive_bytes += count;
 	if (mmap) {
 		pgoff_t idx = ras->ras_consecutive_bytes >> PAGE_SHIFT;
+		unsigned long ra_range_pages =
+				max_t(unsigned long, RA_MIN_MMAP_RANGE_PAGES,
+				      sbi->ll_ra_info.ra_range_pages);
 
-		if ((idx >= 4 && (idx & 3UL) == 0) || stride_detect)
+		if ((idx >= ra_range_pages &&
+		     idx % ra_range_pages == 0) || stride_detect)
 			ras->ras_need_increase_window = true;
 	} else if ((ras->ras_consecutive_requests > 1 || stride_detect)) {
 		ras->ras_need_increase_window = true;
@@ -1154,7 +1292,7 @@ static bool index_in_stride_window(struct ll_readahead_state *ras,
  */
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		       struct ll_readahead_state *ras, pgoff_t index,
-		       enum ras_update_flags flags)
+		       enum ras_update_flags flags, struct cl_io *io)
 {
 	struct ll_ra_info *ra = &sbi->ll_ra_info;
 	bool hit = flags & LL_RAS_HIT;
@@ -1176,10 +1314,48 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 	if (ras->ras_no_miss_check)
 		GOTO(out_unlock, 0);
 
-	if (flags & LL_RAS_MMAP)
+	if (io && io->ci_rand_read)
+		GOTO(out_unlock, 0);
+
+	if (io && io->ci_seq_read) {
+		if (!hit) {
+			/* to avoid many small read RPCs here */
+			ras->ras_window_pages = sbi->ll_ra_info.ra_range_pages;
+			ll_ra_stats_inc_sbi(sbi, RA_STAT_MMAP_RANGE_READ);
+		}
+		goto skip;
+	}
+
+	if (flags & LL_RAS_MMAP) {
+		unsigned long ra_pages;
+
+		ras_detect_cluster_range(ras, sbi, index << PAGE_SHIFT,
+					 PAGE_SIZE);
 		ras_detect_read_pattern(ras, sbi, (loff_t)index << PAGE_SHIFT,
 					PAGE_SIZE, true);
 
+		/* we did not detect anything but we could prefetch */
+		if (!ras->ras_need_increase_window &&
+		    ras->ras_window_pages <= sbi->ll_ra_info.ra_range_pages &&
+		    ras->ras_range_requests >= 2) {
+			if (!hit) {
+				ra_pages = max_t(unsigned long,
+					RA_MIN_MMAP_RANGE_PAGES,
+					ras->ras_last_range_pages);
+				if (index < ra_pages / 2)
+					index = 0;
+				else
+					index -= ra_pages / 2;
+				ras->ras_window_pages = ra_pages;
+				ll_ra_stats_inc_sbi(sbi,
+					RA_STAT_MMAP_RANGE_READ);
+			} else {
+				ras->ras_window_pages = 0;
+			}
+			goto skip;
+		}
+	}
+
 	if (!hit && ras->ras_window_pages &&
 	    index < ras->ras_next_readahead_idx &&
 	    pos_in_window(index, ras->ras_window_start_idx, 0,
@@ -1218,6 +1394,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 			GOTO(out_unlock, 0);
 		}
 	}
+
+skip:
 	ras_set_start(ras, index);
 
 	if (stride_io_mode(ras)) {
@@ -1439,8 +1617,8 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 {
 	struct inode              *inode  = vvp_object_inode(page->cp_obj);
 	struct ll_sb_info         *sbi    = ll_i2sbi(inode);
-	struct ll_file_data       *fd     = file->private_data;
-	struct ll_readahead_state *ras    = &fd->fd_ras;
+	struct ll_file_data       *fd     = NULL;
+	struct ll_readahead_state *ras    = NULL;
 	struct cl_2queue          *queue  = &io->ci_queue;
 	struct cl_sync_io	  *anchor = NULL;
 	struct vvp_page           *vpg;
@@ -1450,10 +1628,15 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	pgoff_t io_end_index;
 	ENTRY;
 
+	if (file) {
+		fd = file->private_data;
+		ras = &fd->fd_ras;
+	}
+
 	vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
 	uptodate = vpg->vpg_defer_uptodate;
 
-	if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated) {
+	if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated && ras) {
 		struct vvp_io *vio = vvp_env_io(env);
 		enum ras_update_flags flags = 0;
 
@@ -1461,7 +1644,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 			flags |= LL_RAS_HIT;
 		if (!vio->vui_ra_valid)
 			flags |= LL_RAS_MMAP;
-		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+		ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
 	}
 
 	cl_2queue_init(queue);
@@ -1474,15 +1657,19 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		cl_sync_io_init(anchor, 1);
 		page->cp_sync_io = anchor;
 
-		cl_2queue_add(queue, page);
+		cl_2queue_add(queue, page, true);
 	}
 
 	io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos);
 	io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
 				io->u.ci_rw.crw_count - 1);
-	if (ll_readahead_enabled(sbi)) {
+	if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) {
+		pgoff_t skip_index = 0;
+
+		if (ras->ras_next_readahead_idx < vvp_index(vpg))
+			skip_index = vvp_index(vpg);
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   uptodate, file);
+				   uptodate, file, skip_index);
 		CDEBUG(D_READA, DFID " %d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	} else if (vvp_index(vpg) == io_start_index &&
@@ -1548,6 +1735,15 @@ static int kickoff_async_readahead(struct file *file, unsigned long pages)
 	pgoff_t start_idx = ras_align(ras, ras->ras_next_readahead_idx);
 	pgoff_t end_idx = start_idx + pages - 1;
 
+	/**
+	 * In case we have a limited max_cached_mb, readahead
+	 * should be stopped if it has run out of all LRU slots.
+	 */
+	if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+		return 0;
+	}
+
 	throttle = min(ra->ra_async_pages_per_file_threshold,
 		       ra->ra_max_pages_per_file);
 	/*
@@ -1577,6 +1773,8 @@ static int kickoff_async_readahead(struct file *file, unsigned long pages)
 		ras->ras_next_readahead_idx = end_idx + 1;
 		ras->ras_async_last_readpage_idx = start_idx;
 		spin_unlock(&ras->ras_lock);
+		memcpy(lrw->lrw_jobid, ll_i2info(inode)->lli_jobid,
+		       sizeof(lrw->lrw_jobid));
 		ll_readahead_work_add(inode, lrw);
 	} else {
 		return -ENOMEM;
@@ -1596,10 +1794,11 @@ static bool ll_use_fast_io(struct file *file,
 	unsigned long fast_read_pages =
 		max(RA_REMAIN_WINDOW_MIN, ras->ras_rpc_pages);
 	loff_t skip_pages;
+	loff_t stride_bytes = ras->ras_stride_bytes;
 
-	if (stride_io_mode(ras)) {
+	if (stride_io_mode(ras) && stride_bytes) {
 		skip_pages = (ras->ras_stride_length +
-			ras->ras_stride_bytes - 1) / ras->ras_stride_bytes;
+			ras->ras_stride_bytes - 1) / stride_bytes;
 		skip_pages *= fast_read_pages;
 	} else {
 		skip_pages = fast_read_pages;
@@ -1659,7 +1858,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
 			/* For fast read, it updates read ahead state only
 			 * if the page is hit in cache because non cache page
 			 * case will be handled by slow read later. */
-			ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+			ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
 			/* avoid duplicate ras_update() call */
 			vpg->vpg_ra_updated = 1;
 
@@ -1697,9 +1896,9 @@ int ll_readpage(struct file *file, struct page *vmpage)
 	 */
 	if (file->f_flags & O_DIRECT &&
 	    lcc && lcc->lcc_type == LCC_RW &&
-	    !io->ci_ignore_lockless) {
+	    !io->ci_dio_lock) {
 		unlock_page(vmpage);
-		io->ci_ignore_lockless = 1;
+		io->ci_dio_lock = 1;
 		io->ci_need_restart = 1;
 		RETURN(-ENOLCK);
 	}