X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;ds=sidebyside;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=a41799bfe3cdfa9b7719e5bea21c05e08d93ef62;hb=ec03ee091f931125f3bbeb3628b0c5aaa4709930;hp=33a452626158b14c5598d5efe3803c4d5b8d1977;hpb=791f656a031f710ce21674b508ce8b331783a5b7;p=fs%2Flustre-release.git

diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index 33a4526..a41799b 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -150,9 +150,9 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
 		iobuf->dr_elapsed_valid = 0;
 		LASSERT(iobuf->dr_dev == d);
 		LASSERT(iobuf->dr_frags > 0);
-		lprocfs_oh_tally(&d->od_brw_stats.hist[BRW_R_DIO_FRAGS+rw],
+		lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw],
 				 iobuf->dr_frags);
-		lprocfs_oh_tally_log2(&d->od_brw_stats.hist[BRW_R_IO_TIME+rw],
+		lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw],
 				      ktime_to_ms(iobuf->dr_elapsed));
 	}
 }
@@ -173,7 +173,7 @@ static void dio_complete_routine(struct bio *bio, int error)
 	 */
 
 	if (unlikely(iobuf == NULL)) {
-		CERROR("***** bio->bi_private is NULL!  This should never happen.  Normally, I would crash here, but instead I will dump the bio contents to the console.  Please report this to <https://jira.whamcloud.com/> , along with any interesting messages leading up to this point (like SCSI errors, perhaps).  Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n");
+		CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/>, and probably have to reboot this node.\n");
 		CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf)
 		       ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
 		       bio->bi_next, (unsigned long)bio->bi_flags,
@@ -229,8 +229,8 @@ static void dio_complete_routine(struct bio *bio, int error)
 
 static void record_start_io(struct osd_iobuf *iobuf, int size)
 {
-	struct osd_device    *osd = iobuf->dr_dev;
-	struct obd_histogram *h = osd->od_brw_stats.hist;
+	struct osd_device *osd = iobuf->dr_dev;
+	struct obd_histogram *h = osd->od_brw_stats.bs_hist;
 
 	iobuf->dr_frags++;
 	atomic_inc(&iobuf->dr_numreqs);
@@ -320,20 +320,27 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
 {
 	struct blk_integrity *bi = bdev_get_integrity(bdev);
 	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct niobuf_local *lnb;
+	struct niobuf_local *lnb = NULL;
 	unsigned short sector_size = blk_integrity_interval(bi);
 	void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
 		bip->bip_vec->bv_offset;
 	struct bio_vec *bv;
 	sector_t sector = bio_start_sector(bio);
-	unsigned int sectors, total;
+	unsigned int i, sectors, total;
 	DECLARE_BVEC_ITER_ALL(iter_all);
 	__u16 *expected_guard;
 	int rc;
 
 	total = 0;
 	bio_for_each_segment_all(bv, bio, iter_all) {
-		lnb = iobuf->dr_lnbs[index];
+		for (i = index; i < iobuf->dr_npages; i++) {
+			if (iobuf->dr_pages[i] == bv->bv_page) {
+				lnb = iobuf->dr_lnbs[i];
+				break;
+			}
+		}
+		if (!lnb)
+			continue;
 		expected_guard = lnb->lnb_guards;
 		sectors = bv->bv_len / sector_size;
 		if (lnb->lnb_guard_rpc) {
@@ -348,6 +355,7 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
 		total += sectors * bi->tuple_size;
 		LASSERT(total <= bip_size(bio->bi_integrity));
 		index++;
+		lnb = NULL;
 	}
 	return 0;
 }
@@ -512,12 +520,24 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode,
 	for (page_idx = page_idx_start, block_idx = start_blocks;
 	     block_idx < block_idx_end; page_idx++,
 	     block_idx += blocks_left_page) {
+		/* For cases where the filesystem's blocksize is not the
+		 * same as PAGE_SIZE (e.g. ARM with PAGE_SIZE=64KB and
+		 * blocksize=4KB), there will be multiple blocks to
+		 * read/write per page. Also, the start and end block may
+		 * not be aligned to the start and end of the page, so the
+		 * first page may skip some blocks at the start ("i != 0",
+		 * "blocks_left_page" is reduced), and the last page may
+		 * skip some blocks at the end (limited by "count").
+		 */
 		page = pages[page_idx];
 		LASSERT(page_idx < iobuf->dr_npages);
 
 		i = block_idx % blocks_per_page;
 		blocks_left_page = blocks_per_page - i;
-		for (page_offset = i * blocksize; i < blocks_left_page;
+		if (block_idx + blocks_left_page > block_idx_end)
+			blocks_left_page = block_idx_end - block_idx;
+		page_offset = i * blocksize;
+		for (i = 0; i < blocks_left_page;
 		     i += nblocks, page_offset += blocksize * nblocks) {
 			nblocks = 1;
 
@@ -1400,16 +1420,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		/* ignore quota for the whole request if any page is from
 		 * client cache or written by root.
 		 *
-		 * XXX once we drop the 1.8 client support, the checking
-		 * for whether page is from cache can be simplified as:
-		 * !(lnb[i].flags & OBD_BRW_SYNC)
-		 *
 		 * XXX we could handle this on per-lnb basis as done by
 		 * grant.
 		 */
 		if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
-		    (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
-		    OBD_BRW_FROM_GRANT)
+		    (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+		    !(lnb[i].lnb_flags & OBD_BRW_SYNC))
 			declare_flags |= OSD_QID_FORCE;
 
 		/*
@@ -1471,7 +1487,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	 * split more than once, but this is really rare.
 	 */
 	if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
+		/*
+		 * Many concurrent threads may grow the tree by the time
+		 * our transaction starts, so treat 2 as the minimum depth.
+		 */
 		depth = ext_depth(inode);
+		depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
 		if (extents <= 1) {
 			credits += depth * 2 * extents;
 			new_meta = depth;
@@ -1488,7 +1509,6 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		new_meta = DIV_ROUND_UP(new_blocks,
 				LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 		credits += new_meta;
-		depth = 3;
 	}
 	dirty_groups += (extents + new_meta);
 
@@ -1509,7 +1529,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		credits += dirty_groups;
 
 	/* we can't dirty more gd blocks than exist */
-	if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
+	if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
 		credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
 	else
 		credits += dirty_groups;
@@ -1948,7 +1968,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
 		 * level.
 		 */
 		depth = inode != NULL ? ext_depth(inode) : 0;
-		depth = max(depth, 1) + 1;
+		depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH);
 		credits = depth;
 		/* if not append, then split may need to modify
 		 * existing blocks moving entries into the new ones
@@ -2285,7 +2305,8 @@ static int osd_fallocate_preallocate(const struct lu_env *env,
 	blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
 
 	/* Create and mark new extents as either zero or unwritten */
-	flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+	flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
+		 !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
 		LDISKFS_GET_BLOCKS_CREATE_ZERO :
 		LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
 #ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
@@ -2294,12 +2315,6 @@ static int osd_fallocate_preallocate(const struct lu_env *env,
 #endif
 	inode_lock(inode);
 
-	/*
-	 * We only support preallocation for extent-based file only.
-	 */
-	if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)))
-		GOTO(out, rc = -EOPNOTSUPP);
-
 	if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) ||
 	    end > LDISKFS_I(inode)->i_disksize)) {
 		new_size = end;
@@ -2757,6 +2772,22 @@ void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
 	}
 }
 
+/* For a partial-page punch, flush bytes [start, end] to disk immediately */
+static void osd_partial_page_flush_punch(struct osd_device *d,
+					 struct inode *inode, loff_t start,
+					 loff_t end)
+{
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, start, end);
+	} else {
+		/* Use "wait" version so I/O is complete before invalidating */
+		filemap_write_and_wait_range(inode->i_mapping, start,
+					     end);
+		invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
+					 end >> PAGE_SHIFT);
+	}
+}
+
 /*
  * For a partial-page truncate, flush the page to disk immediately to
  * avoid data corruption during direct disk write.  b=17397
@@ -2824,8 +2855,7 @@ void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
 	struct file *file = osd_quasi_file(env, inode);
 
 	file->f_op->fallocate(file, mode, start, end - start);
-	osd_partial_page_flush(d, inode, start);
-	osd_partial_page_flush(d, inode, end - 1);
+	osd_partial_page_flush_punch(d, inode, start, end - 1);
 }
 
 void osd_process_truncates(const struct lu_env *env, struct list_head *list)