X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=dc4208a72510e36788b08cc60a77934a959b6124;hp=f99f625ad9cf25d8298e17454025e95951d0ae4c;hb=0844727c55d52ec24f6cfb7fa043755a6635949c;hpb=2eaa49ef0f16798d564883b16cea9e96fad52495

diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index f99f625..dc4208a 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -27,7 +27,6 @@ */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lustre/osd/osd_io.c
  *
@@ -45,6 +44,7 @@
 /* prerequisite for linux/xattr.h */
 #include
 #include
+#include
 #include
 
 /*
@@ -57,6 +57,7 @@
 
 /* ext_depth() */
 #include
+#include
 
 static inline bool osd_use_page_cache(struct osd_device *d)
 {
@@ -149,9 +150,9 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
 		iobuf->dr_elapsed_valid = 0;
 		LASSERT(iobuf->dr_dev == d);
 		LASSERT(iobuf->dr_frags > 0);
-		lprocfs_oh_tally(&d->od_brw_stats.hist[BRW_R_DIO_FRAGS+rw],
+		lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw],
 				 iobuf->dr_frags);
-		lprocfs_oh_tally_log2(&d->od_brw_stats.hist[BRW_R_IO_TIME+rw],
+		lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw],
 				      ktime_to_ms(iobuf->dr_elapsed));
 	}
 }
@@ -172,7 +173,7 @@ static void dio_complete_routine(struct bio *bio, int error)
 	 */
 
 	if (unlikely(iobuf == NULL)) {
-		CERROR("***** bio->bi_private is NULL! This should never happen. Normally, I would crash here, but instead I will dump the bio contents to the console. Please report this to , along with any interesting messages leading up to this point (like SCSI errors, perhaps). Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n");
+		CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to , and probably have to reboot this node.\n");
 		CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf) ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
 		       bio->bi_next, (unsigned long)bio->bi_flags,
@@ -228,8 +229,8 @@ static void dio_complete_routine(struct bio *bio, int error)
 
 static void record_start_io(struct osd_iobuf *iobuf, int size)
 {
-	struct osd_device *osd = iobuf->dr_dev;
-	struct obd_histogram *h = osd->od_brw_stats.hist;
+	struct osd_device *osd = iobuf->dr_dev;
+	struct obd_histogram *h = osd->od_brw_stats.bs_hist;
 
 	iobuf->dr_frags++;
 	atomic_inc(&iobuf->dr_numreqs);
@@ -319,20 +320,27 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
 {
 	struct blk_integrity *bi = bdev_get_integrity(bdev);
 	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct niobuf_local *lnb;
+	struct niobuf_local *lnb = NULL;
 	unsigned short sector_size = blk_integrity_interval(bi);
 	void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
 		bip->bip_vec->bv_offset;
 	struct bio_vec *bv;
 	sector_t sector = bio_start_sector(bio);
-	unsigned int sectors, total;
+	unsigned int i, sectors, total;
 	DECLARE_BVEC_ITER_ALL(iter_all);
 	__u16 *expected_guard;
 	int rc;
 
 	total = 0;
 	bio_for_each_segment_all(bv, bio, iter_all) {
-		lnb = iobuf->dr_lnbs[index];
+		for (i = index; i < iobuf->dr_npages; i++) {
+			if (iobuf->dr_pages[i] == bv->bv_page) {
+				lnb = iobuf->dr_lnbs[i];
+				break;
+			}
+		}
+		if (!lnb)
+			continue;
 		expected_guard = lnb->lnb_guards;
 		sectors = bv->bv_len / sector_size;
 		if (lnb->lnb_guard_rpc) {
@@ -347,6 +355,7 @@ static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
 		total += sectors * bi->tuple_size;
 		LASSERT(total <= bip_size(bio->bi_integrity));
 		index++;
+		lnb = NULL;
 	}
 	return 0;
 }
@@ -907,6 +916,8 @@ bypass_checks:
 			GOTO(cleanup, rc = -ENOMEM);
 
 		lnb->lnb_locked = 1;
+		if (cache)
+			mark_page_accessed(lnb->lnb_page);
 	}
 
 #if 0
@@ -1210,12 +1221,12 @@ cont_map:
 			GOTO(cleanup, rc);
 		/*
 		 * decay extent blocks if we could allocate
-		 * good large(1M) extent.
+		 * good large extent.
 		 */
-		if (previous_total == 0 &&
-		    total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+		if (total - previous_total >=
+		    osd_extent_bytes(osd) >> inode->i_blkbits)
 			osd_decay_extent_bytes(osd,
-					       total << inode->i_blkbits);
+				(total - previous_total) << inode->i_blkbits);
 		/* look for next extent */
 		fp = NULL;
 		blocks += blocks_per_page * clen;
@@ -1300,6 +1311,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 struct osd_fextent {
 	sector_t	start;
 	sector_t	end;
+	__u32		flags;
 	unsigned int	mapped:1;
 };
 
@@ -1311,7 +1323,6 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 	sector_t start;
 	struct fiemap_extent_info fei = { 0 };
 	struct fiemap_extent fe = { 0 };
-	mm_segment_t saved_fs;
 	int rc;
 
 	if (block >= cached_extent->start && block < cached_extent->end)
@@ -1327,14 +1338,12 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 	fei.fi_extents_max = 1;
 	fei.fi_extents_start = &fe;
 
-	saved_fs = get_fs();
-	set_fs(KERNEL_DS);
 	rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
-	set_fs(saved_fs);
 	if (rc != 0)
 		return 0;
 
 	start = fe.fe_logical >> inode->i_blkbits;
+	cached_extent->flags = fe.fe_flags;
 	if (fei.fi_extents_mapped == 0) {
 		/* a special case - no extent found at this offset and forward.
 		 * we can consider this as a hole to EOF. it's safe to cache
@@ -1360,6 +1369,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 	return cached_extent->mapped;
 }
 
+#define MAX_EXTENTS_PER_WRITE 100
 static int osd_declare_write_commit(const struct lu_env *env,
 				    struct dt_object *dt,
 				    struct niobuf_local *lnb, int npages,
@@ -1368,10 +1378,10 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
 	struct inode *inode = osd_dt_obj(dt)->oo_inode;
 	struct osd_thandle *oh;
-	int extents = 0;
-	int depth;
+	int extents = 0, new_meta = 0;
+	int depth, new_blocks = 0;
 	int i;
-	int newblocks = 0;
+	int dirty_groups = 0;
 	int rc = 0;
 	int credits = 0;
 	long long quota_space = 0;
@@ -1398,19 +1408,20 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		/* ignore quota for the whole request if any page is from
 		 * client cache or written by root.
 		 *
-		 * XXX once we drop the 1.8 client support, the checking
-		 * for whether page is from cache can be simplified as:
-		 * !(lnb[i].flags & OBD_BRW_SYNC)
-		 *
 		 * XXX we could handle this on per-lnb basis as done by
 		 * grant.
 		 */
 		if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
-		    (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
-		    OBD_BRW_FROM_GRANT)
+		    (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+		    !(lnb[i].lnb_flags & OBD_BRW_SYNC))
 			declare_flags |= OSD_QID_FORCE;
 
-		if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+		/*
+		 * Converting an unwritten extent might need to split
+		 * extents, so we cannot skip it.
+		 */
+		if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+		    !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
 			lnb[i].lnb_flags |= OBD_BRW_MAPPED;
 			continue;
 		}
@@ -1421,11 +1432,11 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		}
 
 		/* count only unmapped changes */
-		newblocks++;
+		new_blocks++;
 		if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
 			if (extent.end != 0)
 				extents += (extent.end - extent.start +
-					    extent_bytes - 1) / extent_bytes;
+					extent_bytes - 1) / extent_bytes;
 			extent.start = lnb[i].lnb_file_offset;
 			extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
 		} else {
@@ -1440,35 +1451,59 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	 * overwrite case, no need to modify tree and
 	 * allocate blocks.
 	 */
-	if (!newblocks)
+	if (!extent.end)
 		goto out_declare;
 
 	extents += (extent.end - extent.start +
 		    extent_bytes - 1) / extent_bytes;
-	/*
-	 * each extent can go into new leaf causing a split
-	 * 5 is max tree depth: inode + 4 index blocks
-	 * with blockmaps, depth is 3 at most
+	/**
+	 * As system space usage grows, mballoc will no longer try its
+	 * best to scan block groups for a well-aligned free extent, so
+	 * the extent bytes per extent could decay to a very small value
+	 * and make us reserve far too many credits. We can be more
+	 * optimistic in the credit reservation: even when the filesystem
+	 * is nearly full, it is extremely unlikely that this worst case
+	 * would ever be hit.
+	 */
+	if (extents > MAX_EXTENTS_PER_WRITE)
+		extents = MAX_EXTENTS_PER_WRITE;
+
+	/**
+	 * If we add a single extent, then in the worst case each tree
+	 * level index/leaf needs to be changed if the tree splits.
+	 * If more extents are inserted, they could cause the whole tree
+	 * to split more than once, but this is really rare.
 	 */
 	if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
 		/*
 		 * many concurrent threads may grow tree by the time
-		 * our transaction starts. so, consider 2 is a min depth
+		 * our transaction starts. so, consider 2 is a min depth.
 		 */
 		depth = ext_depth(inode);
-		depth = max(depth, 1) + 1;
-		newblocks += depth;
-		credits += depth * 2 * extents;
+		depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
+		if (extents <= 1) {
+			credits += depth * 2 * extents;
+			new_meta = depth;
+		} else {
+			credits += depth * 3 * extents;
+			new_meta = depth * 2 * extents;
+		}
 	} else {
-		depth = 3;
-		newblocks += depth;
-		credits += depth * extents;
+		/*
+		 * With N contiguous data blocks, we need at most
+		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+		 * 2 dindirect blocks, and 1 tindirect block
+		 */
+		new_meta = DIV_ROUND_UP(new_blocks,
				LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+		credits += new_meta;
 	}
+	dirty_groups += (extents + new_meta);
 
 	oh->oh_declared_ext = extents;
 
 	/* quota space for metadata blocks */
-	quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+	quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
 
 	/* quota space should be reported in 1K blocks */
 	quota_space = toqb(quota_space);
@@ -1476,16 +1511,21 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	/* each new block can go in different group (bitmap + gd) */
 
 	/* we can't dirty more bitmap blocks than exist */
-	if (extents > LDISKFS_SB(osd_sb(osd))->s_groups_count)
+	if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
 		credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
 	else
-		credits += extents;
+		credits += dirty_groups;
 
 	/* we can't dirty more gd blocks than exist */
-	if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
+	if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
 		credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
 	else
-		credits += extents;
+		credits += dirty_groups;
+
+	CDEBUG(D_INODE,
+	       "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+	       osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+	       credits);
 
 out_declare:
 	osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
@@ -1916,7 +1956,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
 	 * level.
 	 */
 	depth = inode != NULL ? ext_depth(inode) : 0;
-	depth = max(depth, 1) + 1;
+	depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH);
 	credits = depth;
 	/* if not append, then split may need to modify
 	 * existing blocks moving entries into the new ones
@@ -2174,10 +2214,10 @@ static int osd_declare_fallocate(const struct lu_env *env,
 	ENTRY;
 
 	/*
-	 * Only mode == 0 (which is standard prealloc) is supported now.
+	 * mode == 0 (which is standard prealloc) and PUNCH is supported
 	 * Rest of mode options is not supported yet.
 	 */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		RETURN(-EOPNOTSUPP);
 
 	/* disable fallocate completely */
@@ -2187,6 +2227,16 @@ static int osd_declare_fallocate(const struct lu_env *env,
 	LASSERT(th);
 	LASSERT(inode);
 
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		rc = osd_declare_inode_qid(env, i_uid_read(inode),
+					   i_gid_read(inode),
+					   i_projid_read(inode), 0, oh,
+					   osd_dt_obj(dt), NULL, OSD_QID_BLK);
+		if (rc == 0)
+			rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+		RETURN(rc);
+	}
+
 	/* quota space for metadata blocks
 	 * approximate metadata estimate should be good enough.
 	 */
@@ -2207,8 +2257,10 @@
 	RETURN(rc);
 }
 
-static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
-			 __u64 start, __u64 end, int mode, struct thandle *th)
+static int osd_fallocate_preallocate(const struct lu_env *env,
+				     struct dt_object *dt,
+				     __u64 start, __u64 end, int mode,
+				     struct thandle *th)
 {
 	struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
 	handle_t *handle = ldiskfs_journal_current_handle();
@@ -2241,20 +2293,16 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 	blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
 
 	/* Create and mark new extents as either zero or unwritten */
-	flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+	flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
+		 !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
 		LDISKFS_GET_BLOCKS_CREATE_ZERO :
 		LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
-
+#endif
 	inode_lock(inode);
 
-	/*
-	 * We only support preallocation for extent-based file only.
-	 */
-	if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)))
-		GOTO(out, rc = -EOPNOTSUPP);
-
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 	    (end > i_size_read(inode) || end > LDISKFS_I(inode)->i_disksize)) {
 		new_size = end;
@@ -2314,10 +2362,12 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 			epos = end;
 		if (ldiskfs_update_inode_size(inode, epos) & 0x1)
 			inode->i_mtime = inode->i_ctime;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
 	} else {
 		if (epos > inode->i_size)
 			ldiskfs_set_inode_flag(inode,
 					       LDISKFS_INODE_EOFBLOCKS);
+#endif
 	}
 
 	ldiskfs_mark_inode_dirty(handle, inode);
@@ -2333,6 +2383,61 @@ out:
 	RETURN(rc);
 }
 
+static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
+			       __u64 start, __u64 end, int mode,
+			       struct thandle *th)
+{
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode *inode = obj->oo_inode;
+	struct osd_access_lock *al;
+	struct osd_thandle *oh;
+	int rc = 0, found = 0;
+
+	ENTRY;
+
+	LASSERT(dt_object_exists(dt));
+	LASSERT(osd_invariant(obj));
+	LASSERT(inode != NULL);
+
+	dquot_initialize(inode);
+
+	LASSERT(th);
+	oh = container_of(th, struct osd_thandle, ot_super);
+	LASSERT(oh->ot_handle->h_transaction != NULL);
+
+	list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+		if (obj != al->tl_obj)
+			continue;
+		LASSERT(al->tl_shared == 0);
+		found = 1;
+		/* do actual punch in osd_trans_stop() */
+		al->tl_start = start;
+		al->tl_end = end;
+		al->tl_mode = mode;
+		al->tl_punch = true;
+		break;
+	}
+
+	RETURN(rc);
+}
+
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+			 __u64 start, __u64 end, int mode, struct thandle *th)
+{
+	int rc;
+
+	ENTRY;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		/* punch */
+		rc = osd_fallocate_punch(env, dt, start, end, mode, th);
+	} else {
+		/* standard preallocate */
+		rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
+	}
+	RETURN(rc);
+}
+
 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
 			     __u64 start, __u64 end, struct thandle *th)
 {
@@ -2485,7 +2590,6 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
 	struct inode *inode = osd_dt_obj(dt)->oo_inode;
 	u64 len;
 	int rc;
-	mm_segment_t cur_fs;
 
 	LASSERT(inode);
 	if (inode->i_op->fiemap == NULL)
@@ -2505,18 +2609,10 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
 	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
 		filemap_write_and_wait(inode->i_mapping);
 
-	/* Save previous value address limit */
-	cur_fs = get_fs();
-	/* Set the address limit of the kernel */
-	set_fs(KERNEL_DS);
-
 	rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
 	fm->fm_flags = fieinfo.fi_flags;
 	fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
 
-	/* Restore the previous address limt */
-	set_fs(cur_fs);
-
 	return rc;
 }
@@ -2664,6 +2760,43 @@ void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
 	}
 }
 
+/* For a partial-page punch, flush punch range to disk immediately */
+static void osd_partial_page_flush_punch(struct osd_device *d,
+					 struct inode *inode, loff_t start,
+					 loff_t end)
+{
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, start, end);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, start,
+					     end);
+		invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
+					 end >> PAGE_SHIFT);
+	}
+}
+
+/*
+ * For a partial-page truncate, flush the page to disk immediately to
+ * avoid data corruption during direct disk write. b=17397
+ */
+static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
+				   loff_t offset)
+{
+	if (!(offset & ~PAGE_MASK))
+		return;
+
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, offset,
+					     offset + 1);
+		invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
+					 offset >> PAGE_SHIFT);
+	}
+}
+
 void osd_execute_truncate(struct osd_object *obj)
 {
 	struct osd_device *d = osd_obj2dev(obj);
@@ -2699,24 +2832,21 @@ void osd_execute_truncate(struct osd_object *obj)
 		spin_unlock(&inode->i_lock);
 		osd_dirty_inode(inode, I_DIRTY_DATASYNC);
 	}
+	osd_partial_page_flush(d, inode, size);
+}
 
-	/*
-	 * For a partial-page truncate, flush the page to disk immediately to
-	 * avoid data corruption during direct disk write. b=17397
-	 */
-	if ((size & ~PAGE_MASK) == 0)
-		return;
-	if (osd_use_page_cache(d)) {
-		filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
-	} else {
-		/* Notice we use "wait" version to ensure I/O is complete */
-		filemap_write_and_wait_range(inode->i_mapping, size, size + 1);
-		invalidate_mapping_pages(inode->i_mapping, size >> PAGE_SHIFT,
-					 size >> PAGE_SHIFT);
-	}
+void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
+		       loff_t start, loff_t end, int mode)
+{
+	struct osd_device *d = osd_obj2dev(obj);
+	struct inode *inode = obj->oo_inode;
+	struct file *file = osd_quasi_file(env, inode);
+
+	file->f_op->fallocate(file, mode, start, end - start);
+	osd_partial_page_flush_punch(d, inode, start, end - 1);
 }
 
-void osd_process_truncates(struct list_head *list)
+void osd_process_truncates(const struct lu_env *env, struct list_head *list)
 {
 	struct osd_access_lock *al;
@@ -2725,8 +2855,10 @@ void osd_process_truncates(struct list_head *list)
 	list_for_each_entry(al, list, tl_list) {
 		if (al->tl_shared)
 			continue;
-		if (!al->tl_truncate)
-			continue;
-		osd_execute_truncate(al->tl_obj);
+		if (al->tl_truncate)
+			osd_execute_truncate(al->tl_obj);
+		else if (al->tl_punch)
+			osd_execute_punch(env, al->tl_obj, al->tl_start,
+					  al->tl_end, al->tl_mode);
 	}
 }
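
Illustration (not part of the patch): the new osd_fallocate() only records a FALLOC_FL_PUNCH_HOLE request on the truncate lock in osd_fallocate_punch(); the hole is actually punched later, when osd_process_truncates() calls osd_execute_punch(), which forwards the range to the backing filesystem via file->f_op->fallocate(). The minimal user-space sketch below exercises the same two fallocate(2) modes the OSD now distinguishes - plain preallocation (mode 0) and hole punching (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE). The file name and offsets are arbitrary example values.

/* sketch only: preallocate a range, then punch a hole in it */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/punch-demo", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* allocate 1MiB of blocks (standard preallocation, mode 0) */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate(prealloc)");
	/* deallocate 256KiB at offset 128KiB, keeping i_size unchanged */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 << 10, 256 << 10) < 0)
		perror("fallocate(PUNCH_HOLE)");
	close(fd);
	return 0;
}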
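A second sketch (also not part of the patch) of the reworked credit estimate in osd_declare_write_commit() for block-mapped inodes: new_meta = DIV_ROUND_UP(new_blocks, LDISKFS_ADDR_PER_BLOCK(sb)) + 4, where the +4 covers the one extra indirect, two double-indirect and one triple-indirect block mentioned in the added comment. A 4KiB block size (1024 four-byte block addresses per indirect block) is assumed purely for the example.

/* worked example of the block-mapped metadata estimate from the patch */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int block_size = 4096;			/* assumed */
	unsigned int addrs_per_block = block_size / 4;	/* 1024 */
	unsigned int new_blocks[] = { 1, 256, 1024, 4096 };
	unsigned int i;

	for (i = 0; i < 4; i++) {
		unsigned int new_meta =
			DIV_ROUND_UP(new_blocks[i], addrs_per_block) + 4;

		/* e.g. new_blocks=4096 -> 4 indirect blocks + 4 = 8 */
		printf("new_blocks=%4u -> new_meta=%u metadata blocks\n",
		       new_blocks[i], new_meta);
	}
	return 0;
}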