X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=77843ff611d2bb77a1fc0e421a95997aa4ae3c7d;hp=dac36d241d4ca660fb201e7ac42377d61c8f25ed;hb=72617588ac8cb2e3e5a7b8e5ebc201cab524d938;hpb=8f793f14bf9928352623e61122f005252605b136

diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index dac36d2..77843ff 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -159,7 +159,7 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
 #ifdef HAVE_BIO_ENDIO_USES_ONE_ARG
 static void dio_complete_routine(struct bio *bio)
 {
-	int error = bio->bi_status;
+	int error = blk_status_to_errno(bio->bi_status);
 #else
 static void dio_complete_routine(struct bio *bio, int error)
 {
@@ -440,6 +440,29 @@ static int osd_bio_init(struct bio *bio, struct osd_iobuf *iobuf,
 	RETURN(0);
 }
 
+/* set OBD_BRW_DONE on each page lying wholly inside the block range */
+static void osd_mark_page_io_done(struct osd_iobuf *iobuf,
+				  struct inode *inode,
+				  sector_t start_blocks,
+				  sector_t count)
+{
+	int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+	pgoff_t first, last, idx;
+
+	/* a range shorter than one page cannot hold a whole page */
+	if (count < blocks_per_page)
+		return;
+
+	/* first page that begins at or after start_blocks */
+	first = start_blocks / blocks_per_page;
+	if (start_blocks % blocks_per_page != 0)
+		first++;
+	/* last page that ends at or before start_blocks + count */
+	last = (start_blocks + count - blocks_per_page) / blocks_per_page;
+	for (idx = first; idx <= last; idx++)
+		iobuf->dr_lnbs[idx]->lnb_flags |= OBD_BRW_DONE;
+}
+
 static int osd_do_bio(struct osd_device *osd, struct inode *inode,
 		      struct osd_iobuf *iobuf, sector_t start_blocks,
 		      sector_t count)
@@ -612,6 +635,11 @@ out:
 			OBD_FREE_PTR(bio_private);
 	}
 
+	/* only writes are marked as done for now */
+	if (rc == 0 && iobuf->dr_rw)
+		osd_mark_page_io_done(iobuf, inode,
+				      start_blocks, count);
+
 	RETURN(rc);
 }
 
@@ -931,25 +959,36 @@ static int osd_chunk_trans_blocks(struct inode *inode, int nrblocks)
 	return ret;
 }
 
-static int osd_extend_trans(handle_t *handle, int needed)
+#ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
+static int osd_extend_restart_trans(handle_t *handle, int needed,
+				    struct inode *inode)
 {
-	if (ldiskfs_handle_has_enough_credits(handle, needed))
-		return 0;
+	int rc;
 
-	return ldiskfs_journal_extend(handle,
-				      needed - handle->h_buffer_credits);
-}
+	rc = ldiskfs_journal_ensure_credits(handle, needed,
+		ldiskfs_trans_default_revoke_credits(inode->i_sb));
+	/* a positive return means the handle had to be restarted */
+	if (rc > 0)
+		rc = 0;
 
-static int osd_extend_restart_trans(handle_t *handle, int needed)
+	return rc;
+}
+#else
+static int osd_extend_restart_trans(handle_t *handle, int needed,
+				    struct inode *inode)
 {
+	int rc;
 
-	int rc = osd_extend_trans(handle, needed);
-
+	if (ldiskfs_handle_has_enough_credits(handle, needed))
+		return 0;
+	rc = ldiskfs_journal_extend(handle,
+				needed - handle->h_buffer_credits);
 	if (rc <= 0)
 		return rc;
 
 	return ldiskfs_journal_restart(handle, needed);
 }
+#endif /* HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */
 
 static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf,
 				 struct osd_device *osd, sector_t start_blocks,
@@ -977,12 +1016,45 @@ static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf,
 	return osd_do_bio(osd, inode, iobuf, start_blocks, count);
 }
 
+static unsigned int osd_extent_bytes(const struct osd_device *o)
+{
+	/* this CPU's slot of the per-CPU extent size value */
+	unsigned int *slot = raw_cpu_ptr(o->od_extent_bytes_percpu);
+
+	if (unlikely(*slot == 0)) {
+		/*
+		 * first access or CPU hotplug: start from one block when
+		 * extents are unavailable, else OSD_DEFAULT_EXTENT_BYTES
+		 */
+		*slot = ldiskfs_has_feature_extents(osd_sb(o)) ?
+			OSD_DEFAULT_EXTENT_BYTES :
+			1 << osd_sb(o)->s_blocksize_bits;
+	}
+	return *slot;
+}
+
+#define EXTENT_BYTES_DECAY 64
+static void osd_decay_extent_bytes(struct osd_device *osd,
+				   unsigned int new_bytes)
+{
+	unsigned int sample, weighted;
+
+	/* nothing is tracked when the extents feature is off */
+	if (!ldiskfs_has_feature_extents(osd_sb(osd)))
+		return;
+	/* blend the capped sample in at weight 1/DECAY, rounding up */
+	sample = min(new_bytes, OSD_DEFAULT_EXTENT_BYTES);
+	weighted = osd_extent_bytes(osd) * (EXTENT_BYTES_DECAY - 1) + sample;
+	*raw_cpu_ptr(osd->od_extent_bytes_percpu) =
+		(weighted + EXTENT_BYTES_DECAY - 1) / EXTENT_BYTES_DECAY;
+}
 
 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
 				       struct osd_iobuf *iobuf,
 				       struct osd_device *osd,
 				       int create, __u64 user_size,
-				       int check_credits)
+				       int check_credits,
+				       struct thandle *thandle)
 {
 	int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
 	int rc = 0, i = 0, mapped_index = 0;
@@ -990,7 +1062,6 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode,
 	int clen = 0;
 	pgoff_t max_page_index;
 	handle_t *handle = NULL;
-	int credits;
 	sector_t start_blocks = 0, count = 0;
 	loff_t disk_size = 0;
 	struct page **page = iobuf->dr_pages;
@@ -1050,32 +1121,30 @@ cont_map:
 		 * transaction to make sure consistency.
 		 */
 		if (handle && check_credits) {
-			/*
-			 * credits to insert 1 extent into extent tree.
-			 */
-			credits = osd_chunk_trans_blocks(inode, blen);
-			rc = osd_extend_trans(handle, credits);
-			if (rc < 0)
-				GOTO(cleanup, rc);
+			struct osd_thandle *oh;
+
+			LASSERT(thandle != NULL);
+			oh = container_of(thandle, struct osd_thandle,
+					  ot_super);
 			/*
 			 * only issue IO if restart transaction needed,
 			 * as update disk size need hold inode lock, we
 			 * want to avoid that as much as possible.
 			 */
-			if (rc > 0) {
-				WARN_ON_ONCE(start_blocks == 0);
+			if (oh->oh_declared_ext <= 0) {
 				rc = osd_ldiskfs_map_write(inode,
 					iobuf, osd, start_blocks,
 					count, &disk_size, user_size);
 				if (rc)
 					GOTO(cleanup, rc);
-				rc = ldiskfs_journal_restart(handle, credits);
-				if (rc)
-					GOTO(cleanup, rc);
-				start_blocks += count;
-				/* reset IO block count */
-				count = 0;
+				thandle->th_restart_tran = 1;
+				GOTO(cleanup, rc = -EAGAIN);
 			}
+
+			if (OBD_FAIL_CHECK(OBD_FAIL_OST_RESTART_IO))
+				oh->oh_declared_ext = 0;
+			else
+				oh->oh_declared_ext--;
 		}
 		rc = ldiskfs_map_blocks(handle, inode, &map, create);
 		if (rc >= 0) {
@@ -1118,6 +1187,12 @@ cont_map:
 		}
 
 		if (rc == 0 && total < blen) {
+			/*
+			 * decay the extent size estimate when the range
+			 * could not be allocated as a single extent.
+			 */
+			osd_decay_extent_bytes(osd,
+				(total - previous_total) << inode->i_blkbits);
 			map.m_lblk = fp->index * blocks_per_page + total;
 			map.m_len = blen - total;
 			previous_total = total;
@@ -1125,7 +1200,14 @@ cont_map:
 		}
 		if (rc != 0)
 			GOTO(cleanup, rc);
-
+		/*
+		 * update the extent size estimate when a large
+		 * (1M or more) extent was allocated in one go.
+		 */
+		if (previous_total == 0 &&
+		    total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+			osd_decay_extent_bytes(osd,
+					       total << inode->i_blkbits);
 		/* look for next extent */
 		fp = NULL;
 		blocks += blocks_per_page * clen;
@@ -1197,7 +1279,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 
 	if (iobuf->dr_npages) {
 		rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
-						 0, 0);
+						 0, 0, NULL);
 		if (likely(rc == 0)) {
 			rc = osd_do_bio(osd, inode, iobuf, 0, 0);
 			/* do IO stats for preparation reads */
@@ -1278,12 +1360,21 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	struct osd_fextent	mapped = { 0 }, extent = { 0 };
 	enum osd_quota_local_flags local_flags = 0;
 	enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
+	unsigned int		extent_bytes;
 	ENTRY;
 
 	LASSERT(handle != NULL);
 	oh = container_of(handle, struct osd_thandle, ot_super);
 	LASSERT(oh->ot_handle == NULL);
 
+	/*
+	 * A per-CPU decaying average of the extent size in bytes: about
+	 * 1M most of the time, shrinking towards a single block as the
+	 * filesystem gets fragmented. s_blocksize is already in bytes.
+	 */
+	extent_bytes = osd_extent_bytes(osd);
+	LASSERT(extent_bytes >= osd_sb(osd)->s_blocksize);
+
 	/* calculate number of extents (probably better to pass nb) */
 	for (i = 0; i < npages; i++) {
 		/* ignore quota for the whole request if any page is from
@@ -1306,10 +1397,18 @@ static int osd_declare_write_commit(const struct lu_env *env,
 			continue;
 		}
 
+		if (lnb[i].lnb_flags & OBD_BRW_DONE) {
+			lnb[i].lnb_flags |= OBD_BRW_MAPPED;
+			continue;
+		}
+
 		/* count only unmapped changes */
 		newblocks++;
 		if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
-			extents++;
+			if (extent.end != 0)
+				extents += (extent.end - extent.start +
+					extent_bytes - 1) / extent_bytes;
+			extent.start = lnb[i].lnb_file_offset;
 			extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
 		} else {
 			extent.end += lnb[i].lnb_len;
@@ -1325,6 +1424,9 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	 */
 	if (!newblocks)
 		goto out_declare;
+
+	extents += (extent.end - extent.start +
+		    extent_bytes - 1) / extent_bytes;
 	/*
 	 * each extent can go into new leaf causing a split
 	 * 5 is max tree depth: inode + 4 index blocks
@@ -1345,12 +1447,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		credits += depth * extents;
 	}
 
-	/*
-	 * try a bit more extents to avoid restart
-	 * as much as possible in normal case.
-	 */
-	if (npages > 1 && extents)
-		extents <<= 1;
+	oh->oh_declared_ext = extents;
 
 	/* quota space for metadata blocks */
 	quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
@@ -1409,9 +1506,6 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
 	struct inode *inode = osd_dt_obj(dt)->oo_inode;
 	struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
 	int rc = 0, i, check_credits = 0;
-	struct osd_thandle *oh = container_of(thandle,
-					      struct osd_thandle, ot_super);
-	unsigned int save_credits = oh->ot_credits;
 
 	LASSERT(inode);
 
@@ -1439,6 +1533,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
 			continue;
 		}
 
+		if (lnb[i].lnb_flags & OBD_BRW_DONE)
+			continue;
+
 		if (!(lnb[i].lnb_flags & OBD_BRW_MAPPED))
 			check_credits = 1;
 
@@ -1464,28 +1561,19 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
 	} else if (iobuf->dr_npages > 0) {
 		rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd,
 						 1, user_size,
-						 check_credits);
-		/*
-		 * Write might restart transaction, extend credits
-		 * if needed for operations such as attribute set.
-		 */
-		if (rc == 0) {
-			handle_t *handle = ldiskfs_journal_current_handle();
-
-			LASSERT(handle != NULL);
-			rc = osd_extend_restart_trans(handle, save_credits);
-		}
+						 check_credits,
+						 thandle);
 	} else {
 		/* no pages to write, no transno is needed */
 		thandle->th_local = 1;
 	}
 
-	if (rc != 0)
+	if (rc != 0 && !thandle->th_restart_tran)
 		osd_fini_iobuf(osd, iobuf);
 
 	osd_trans_exec_check(env, thandle, OSD_OT_WRITE);
 
-	if (unlikely(rc != 0)) {
+	if (unlikely(rc != 0 && !thandle->th_restart_tran)) {
 		/* if write fails, we should drop pages from the cache */
 		for (i = 0; i < npages; i++) {
 			if (lnb[i].lnb_page == NULL)
@@ -1570,7 +1658,7 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
 
 	if (iobuf->dr_npages) {
 		rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
-						 0, 0);
+						 0, 0, NULL);
 		if (!rc)
 			rc = osd_do_bio(osd, inode, iobuf, 0, 0);
 
@@ -1890,7 +1978,8 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
 		++bufsize;
 	}
 
-	dirty_inode = test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA,
+	/* only the first flag-set matters */
+	dirty_inode = !test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA,
 				       &ei->i_flags);
 
 	/* sparse checking is racy, but sparse is very rare case, leave as is */
@@ -2129,7 +2218,8 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 	boff = start >> inode->i_blkbits;
 	blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
 
-	flags = LDISKFS_GET_BLOCKS_CREATE;
+	/* Create and Write zeros to new extents */
+	flags = LDISKFS_GET_BLOCKS_CREATE_ZERO;
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
 
@@ -2178,7 +2268,7 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 		}
 
 		/* TODO: quota check */
-		rc = osd_extend_restart_trans(handle, credits);
+		rc = osd_extend_restart_trans(handle, credits, inode);
 		if (rc)
 			break;
 
@@ -2210,11 +2300,11 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 	}
 
 out:
-	inode_unlock(inode);
-
 	/* extand credits if needed for operations such as attribute set */
 	if (rc >= 0)
-		rc = osd_extend_restart_trans(handle, save_credits);
+		rc = osd_extend_restart_trans(handle, save_credits, inode);
+
+	inode_unlock(inode);
 
 	RETURN(rc);
 }