*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lustre/osd/osd_io.c
*
/* prerequisite for linux/xattr.h */
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
/*
/* ext_depth() */
#include <ldiskfs/ldiskfs_extents.h>
+#include <ldiskfs/ldiskfs.h>
static inline bool osd_use_page_cache(struct osd_device *d)
{
*/
if (unlikely(iobuf == NULL)) {
- CERROR("***** bio->bi_private is NULL! This should never happen. Normally, I would crash here, but instead I will dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/> , along with any interesting messages leading up to this point (like SCSI errors, perhaps). Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n");
+ CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/>, and probably have to reboot this node.\n");
CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf)
": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
bio->bi_next, (unsigned long)bio->bi_flags,
{
struct blk_integrity *bi = bdev_get_integrity(bdev);
struct bio_integrity_payload *bip = bio->bi_integrity;
- struct niobuf_local *lnb;
+ struct niobuf_local *lnb = NULL;
unsigned short sector_size = blk_integrity_interval(bi);
void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset;
struct bio_vec *bv;
sector_t sector = bio_start_sector(bio);
- unsigned int sectors, total;
+ unsigned int i, sectors, total;
DECLARE_BVEC_ITER_ALL(iter_all);
__u16 *expected_guard;
int rc;
total = 0;
bio_for_each_segment_all(bv, bio, iter_all) {
- lnb = iobuf->dr_lnbs[index];
+ for (i = index; i < iobuf->dr_npages; i++) {
+ if (iobuf->dr_pages[i] == bv->bv_page) {
+ lnb = iobuf->dr_lnbs[i];
+ break;
+ }
+ }
+ if (!lnb)
+ continue;
expected_guard = lnb->lnb_guards;
sectors = bv->bv_len / sector_size;
if (lnb->lnb_guard_rpc) {
total += sectors * bi->tuple_size;
LASSERT(total <= bip_size(bio->bi_integrity));
index++;
+ lnb = NULL;
}
return 0;
}
GOTO(cleanup, rc = -ENOMEM);
lnb->lnb_locked = 1;
+ if (cache)
+ mark_page_accessed(lnb->lnb_page);
}
#if 0
GOTO(cleanup, rc);
/*
* decay extent blocks if we could allocate
- * good large(1M) extent.
+ * good large extent.
*/
- if (previous_total == 0 &&
- total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+ if (total - previous_total >=
+ osd_extent_bytes(osd) >> inode->i_blkbits)
osd_decay_extent_bytes(osd,
- total << inode->i_blkbits);
+ (total - previous_total) << inode->i_blkbits);
/* look for next extent */
fp = NULL;
blocks += blocks_per_page * clen;
struct osd_fextent {
	sector_t start;
	sector_t end;
+	/* FIEMAP extent flags (fe_flags) cached from the fiemap lookup,
+	 * e.g. FIEMAP_EXTENT_UNWRITTEN — checked by the write-commit
+	 * declaration to decide whether a mapped extent can be skipped */
+	__u32 flags;
	unsigned int mapped:1;
};
return 0;
start = fe.fe_logical >> inode->i_blkbits;
+ cached_extent->flags = fe.fe_flags;
if (fei.fi_extents_mapped == 0) {
/* a special case - no extent found at this offset and forward.
* we can consider this as a hole to EOF. it's safe to cache
return cached_extent->mapped;
}
+#define MAX_EXTENTS_PER_WRITE 100
static int osd_declare_write_commit(const struct lu_env *env,
struct dt_object *dt,
struct niobuf_local *lnb, int npages,
const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
struct inode *inode = osd_dt_obj(dt)->oo_inode;
struct osd_thandle *oh;
- int extents = 0;
- int depth;
+ int extents = 0, new_meta = 0;
+ int depth, new_blocks = 0;
int i;
- int newblocks = 0;
+ int dirty_groups = 0;
int rc = 0;
int credits = 0;
long long quota_space = 0;
OBD_BRW_FROM_GRANT)
declare_flags |= OSD_QID_FORCE;
- if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+	/*
+	 * Converting an unwritten extent may require splitting extents,
+	 * so such pages cannot be skipped.
+	 */
+ if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+ !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
lnb[i].lnb_flags |= OBD_BRW_MAPPED;
continue;
}
}
/* count only unmapped changes */
- newblocks++;
+ new_blocks++;
if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
if (extent.end != 0)
extents += (extent.end - extent.start +
- extent_bytes - 1) / extent_bytes;
+ extent_bytes - 1) / extent_bytes;
extent.start = lnb[i].lnb_file_offset;
extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
} else {
* overwrite case, no need to modify tree and
* allocate blocks.
*/
- if (!newblocks)
+ if (!extent.end)
goto out_declare;
extents += (extent.end - extent.start +
extent_bytes - 1) / extent_bytes;
- /*
- * each extent can go into new leaf causing a split
- * 5 is max tree depth: inode + 4 index blocks
- * with blockmaps, depth is 3 at most
+	/**
+	 * As filesystem space usage grows, mballoc no longer tries its
+	 * best to scan block groups for large, well-aligned free extents,
+	 * so the bytes-per-extent estimate can decay to a very small
+	 * value and make us reserve far too many credits. We can be
+	 * more optimistic in the credit reservation: even when the
+	 * filesystem is nearly full, it is extremely unlikely that the
+	 * worst case would ever be hit.
+	 */
+ if (extents > MAX_EXTENTS_PER_WRITE)
+ extents = MAX_EXTENTS_PER_WRITE;
+
+	/**
+	 * If we add a single extent, then in the worst case each tree
+	 * level index/leaf needs to be changed in case of a tree split.
+	 * If more extents are inserted, they could cause the whole tree
+	 * to split more than once, but this is really rare.
*/
if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
/*
* many concurrent threads may grow tree by the time
- * our transaction starts. so, consider 2 is a min depth
+ * our transaction starts. so, consider 2 is a min depth.
*/
depth = ext_depth(inode);
- depth = max(depth, 1) + 1;
- newblocks += depth;
- credits += depth * 2 * extents;
+ depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
+ if (extents <= 1) {
+ credits += depth * 2 * extents;
+ new_meta = depth;
+ } else {
+ credits += depth * 3 * extents;
+ new_meta = depth * 2 * extents;
+ }
} else {
- depth = 3;
- newblocks += depth;
- credits += depth * extents;
+ /*
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
+ */
+ new_meta = DIV_ROUND_UP(new_blocks,
+ LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+ credits += new_meta;
}
+ dirty_groups += (extents + new_meta);
oh->oh_declared_ext = extents;
/* quota space for metadata blocks */
- quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+ quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
/* quota space should be reported in 1K blocks */
quota_space = toqb(quota_space);
/* each new block can go in different group (bitmap + gd) */
/* we can't dirty more bitmap blocks than exist */
- if (extents > LDISKFS_SB(osd_sb(osd))->s_groups_count)
+ if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
else
- credits += extents;
+ credits += dirty_groups;
/* we can't dirty more gd blocks than exist */
- if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
+ if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
else
- credits += extents;
+ credits += dirty_groups;
+
+ CDEBUG(D_INODE,
+ "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+ osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+ credits);
out_declare:
osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
* level.
*/
depth = inode != NULL ? ext_depth(inode) : 0;
- depth = max(depth, 1) + 1;
+ depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
credits = depth;
/* if not append, then split may need to modify
* existing blocks moving entries into the new ones
blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
/* Create and mark new extents as either zero or unwritten */
- flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+ flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
+ !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
LDISKFS_GET_BLOCKS_CREATE_ZERO :
LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
-
+#endif
inode_lock(inode);
- /*
- * We only support preallocation for extent-based file only.
- */
- if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)))
- GOTO(out, rc = -EOPNOTSUPP);
-
if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) ||
end > LDISKFS_I(inode)->i_disksize)) {
new_size = end;
epos = end;
if (ldiskfs_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
} else {
if (epos > inode->i_size)
ldiskfs_set_inode_flag(inode,
LDISKFS_INODE_EOFBLOCKS);
+#endif
}
ldiskfs_mark_inode_dirty(handle, inode);
}
}
+/* For a partial-page punch, flush punch range to disk immediately */
+static void osd_partial_page_flush_punch(struct osd_device *d,
+					 struct inode *inode, loff_t start,
+					 loff_t end)
+{
+	if (osd_use_page_cache(d)) {
+		/* page cache stays valid: async writeback of the range
+		 * is sufficient here */
+		filemap_fdatawrite_range(inode->i_mapping, start, end);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, start,
+					     end);
+		/* drop the now-stale cached pages so subsequent reads
+		 * go back to disk */
+		invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
+					 end >> PAGE_SHIFT);
+	}
+}
+
/*
* For a partial-page truncate, flush the page to disk immediately to
* avoid data corruption during direct disk write. b=17397
struct file *file = osd_quasi_file(env, inode);
file->f_op->fallocate(file, mode, start, end - start);
- osd_partial_page_flush(d, inode, start);
- osd_partial_page_flush(d, inode, end - 1);
+ osd_partial_page_flush_punch(d, inode, start, end - 1);
}
void osd_process_truncates(const struct lu_env *env, struct list_head *list)