*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lustre/osd/osd_io.c
*
/* prerequisite for linux/xattr.h */
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
/*
/* ext_depth() */
#include <ldiskfs/ldiskfs_extents.h>
+#include <ldiskfs/ldiskfs.h>
static inline bool osd_use_page_cache(struct osd_device *d)
{
GOTO(cleanup, rc = -ENOMEM);
lnb->lnb_locked = 1;
+ if (cache)
+ mark_page_accessed(lnb->lnb_page);
}
#if 0
GOTO(cleanup, rc);
/*
* decay extent blocks if we could allocate
- * good large(1M) extent.
+ * good large extent.
*/
- if (previous_total == 0 &&
- total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+ if (total - previous_total >=
+ osd_extent_bytes(osd) >> inode->i_blkbits)
osd_decay_extent_bytes(osd,
- total << inode->i_blkbits);
+ (total - previous_total) << inode->i_blkbits);
/* look for next extent */
fp = NULL;
blocks += blocks_per_page * clen;
struct osd_fextent {
sector_t start;
sector_t end;
+ __u32 flags;
unsigned int mapped:1;
};
return 0;
start = fe.fe_logical >> inode->i_blkbits;
+ cached_extent->flags = fe.fe_flags;
if (fei.fi_extents_mapped == 0) {
/* a special case - no extent found at this offset and forward.
* we can consider this as a hole to EOF. it's safe to cache
return cached_extent->mapped;
}
+#define MAX_EXTENTS_PER_WRITE 100
static int osd_declare_write_commit(const struct lu_env *env,
struct dt_object *dt,
struct niobuf_local *lnb, int npages,
const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
struct inode *inode = osd_dt_obj(dt)->oo_inode;
struct osd_thandle *oh;
- int extents = 0;
- int depth;
+ int extents = 0, new_meta = 0;
+ int depth, new_blocks = 0;
int i;
- int newblocks = 0;
+ int dirty_groups = 0;
int rc = 0;
int credits = 0;
long long quota_space = 0;
OBD_BRW_FROM_GRANT)
declare_flags |= OSD_QID_FORCE;
- if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+ /*
+ * Converting an unwritten extent might require splitting
+ * extents, so we cannot skip it.
+ */
+ if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+ !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
lnb[i].lnb_flags |= OBD_BRW_MAPPED;
continue;
}
}
/* count only unmapped changes */
- newblocks++;
+ new_blocks++;
if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
if (extent.end != 0)
extents += (extent.end - extent.start +
- extent_bytes - 1) / extent_bytes;
+ extent_bytes - 1) / extent_bytes;
extent.start = lnb[i].lnb_file_offset;
extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
} else {
* overwrite case, no need to modify tree and
* allocate blocks.
*/
- if (!newblocks)
+ if (!extent.end)
goto out_declare;
extents += (extent.end - extent.start +
extent_bytes - 1) / extent_bytes;
- /*
- * each extent can go into new leaf causing a split
- * 5 is max tree depth: inode + 4 index blocks
- * with blockmaps, depth is 3 at most
+ /**
+ * As system space usage grows, the mballoc code won't try its
+ * best to scan block groups to find the best-aligned free
+ * extent. So the bytes per extent could decay to a very small
+ * value, which could make us reserve too many credits. We can
+ * be more optimistic in the credit reservations: even when the
+ * filesystem is nearly full, it is extremely unlikely that the
+ * worst case would ever be hit.
+ */
+ if (extents > MAX_EXTENTS_PER_WRITE)
+ extents = MAX_EXTENTS_PER_WRITE;
+
+ /**
+ * If we add a single extent, then in the worst case each tree
+ * level's index/leaf may need to be changed due to a tree split.
+ * If more extents are inserted, they could cause the whole tree
+ * to split more than once, but this is really rare.
+ */
if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
- /*
- * many concurrent threads may grow tree by the time
- * our transaction starts. so, consider 2 is a min depth
- */
depth = ext_depth(inode);
- depth = max(depth, 1) + 1;
- newblocks += depth;
- credits += depth * 2 * extents;
+ if (extents <= 1) {
+ credits += depth * 2 * extents;
+ new_meta = depth;
+ } else {
+ credits += depth * 3 * extents;
+ new_meta = depth * 2 * extents;
+ }
} else {
+ /*
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
+ */
+ new_meta = DIV_ROUND_UP(new_blocks,
+ LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+ credits += new_meta;
depth = 3;
- newblocks += depth;
- credits += depth * extents;
}
+ dirty_groups += (extents + new_meta);
oh->oh_declared_ext = extents;
/* quota space for metadata blocks */
- quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+ quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
/* quota space should be reported in 1K blocks */
quota_space = toqb(quota_space);
/* each new block can go in different group (bitmap + gd) */
/* we can't dirty more bitmap blocks than exist */
- if (extents > LDISKFS_SB(osd_sb(osd))->s_groups_count)
+ if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
else
- credits += extents;
+ credits += dirty_groups;
/* we can't dirty more gd blocks than exist */
if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
else
- credits += extents;
+ credits += dirty_groups;
+
+ CDEBUG(D_INODE,
+ "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+ osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+ credits);
out_declare:
osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
LDISKFS_GET_BLOCKS_CREATE_ZERO :
LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
-
+#endif
inode_lock(inode);
/*
epos = end;
if (ldiskfs_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
} else {
if (epos > inode->i_size)
ldiskfs_set_inode_flag(inode,
LDISKFS_INODE_EOFBLOCKS);
+#endif
}
ldiskfs_mark_inode_dirty(handle, inode);