*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lustre/osd/osd_io.c
*
/* prerequisite for linux/xattr.h */
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
/* ext_depth() */
#include <ldiskfs/ldiskfs_extents.h>
+#include <ldiskfs/ldiskfs.h>
static inline bool osd_use_page_cache(struct osd_device *d)
{
*/
if (unlikely(iobuf == NULL)) {
- CERROR("***** bio->bi_private is NULL! This should never happen. Normally, I would crash here, but instead I will dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/> , along with any interesting messages leading up to this point (like SCSI errors, perhaps). Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n");
+ CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/>, and probably have to reboot this node.\n");
CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf)
": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
bio->bi_next, (unsigned long)bio->bi_flags,
{
struct blk_integrity *bi = bdev_get_integrity(bdev);
struct bio_integrity_payload *bip = bio->bi_integrity;
- struct niobuf_local *lnb;
+ struct niobuf_local *lnb = NULL;
unsigned short sector_size = blk_integrity_interval(bi);
void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset;
struct bio_vec *bv;
sector_t sector = bio_start_sector(bio);
- unsigned int sectors, total;
+ unsigned int i, sectors, total;
DECLARE_BVEC_ITER_ALL(iter_all);
__u16 *expected_guard;
int rc;
total = 0;
bio_for_each_segment_all(bv, bio, iter_all) {
- lnb = iobuf->dr_lnbs[index];
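+ /*
+ * bio vectors may not map 1:1 onto dr_lnbs, so search forward
+ * from the last matched index for the lnb owning this page.
+ */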
+ for (i = index; i < iobuf->dr_npages; i++) {
+ if (iobuf->dr_pages[i] == bv->bv_page) {
+ lnb = iobuf->dr_lnbs[i];
+ break;
+ }
+ }
+ if (!lnb)
+ continue;
expected_guard = lnb->lnb_guards;
sectors = bv->bv_len / sector_size;
if (lnb->lnb_guard_rpc) {
total += sectors * bi->tuple_size;
LASSERT(total <= bip_size(bio->bi_integrity));
index++;
+ lnb = NULL;
}
return 0;
}
GOTO(cleanup, rc = -ENOMEM);
lnb->lnb_locked = 1;
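+ /* when using the page cache, mark the page accessed so it is
+ * kept on the active LRU longer */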
+ if (cache)
+ mark_page_accessed(lnb->lnb_page);
}
#if 0
GOTO(cleanup, rc);
/*
* decay extent blocks if we could allocate
- * good large(1M) extent.
+ * a good large extent.
*/
- if (previous_total == 0 &&
- total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+ if (total - previous_total >=
+ osd_extent_bytes(osd) >> inode->i_blkbits)
osd_decay_extent_bytes(osd,
- total << inode->i_blkbits);
+ (total - previous_total) << inode->i_blkbits);
/* look for next extent */
fp = NULL;
blocks += blocks_per_page * clen;
struct osd_fextent {
sector_t start;
sector_t end;
+ __u32 flags;
unsigned int mapped:1;
};
sector_t start;
struct fiemap_extent_info fei = { 0 };
struct fiemap_extent fe = { 0 };
- mm_segment_t saved_fs;
int rc;
if (block >= cached_extent->start && block < cached_extent->end)
fei.fi_extents_max = 1;
fei.fi_extents_start = &fe;
- saved_fs = get_fs();
- set_fs(KERNEL_DS);
rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET - offset);
- set_fs(saved_fs);
if (rc != 0)
return 0;
start = fe.fe_logical >> inode->i_blkbits;
+ cached_extent->flags = fe.fe_flags;
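+ /* cache the extent flags too, so callers can check for e.g.
+ * FIEMAP_EXTENT_UNWRITTEN without another fiemap call */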
+ if (fei.fi_extents_mapped == 0) {
+ /* a special case: no extent was found at this offset or beyond,
+ * so treat it as a hole up to EOF. it is safe to cache this as
+ * other threads cannot allocate/punch the blocks this thread is
+ * working on (guaranteed by LDLM). */
+ cached_extent->start = block;
+ cached_extent->end = i_size_read(inode) >> inode->i_blkbits;
+ cached_extent->mapped = 0;
+ return 0;
+ }
if (start > block) {
cached_extent->start = block;
return cached_extent->mapped;
}
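+/*
+ * Cap the number of extents declared per write so a decayed per-extent
+ * estimate cannot inflate the credit reservation; see the discussion in
+ * osd_declare_write_commit().
+ */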
+#define MAX_EXTENTS_PER_WRITE 100
static int osd_declare_write_commit(const struct lu_env *env,
struct dt_object *dt,
struct niobuf_local *lnb, int npages,
const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
struct inode *inode = osd_dt_obj(dt)->oo_inode;
struct osd_thandle *oh;
- int extents = 0;
- int depth;
+ int extents = 0, new_meta = 0;
+ int depth, new_blocks = 0;
int i;
- int newblocks = 0;
+ int dirty_groups = 0;
int rc = 0;
int credits = 0;
long long quota_space = 0;
OBD_BRW_FROM_GRANT)
declare_flags |= OSD_QID_FORCE;
- if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+ /*
+ * Converting an unwritten extent may require splitting
+ * extents, so it cannot be skipped.
+ */
+ if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+ !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
lnb[i].lnb_flags |= OBD_BRW_MAPPED;
continue;
}
}
/* count only unmapped changes */
- newblocks++;
+ new_blocks++;
if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
if (extent.end != 0)
extents += (extent.end - extent.start +
- extent_bytes - 1) / extent_bytes;
+ extent_bytes - 1) / extent_bytes;
extent.start = lnb[i].lnb_file_offset;
extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
} else {
* overwrite case, no need to modify tree and
* allocate blocks.
*/
- if (!newblocks)
+ if (!extent.end)
goto out_declare;
extents += (extent.end - extent.start +
extent_bytes - 1) / extent_bytes;
- /*
- * each extent can go into new leaf causing a split
- * 5 is max tree depth: inode + 4 index blocks
- * with blockmaps, depth is 3 at most
+ /**
+ * As filesystem usage grows, mballoc can no longer scan the
+ * block groups for well-aligned free extents, so the estimated
+ * bytes per extent may decay to a very small value and cause us
+ * to reserve far too many credits. Be more optimistic in the
+ * credit reservation: even when the filesystem is nearly full,
+ * it is extremely unlikely that this worst case would ever be hit.
+ */
+ if (extents > MAX_EXTENTS_PER_WRITE)
+ extents = MAX_EXTENTS_PER_WRITE;
+
+ /**
+ * If we add a single extent, then in the worst case each tree
+ * level index/leaf needs to be changed in case of a tree split.
+ * If more extents are inserted, they could cause the whole tree
+ * to split more than once, but this is really rare.
+ */
if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
/*
* many concurrent threads may grow tree by the time
- * our transaction starts. so, consider 2 is a min depth
+ * our transaction starts. so, consider 2 is a min depth.
*/
depth = ext_depth(inode);
- depth = max(depth, 1) + 1;
- newblocks += depth;
- credits += depth * 2 * extents;
+ depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
+ if (extents <= 1) {
+ credits += depth * 2 * extents;
+ new_meta = depth;
+ } else {
+ credits += depth * 3 * extents;
+ new_meta = depth * 2 * extents;
+ }
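+ /*
+ * e.g. depth = 3: a single extent reserves 6 credits and 3 new
+ * metadata blocks; 10 extents reserve 90 credits and 60 new
+ * metadata blocks.
+ */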
} else {
- depth = 3;
- newblocks += depth;
- credits += depth * extents;
+ /*
+ * With N contiguous data blocks, we need at most
+ * N/LDISKFS_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block.
+ */
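+ /* e.g. with 4KB blocks (1024 addresses per block), 256 new
+ * blocks need at most 1 + 4 = 5 new metadata blocks */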
+ new_meta = DIV_ROUND_UP(new_blocks,
+ LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+ credits += new_meta;
}
+ dirty_groups += (extents + new_meta);
oh->oh_declared_ext = extents;
/* quota space for metadata blocks */
- quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+ quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
/* quota space should be reported in 1K blocks */
quota_space = toqb(quota_space);
/* each new block can go in different group (bitmap + gd) */
/* we can't dirty more bitmap blocks than exist */
- if (extents > LDISKFS_SB(osd_sb(osd))->s_groups_count)
+ if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
else
- credits += extents;
+ credits += dirty_groups;
/* we can't dirty more gd blocks than exist */
- if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
+ if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
else
- credits += extents;
+ credits += dirty_groups;
+
+ CDEBUG(D_INODE,
+ "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+ osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+ credits);
out_declare:
osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
* level.
*/
depth = inode != NULL ? ext_depth(inode) : 0;
- depth = max(depth, 1) + 1;
+ depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
credits = depth;
/* if not append, then split may need to modify
* existing blocks moving entries into the new ones
ENTRY;
/*
- * Only mode == 0 (which is standard prealloc) is supported now.
+ * Only mode == 0 (standard prealloc) and FALLOC_FL_PUNCH_HOLE
+ * are supported now; the rest of the mode options are not yet.
*/
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
RETURN(-EOPNOTSUPP);
/* disable fallocate completely */
LASSERT(th);
LASSERT(inode);
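+ /*
+ * For punch, declare only the quota change and take the truncate
+ * lock here; the actual punch is executed in osd_trans_stop(),
+ * see osd_fallocate_punch().
+ */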
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ rc = osd_declare_inode_qid(env, i_uid_read(inode),
+ i_gid_read(inode),
+ i_projid_read(inode), 0, oh,
+ osd_dt_obj(dt), NULL, OSD_QID_BLK);
+ if (rc == 0)
+ rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+ RETURN(rc);
+ }
+
/* quota space for metadata blocks
* approximate metadata estimate should be good enough.
*/
RETURN(rc);
}
-static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
- __u64 start, __u64 end, int mode, struct thandle *th)
+static int osd_fallocate_preallocate(const struct lu_env *env,
+ struct dt_object *dt,
+ __u64 start, __u64 end, int mode,
+ struct thandle *th)
{
struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
handle_t *handle = ldiskfs_journal_current_handle();
blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
/* Create and mark new extents as either zero or unwritten */
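+ /* block-mapped (non-extent) files cannot carry unwritten extents,
+ * so they fall back to zeroing as well */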
- flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+ flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
+ !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
LDISKFS_GET_BLOCKS_CREATE_ZERO :
LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
-
+#endif
inode_lock(inode);
- /*
- * We only support preallocation for extent-based file only.
- */
- if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)))
- GOTO(out, rc = -EOPNOTSUPP);
-
if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) ||
end > LDISKFS_I(inode)->i_disksize)) {
new_size = end;
epos = end;
if (ldiskfs_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
} else {
if (epos > inode->i_size)
ldiskfs_set_inode_flag(inode,
LDISKFS_INODE_EOFBLOCKS);
+#endif
}
ldiskfs_mark_inode_dirty(handle, inode);
RETURN(rc);
}
+static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end, int mode,
+ struct thandle *th)
+{
+ struct osd_object *obj = osd_dt_obj(dt);
+ struct inode *inode = obj->oo_inode;
+ struct osd_access_lock *al;
+ struct osd_thandle *oh;
+ int rc = 0, found = 0;
+
+ ENTRY;
+
+ LASSERT(dt_object_exists(dt));
+ LASSERT(osd_invariant(obj));
+ LASSERT(inode != NULL);
+
+ dquot_initialize(inode);
+
+ LASSERT(th);
+ oh = container_of(th, struct osd_thandle, ot_super);
+ LASSERT(oh->ot_handle->h_transaction != NULL);
+
+ list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+ if (obj != al->tl_obj)
+ continue;
+ LASSERT(al->tl_shared == 0);
+ found = 1;
+ /* do actual punch in osd_trans_stop() */
+ al->tl_start = start;
+ al->tl_end = end;
+ al->tl_mode = mode;
+ al->tl_punch = true;
+ break;
+ }
+
+ RETURN(rc);
+}
+
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end, int mode, struct thandle *th)
+{
+ int rc;
+
+ ENTRY;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ /* punch */
+ rc = osd_fallocate_punch(env, dt, start, end, mode, th);
+ } else {
+ /* standard preallocate */
+ rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
+ }
+ RETURN(rc);
+}
+
static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
__u64 start, __u64 end, struct thandle *th)
{
struct inode *inode = osd_dt_obj(dt)->oo_inode;
u64 len;
int rc;
- mm_segment_t cur_fs;
LASSERT(inode);
if (inode->i_op->fiemap == NULL)
if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
filemap_write_and_wait(inode->i_mapping);
- /* Save previous value address limit */
- cur_fs = get_fs();
- /* Set the address limit of the kernel */
- set_fs(KERNEL_DS);
-
rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
fm->fm_flags = fieinfo.fi_flags;
fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
- /* Restore the previous address limt */
- set_fs(cur_fs);
-
return rc;
}
}
}
+/* For a partial-page punch, flush punch range to disk immediately */
+static void osd_partial_page_flush_punch(struct osd_device *d,
+ struct inode *inode, loff_t start,
+ loff_t end)
+{
+ if (osd_use_page_cache(d)) {
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ } else {
+ /* Notice we use "wait" version to ensure I/O is complete */
+ filemap_write_and_wait_range(inode->i_mapping, start, end);
+ invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ }
+}
+
+/*
+ * For a partial-page truncate, flush the page to disk immediately to
+ * avoid data corruption during direct disk write. b=17397
+ */
+static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
+ loff_t offset)
+{
+ if (!(offset & ~PAGE_MASK))
+ return;
+
+ if (osd_use_page_cache(d)) {
+ filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
+ } else {
+ /* Notice we use "wait" version to ensure I/O is complete */
+ filemap_write_and_wait_range(inode->i_mapping, offset,
+ offset + 1);
+ invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
+ offset >> PAGE_SHIFT);
+ }
+}
+
void osd_execute_truncate(struct osd_object *obj)
{
struct osd_device *d = osd_obj2dev(obj);
spin_unlock(&inode->i_lock);
osd_dirty_inode(inode, I_DIRTY_DATASYNC);
}
+ osd_partial_page_flush(d, inode, size);
+}
- /*
- * For a partial-page truncate, flush the page to disk immediately to
- * avoid data corruption during direct disk write. b=17397
- */
- if ((size & ~PAGE_MASK) == 0)
- return;
- if (osd_use_page_cache(d)) {
- filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
- } else {
- /* Notice we use "wait" version to ensure I/O is complete */
- filemap_write_and_wait_range(inode->i_mapping, size, size + 1);
- invalidate_mapping_pages(inode->i_mapping, size >> PAGE_SHIFT,
- size >> PAGE_SHIFT);
- }
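+/*
+ * Execute a deferred hole punch via the backing file's fallocate
+ * method, then flush the punched range to disk.
+ */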
+void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
+ loff_t start, loff_t end, int mode)
+{
+ struct osd_device *d = osd_obj2dev(obj);
+ struct inode *inode = obj->oo_inode;
+ struct file *file = osd_quasi_file(env, inode);
+
+ file->f_op->fallocate(file, mode, start, end - start);
+ osd_partial_page_flush_punch(d, inode, start, end - 1);
}
-void osd_process_truncates(struct list_head *list)
+void osd_process_truncates(const struct lu_env *env, struct list_head *list)
{
struct osd_access_lock *al;
list_for_each_entry(al, list, tl_list) {
if (al->tl_shared)
continue;
- if (!al->tl_truncate)
- continue;
- osd_execute_truncate(al->tl_obj);
+ if (al->tl_truncate)
+ osd_execute_truncate(al->tl_obj);
+ else if (al->tl_punch)
+ osd_execute_punch(env, al->tl_obj, al->tl_start,
+ al->tl_end, al->tl_mode);
}
}