*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lustre/osd/osd_io.c
*
GOTO(cleanup, rc);
/*
* decay extent blocks if we could allocate
- * good large(1M) extent.
+ * good large extent.
*/
- if (previous_total == 0 &&
- total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+ if (total - previous_total >=
+ osd_extent_bytes(osd) >> inode->i_blkbits)
osd_decay_extent_bytes(osd,
- total << inode->i_blkbits);
+ (total - previous_total) << inode->i_blkbits);
/* look for next extent */
fp = NULL;
blocks += blocks_per_page * clen;
sector_t start;
struct fiemap_extent_info fei = { 0 };
struct fiemap_extent fe = { 0 };
- mm_segment_t saved_fs;
int rc;
if (block >= cached_extent->start && block < cached_extent->end)
fei.fi_extents_max = 1;
fei.fi_extents_start = &fe;
- saved_fs = get_fs();
- set_fs(KERNEL_DS);
rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
- set_fs(saved_fs);
if (rc != 0)
return 0;
start = fe.fe_logical >> inode->i_blkbits;
+ if (fei.fi_extents_mapped == 0) {
+	/* A special case - no extent was found at this offset or beyond,
+	 * so treat the rest of the file as a hole up to EOF.  Caching
+	 * this is safe because other threads cannot allocate or punch
+	 * blocks this thread is working on (protected by LDLM). */
+ cached_extent->start = block;
+ cached_extent->end = i_size_read(inode) >> inode->i_blkbits;
+ cached_extent->mapped = 0;
+ return 0;
+ }
if (start > block) {
cached_extent->start = block;
return cached_extent->mapped;
}
+#define MAX_EXTENTS_PER_WRITE 100
static int osd_declare_write_commit(const struct lu_env *env,
struct dt_object *dt,
struct niobuf_local *lnb, int npages,
if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
if (extent.end != 0)
extents += (extent.end - extent.start +
- extent_bytes - 1) / extent_bytes;
+ extent_bytes - 1) / extent_bytes;
extent.start = lnb[i].lnb_file_offset;
extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
} else {
extents += (extent.end - extent.start +
extent_bytes - 1) / extent_bytes;
+	/**
+	 * As filesystem usage grows, mballoc no longer tries as hard to
+	 * scan block groups for the best-aligned free extent, so the
+	 * per-extent byte estimate can decay to a very small value and
+	 * cause us to reserve far too many credits.  Be more optimistic
+	 * in the credit reservation: even on a nearly full filesystem it
+	 * is extremely unlikely that the worst case would ever be hit.
+	 */
+ if (extents > MAX_EXTENTS_PER_WRITE)
+ extents = MAX_EXTENTS_PER_WRITE;
+
/*
* each extent can go into new leaf causing a split
* 5 is max tree depth: inode + 4 index blocks
else
credits += extents;
+ CDEBUG(D_INODE,
+ "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+ osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+ credits);
+
out_declare:
osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
ENTRY;
/*
- * Only mode == 0 (which is standard prealloc) is supported now.
+ * mode == 0 (which is standard prealloc) and PUNCH is supported
* Rest of mode options is not supported yet.
*/
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ RETURN(-EOPNOTSUPP);
+
+ /* disable fallocate completely */
+ if (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks < 0)
RETURN(-EOPNOTSUPP);
LASSERT(th);
LASSERT(inode);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ rc = osd_declare_inode_qid(env, i_uid_read(inode),
+ i_gid_read(inode),
+ i_projid_read(inode), 0, oh,
+ osd_dt_obj(dt), NULL, OSD_QID_BLK);
+ if (rc == 0)
+ rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+ RETURN(rc);
+ }
+
/* quota space for metadata blocks
* approximate metadata estimate should be good enough.
*/
RETURN(rc);
}
-static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
- __u64 start, __u64 end, int mode, struct thandle *th)
+static int osd_fallocate_preallocate(const struct lu_env *env,
+ struct dt_object *dt,
+ __u64 start, __u64 end, int mode,
+ struct thandle *th)
{
struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
handle_t *handle = ldiskfs_journal_current_handle();
boff = start >> inode->i_blkbits;
blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
- /* Create and Write zeros to new extents */
- flags = LDISKFS_GET_BLOCKS_CREATE_ZERO;
+ /* Create and mark new extents as either zero or unwritten */
+ flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+ LDISKFS_GET_BLOCKS_CREATE_ZERO :
+ LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
RETURN(rc);
}
+/*
+ * Record a deferred hole punch.  The punch is not performed here:
+ * the range is stashed on this transaction's exclusive truncate lock
+ * for the object (taken via osd_trunc_lock() in the declare phase)
+ * and executed later from osd_trans_stop() via osd_execute_punch(),
+ * outside the open journal handle.
+ */
+static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
+			       __u64 start, __u64 end, int mode,
+			       struct thandle *th)
+{
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode *inode = obj->oo_inode;
+	struct osd_access_lock *al;
+	struct osd_thandle *oh;
+	int rc = 0, found = 0;
+
+	ENTRY;
+
+	LASSERT(dt_object_exists(dt));
+	LASSERT(osd_invariant(obj));
+	LASSERT(inode != NULL);
+
+	dquot_initialize(inode);
+
+	LASSERT(th);
+	oh = container_of(th, struct osd_thandle, ot_super);
+	LASSERT(oh->ot_handle->h_transaction != NULL);
+
+	/* locate the exclusive truncate lock held for this object */
+	list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+		if (obj != al->tl_obj)
+			continue;
+		LASSERT(al->tl_shared == 0);
+		found = 1;
+		/* do actual punch in osd_trans_stop() */
+		al->tl_start = start;
+		al->tl_end = end;
+		al->tl_mode = mode;
+		al->tl_punch = true;
+		break;
+	}
+
+	/* NOTE(review): 'found' is set but never checked; if no matching
+	 * lock exists the punch is silently skipped - confirm intended. */
+	RETURN(rc);
+}
+
+/*
+ * Fallocate entry point: dispatch on the mode flags to either the
+ * deferred hole-punch path or the standard preallocation path.
+ * Unsupported mode bits are presumably rejected earlier in the
+ * declare phase (see the FALLOC_FL mask check there).
+ */
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+			 __u64 start, __u64 end, int mode, struct thandle *th)
+{
+	int rc;
+
+	ENTRY;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		/* punch */
+		rc = osd_fallocate_punch(env, dt, start, end, mode, th);
+	} else {
+		/* standard preallocate */
+		rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
+	}
+	RETURN(rc);
+}
+
static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
__u64 start, __u64 end, struct thandle *th)
{
struct inode *inode = osd_dt_obj(dt)->oo_inode;
u64 len;
int rc;
- mm_segment_t cur_fs;
LASSERT(inode);
if (inode->i_op->fiemap == NULL)
if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
filemap_write_and_wait(inode->i_mapping);
- /* Save previous value address limit */
- cur_fs = get_fs();
- /* Set the address limit of the kernel */
- set_fs(KERNEL_DS);
-
rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
fm->fm_flags = fieinfo.fi_flags;
fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
- /* Restore the previous address limt */
- set_fs(cur_fs);
-
return rc;
}
}
}
+/*
+ * For a partial-page truncate (or punch boundary), flush the page
+ * containing @offset to disk immediately to avoid data corruption
+ * during direct disk write. b=17397
+ */
+static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
+				   loff_t offset)
+{
+	/* nothing to flush when offset is page-aligned */
+	if (!(offset & ~PAGE_MASK))
+		return;
+
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, offset,
+					     offset + 1);
+		invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
+					 offset >> PAGE_SHIFT);
+	}
+}
+
void osd_execute_truncate(struct osd_object *obj)
{
struct osd_device *d = osd_obj2dev(obj);
spin_unlock(&inode->i_lock);
osd_dirty_inode(inode, I_DIRTY_DATASYNC);
}
+ osd_partial_page_flush(d, inode, size);
+}
- /*
- * For a partial-page truncate, flush the page to disk immediately to
- * avoid data corruption during direct disk write. b=17397
- */
- if ((size & ~PAGE_MASK) == 0)
- return;
- if (osd_use_page_cache(d)) {
- filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
- } else {
- /* Notice we use "wait" version to ensure I/O is complete */
- filemap_write_and_wait_range(inode->i_mapping, size, size + 1);
- invalidate_mapping_pages(inode->i_mapping, size >> PAGE_SHIFT,
- size >> PAGE_SHIFT);
- }
+/*
+ * Execute a hole punch recorded by osd_fallocate_punch().  Called
+ * from osd_process_truncates() after the transaction has stopped.
+ * Partial pages at both ends of the punched range are flushed to
+ * avoid corruption during direct disk writes.
+ */
+void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
+		       loff_t start, loff_t end, int mode)
+{
+	struct osd_device *d = osd_obj2dev(obj);
+	struct inode *inode = obj->oo_inode;
+	struct file *file = osd_quasi_file(env, inode);
+
+	/* NOTE(review): the fallocate return value is discarded, so
+	 * punch failures are never reported - confirm this is intended. */
+	file->f_op->fallocate(file, mode, start, end - start);
+	osd_partial_page_flush(d, inode, start);
+	osd_partial_page_flush(d, inode, end - 1);
+}
-void osd_process_truncates(struct list_head *list)
+/*
+ * Walk the transaction's truncate-lock list and apply each deferred
+ * truncate or punch recorded on an exclusive (non-shared) lock.
+ */
+void osd_process_truncates(const struct lu_env *env, struct list_head *list)
{
	struct osd_access_lock *al;
	list_for_each_entry(al, list, tl_list) {
		if (al->tl_shared)
			continue;
-		if (!al->tl_truncate)
-			continue;
-		osd_execute_truncate(al->tl_obj);
+		if (al->tl_truncate)
+			osd_execute_truncate(al->tl_obj);
+		else if (al->tl_punch)
+			osd_execute_punch(env, al->tl_obj, al->tl_start,
+					  al->tl_end, al->tl_mode);
	}
}