DT_BUFS_TYPE_LOCAL = 0x0004,
};
+/*
+ * Supplementary error hint filled in through the error_code argument of
+ * dbo_declare_fallocate(); it accompanies an -EOPNOTSUPP return so that the
+ * caller can tell whether a fallback is possible.
+ */
+enum dt_fallocate_error_t {
+ DT_FALLOC_ERR_NONE = 0x0000, /* no additional hint */
+ DT_FALLOC_ERR_NEED_ZERO = 0x0001, /* need to fill zero by brw */
+};
+
/*
* Per-dt-object operations on "file body" - unstructure raw data.
*/
*/
int (*dbo_declare_fallocate)(const struct lu_env *env,
struct dt_object *dt, __u64 start,
- __u64 end, int mode, struct thandle *th);
+ __u64 end, int mode, struct thandle *th,
+ enum dt_fallocate_error_t *error_code);
/**
* dbo_fallocate() - Allocate specified region for an object
static inline int dt_declare_fallocate(const struct lu_env *env,
struct dt_object *dt, __u64 start,
- __u64 end, int mode, struct thandle *th)
+ __u64 end, int mode, struct thandle *th,
+ enum dt_fallocate_error_t *error_code)
{
LASSERT(dt);
return -EOPNOTSUPP;
return dt->do_body_ops->dbo_declare_fallocate(env, dt, start, end,
- mode, th);
+ mode, th, error_code);
}
static inline int dt_falloc(const struct lu_env *env, struct dt_object *dt,
RETURN(-EOPNOTSUPP);
/*
- * mode == 0 (which is standard prealloc) and PUNCH is supported
+ * mode == 0 (which is standard prealloc) and PUNCH/ZERO are supported
* Rest of mode options are not supported yet.
*/
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
RETURN(rc);
}
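+/*
+ * @do_up_read: whether to release mo->mot_dom_sem with up_read() on exit.
+ * Pass true when the semaphore was taken with down_read() for this write.
+ * Pass false when write() is reached through another code path in which
+ * mot_dom_sem is held differently (e.g. the fallocate zero-fill path, whose
+ * caller releases it with up_write()); calling up_read() there would leave
+ * the semaphore in an inconsistent state or deadlock.
+ */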
static int mdt_commitrw_write(const struct lu_env *env, struct obd_export *exp,
struct mdt_device *mdt, struct mdt_object *mo,
struct lu_attr *la, struct obdo *oa, int objcount,
int niocount, struct niobuf_local *lnb,
- unsigned long granted, int old_rc)
+ unsigned long granted, int old_rc,
+ bool do_up_read)
{
struct dt_device *dt = mdt->mdt_bottom;
struct dt_object *dob;
out:
dt_bufs_put(env, dob, lnb, niocount);
- up_read(&mo->mot_dom_sem);
+ if (do_up_read)
+ up_read(&mo->mot_dom_sem);
if (granted > 0)
tgt_grant_commit(exp, granted, old_rc);
if (rc)
ktime_us_delta(ktime_get(), kstart));
rc = mdt_commitrw_write(env, exp, mdt, mo, la, oa, objcount,
- npages, lnb, oa->o_grant_used, old_rc);
+ npages, lnb, oa->o_grant_used, old_rc,
+ true);
if (rc == 0)
obdo_from_la(oa, la, VALID_FLAGS | LA_GID | LA_UID);
else
static int mdt_object_fallocate(const struct lu_env *env, struct dt_device *dt,
struct dt_object *dob, __u64 start, __u64 end,
- int mode, struct lu_attr *la)
+ int mode, struct lu_attr *la,
+ enum dt_fallocate_error_t *error_code)
{
int rc;
bool restart;
if (rc)
GOTO(stop, rc);
- rc = dt_declare_fallocate(env, dob, start, end, mode, th);
+ rc = dt_declare_fallocate(env, dob, start, end, mode, th,
+ error_code);
if (rc)
GOTO(stop, rc);
}
/**
+ * mdt_object_fallocate_zero(): brw(ZERO) over specified region
+ * @mo: object to be applied on
+ * @start: region start position
+ * @end: region end position
+ *
+ * There maybe cases when we need to use BRW to mimic fallocate ops,
+ * e.g. when fallocate(zero) is invoked on indirect-mapping inode.
+ */
+static int
+mdt_object_fallocate_zero(const struct lu_env *env, struct obd_export *exp,
+ struct mdt_device *mdt, struct mdt_object *mo,
+ __u64 start, __u64 end, struct lu_attr *la)
+{
+ struct tgt_thread_big_cache *tbc = NULL;
+ struct dt_object *dob = mdt_obj2dt(mo);
+ struct niobuf_local *lnbs = NULL;
+ struct obdo oa;
+ int npages = 0;
+ int rc = 0;
+
+ LASSERT(env->le_ses->lc_thread->t_data);
+ tbc = env->le_ses->lc_thread->t_data;
+ while (start < end) {
+ struct niobuf_remote rnb;
+ /* limit memory usage each round to ~64KB */
+ int mem_threshold = 65536;
+ __u64 next_end = 0;
+ int i = 0;
+
+ oa.o_size = 0;
+ lnbs = NULL;
+ npages = 0;
+
+ next_end = (start + mem_threshold + PAGE_SIZE - 1) & PAGE_MASK;
+ next_end = min(end, next_end);
+ rnb.rnb_offset = start;
+ rnb.rnb_len = next_end - start;
+ rc = dt_bufs_get(env, dob, &rnb, tbc->local,
+ PTLRPC_MAX_BRW_PAGES, DT_BUFS_TYPE_WRITE);
+ if (unlikely(rc < 0))
+ GOTO(out, rc);
+
+ npages = rc;
+ lnbs = tbc->local;
+ /* read in partial pages, then zero out rest part */
+ rc = dt_write_prep(env, dob, lnbs, npages);
+ if (rc)
+ GOTO(out, rc);
+
+ for (i = 0; i < npages; i++) {
+ memset(kmap(lnbs[i].lnb_page) + lnbs[i].lnb_page_offset,
+ 0, lnbs[i].lnb_len);
+ kunmap(lnbs[i].lnb_page);
+ }
+
+ /* mdt_write will handle write, resource put, etc. */
+ rc = mdt_commitrw_write(env, exp, mdt, mo, la, &oa, 0, npages,
+ lnbs, 0, 0, false);
+ if (rc)
+ GOTO(out, rc);
+
+ start = next_end;
+ }
+ npages = 0;
+ lnbs = NULL;
+out:
+ if (npages && lnbs)
+ dt_bufs_put(env, dob, lnbs, npages);
+ RETURN(rc);
+}
+
+/**
* MDT request handler for OST_FALLOCATE RPC.
*
* This is part of request processing. Validate request fields,
struct lu_attr *la;
__u64 flags = 0;
struct lustre_handle lh = { 0, };
+ enum dt_fallocate_error_t error_code = DT_FALLOC_ERR_NONE;
int rc, mode;
__u64 start, end;
bool srvlock;
PFID(&tsi->tsi_fid), mode, start, end);
/*
- * mode == 0 (which is standard prealloc) and PUNCH is supported
+ * mode == 0 (which is standard prealloc) and PUNCH/ZERO are supported
* Rest of mode options are not supported yet.
*/
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
tgt_fmd_update(tsi->tsi_exp, &tsi->tsi_fid,
tgt_ses_req(tsi)->rq_xid);
- rc = mdt_object_fallocate(tsi->tsi_env, mdt->mdt_bottom, dob, start,
- end, mode, la);
+ rc = mdt_object_fallocate(tsi->tsi_env, mdt->mdt_bottom, dob,
+ start, end, mode, la, &error_code);
+ /* in case file is indirect-mapping, mimic brw */
+ if (rc == -EOPNOTSUPP && error_code == DT_FALLOC_ERR_NEED_ZERO)
+ rc = mdt_object_fallocate_zero(tsi->tsi_env, exp, mdt,
+ mo, start, end, la);
+
up_write(&mo->mot_dom_sem);
if (rc)
GOTO(out_put, rc);
mode = oa->o_falloc_mode;
/*
- * mode == 0 (which is standard prealloc) and PUNCH is supported
+ * mode == 0 (which is standard prealloc) and PUNCH/ZERO are supported
* Rest of mode options are not supported yet.
*/
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
* warning.
*/
if (mode & FALLOC_FL_PUNCH_HOLE && !(mode & FALLOC_FL_KEEP_SIZE)) {
- CWARN("%s: PUNCH mode misses KEEP_SIZE flag, setting it\n",
+ CDEBUG(D_INFO, "%s: PUNCH mode misses KEEP_SIZE flag, setting it\n",
tsi->tsi_tgt->lut_obd->obd_name);
mode |= FALLOC_FL_KEEP_SIZE;
}
GOTO(stop, rc);
}
- rc = dt_declare_fallocate(env, dob, start, end, mode, th);
+ rc = dt_declare_fallocate(env, dob, start, end, mode, th, NULL);
if (rc)
GOTO(stop, rc);
RETURN(rc);
}
+ osc_set_io_portal(req);
+ ptlrpc_at_set_req_timeout(req);
+
osc_pack_req_body(req, oa);
ptlrpc_request_set_replen(req);
static int osd_declare_fallocate(const struct lu_env *env,
struct dt_object *dt, __u64 start, __u64 end,
- int mode, struct thandle *th)
+ int mode, struct thandle *th,
+ enum dt_fallocate_error_t *error_code)
{
struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
ENTRY;
/*
- * mode == 0 (which is standard prealloc) and PUNCH/ZERO is supported
+ * mode == 0 (which is standard prealloc) and PUNCH/ZERO are supported
* Rest of mode options is not supported yet.
*/
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_ZERO_RANGE))
RETURN(-EOPNOTSUPP);
- /* TODO: should fix this for DoM/Indirect in another patch */
- if ((mode & FALLOC_FL_ZERO_RANGE) &&
- !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS))
- RETURN(-EOPNOTSUPP);
-
/* disable fallocate completely */
if (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks < 0)
RETURN(-EOPNOTSUPP);
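+ /*
+ * ZERO_RANGE on a non-extent (indirect-mapped) inode is not handled
+ * here: report DT_FALLOC_ERR_NEED_ZERO through @error_code so the
+ * caller can retry through another code path (zero-fill by BRW).
+ * NOTE(review): a caller passing error_code == NULL trips the LASSERT
+ * below in this case - confirm every caller passes a valid pointer.
+ */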
+ if ((mode & FALLOC_FL_ZERO_RANGE) &&
+ !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) {
+ LASSERT(error_code);
+ *error_code = DT_FALLOC_ERR_NEED_ZERO;
+ RETURN(-EOPNOTSUPP);
+ }
+
LASSERT(th);
LASSERT(inode);
# Fallocate tests
(( $MDS1_VERSION >= $(version_code 2.14.52) )) &&
- testlist+=" 150b 150bb 150c 150d 150f 150g"
+ testlist+=" 150b 150bb 150c 150d 150f 150g 150ia 150ib 150ic"
SANITY_ONLY=${SANITY_ONLY:-$testlist}
SANITY_REPEAT=${SANITY_REPEAT:-1}
check_set_fallocate_or_skip
stack_trap "rm -f $DIR/$tfile; wait_delete_completed"
+ [[ "$DOM" == "yes" ]] &&
+ $LFS setstripe -E1M -L mdt -E eof $DIR/$tfile
+
echo "Verify fallocate(zero): range within the file"
yes 'A' | dd of=$DIR/$tfile bs=$PAGE_SIZE count=8 ||
- error "dd failed for bs 4096 and count 8"
+ error "dd failed for bs $PAGE_SIZE and count 8"
# zero range page aligned
local offset=$((2 * PAGE_SIZE))
check_set_fallocate_or_skip
stack_trap "rm -f $DIR/$tfile; wait_delete_completed"
+ [[ "$DOM" == "yes" ]] &&
+ $LFS setstripe -E1M -L mdt -E eof $DIR/$tfile
+
local blocks_after_punch=$((4 * PAGE_SIZE / 512))
local blocks_after_zero_fill=$((8 * PAGE_SIZE / 512))
- local blocks_after_extend=$((16 * PAGE_SIZE / 512))
+ local blocks_after_extend_ext=$((16 * PAGE_SIZE / 512))
+ local blocks_after_extend_ind=$((16 * PAGE_SIZE / 512 + 8))
local expect_len=$((8 * PAGE_SIZE))
# file size [0, 32K)
echo "Verify fallocate(zero): range within the file"
yes 'A' | dd of=$DIR/$tfile bs=$PAGE_SIZE count=8 ||
- error "dd failed for bs 4096 and count 8"
+ error "dd failed for bs $PAGE_SIZE and count 8"
# punch across [8K,24K)
local offset=$((2 * PAGE_SIZE))
length=$((8 * PAGE_SIZE))
out=$(fallocate -z -n --offset $offset -l $length $DIR/$tfile 2>&1) ||
skip_eopnotsupp "$out|falloc(zero): off $offset, len $length"
-
# block allocate, size remains
blocks=$(stat -c '%b' $DIR/$tfile)
- (( blocks == blocks_after_extend )) ||
- error "extend failed:$blocks!=$blocks_after_extend"
-
+ if [[ "$DOM" == "yes" ]]; then
+ (( blocks == blocks_after_extend_ind )) ||
+ error "extend failed:$blocks!=$blocks_after_extend_ind"
+ else
+ (( blocks == blocks_after_extend_ext )) ||
+ error "extend failed:$blocks!=$blocks_after_extend_ext"
+ fi
lsz=$(stat -c '%s' $DIR/$tfile)
(( lsz == expect_len)) ||
error "zero extend failed(len):$lsz!=$expect_len"
}
run_test 150ib "Verify fallocate zero-range PREALLOC functionality"
+test_150ic() {
+ (( $MDS1_VERSION >= $(version_code 2.16.54) )) ||
+ skip "need MDS1 version >= 2.16.54 for falloc zero-range"
+
+ if [[ "$ost1_FSTYPE" = "zfs" || "$mds1_FSTYPE" = "zfs" ]]; then
+ skip "zero-range mode is not implemented on OSD ZFS"
+ fi
+
+ [[ "$DOM" == "yes" ]] ||
+ skip "only check on DoM component"
+ check_set_fallocate_or_skip
+
+ # raise the DoM stripe size limit to 256M for this test; restored on exit
+ local MB1=1048576
+ local mdtname=${FSNAME}-MDT0000-mdtlov
+ local dom_limit_saved=$(do_facet mds1 $LCTL get_param -n \
+ lod.$mdtname.dom_stripesize)
+ local dom_limit=$((256 * MB1))
+ do_facet mds1 $LCTL set_param -n \
+ lod.$mdtname.dom_stripesize=$dom_limit
+ stack_trap "do_facet mds1 $LCTL set_param -n \
+ lod.$mdtname.dom_stripesize=$dom_limit_saved"
+ stack_trap "rm -f $DIR/$tfile; wait_delete_completed"
+
+ echo "Verify fallocate zero-range: range extending the file"
+ $LFS setstripe -E 256M -L mdt -E eof $DIR/$tfile ||
+ error "$LFS setstripe DoM failed"
+ # zero-range [0, 128M) on the empty DoM file; intended to trigger BRWs
+ local offset=0
+ local length=$((128 * MB1))
+ # With a 4KB block size one index block holds 1024 block pointers,
+ # so 128MB needs 32768 data blocks plus 1 (L2) + 32 (L1) index blocks;
+ # each 4KB block is counted as 8 512-byte units in the stat block count.
+ local want_blocks=$((128 * MB1 / 512 + 33 * 8))
+
+ touch $DIR/$tfile
+ out=$(fallocate -z --offset $offset -l $length $DIR/$tfile 2>&1) ||
+ skip_eopnotsupp "$out|fallocate: offset $offset and len $length"
+
+ # Verify the allocated block count matches the estimate above.
+ local blocks=$(stat -c '%b' $DIR/$tfile)
+ (( blocks == want_blocks )) ||
+ error "zero prealloc failed:$blocks!=$want_blocks"
+
+ local expect="fde9e0818281836e4fc0edfede2b8762"
+ local cksum=($(md5sum $DIR/$tfile))
+ [[ "${cksum[0]}" == "$expect" ]] ||
+ error "unexpected MD5SUM after fallo(large-zero): ${cksum[0]}"
+}
+run_test 150ic "Verify fallocate LARGE zero PREALLOC functionality"
+
#LU-2902 roc_hit was not able to read all values from lproc
function roc_hit_init() {
local osts=${1:-$(osts_nodes)}