/* FLR */
#define OBD_FAIL_FLR_LV_DELAY 0x1A01
-#define OBD_FAIL_FLR_LV_INC 0x1A02
+#define OBD_FAIL_FLR_LV_INC 0x1A02 /* unused since 2.15 */
#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03
/* DT */
#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \
LCME_FL_EXTENSION)
-/* the highest bit in obdo::o_layout_version is used to mark if the file is
- * being resynced. */
-#define LU_LAYOUT_RESYNC LCME_FL_NEG
-
/* lcme_id can be specified as certain flags, and the the first
* bit of lcme_id is used to indicate that the ID is representing
* certain LCME_FL_* but not a real ID. Which implies we can have
LCME_ID_NOT_ID = LCME_FL_NEG
};
+/* the layout version equals lcme_id, except that some bits have special
+ * meanings
+ */
+enum layout_version_flags {
+ /* the layout version has reached the high water mark and is about to
+ * be increased so that it circularly reuses the smallest value
+ */
+ LU_LAYOUT_HIGEN = 0x40000000,
+ /* the highest bit is used to mark if the file is being resynced */
+ LU_LAYOUT_RESYNC = 0x80000000,
+};
+
#define LCME_ID_MASK LCME_ID_MAX
struct lov_comp_md_entry_v1 {
bool lease_broken = false;
fmode_t fmode = 0;
enum mds_op_bias bias = 0;
+ int fdv;
struct file *layout_file = NULL;
void *data = NULL;
size_t data_size = 0;
bias = MDS_CLOSE_RESYNC_DONE;
break;
case LL_LEASE_LAYOUT_MERGE: {
- int fd;
if (ioc->lil_count != 1)
GOTO(out_lease_close, rc = -EINVAL);
arg += sizeof(*ioc);
- if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
+ if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
GOTO(out_lease_close, rc = -EFAULT);
- layout_file = fget(fd);
+ layout_file = fget(fdv);
if (!layout_file)
GOTO(out_lease_close, rc = -EBADF);
break;
}
case LL_LEASE_LAYOUT_SPLIT: {
- int fdv;
int mirror_id;
if (ioc->lil_count != 2)
EXIT;
out:
- switch (ioc->lil_flags) {
- case LL_LEASE_RESYNC_DONE:
- if (data)
- OBD_FREE(data, data_size);
- break;
- case LL_LEASE_LAYOUT_MERGE:
- case LL_LEASE_LAYOUT_SPLIT:
- if (layout_file)
- fput(layout_file);
+ if (ioc->lil_flags == LL_LEASE_RESYNC_DONE && data)
+ OBD_FREE(data, data_size);
- ll_layout_refresh(inode, &fd->fd_layout_version);
- break;
- case LL_LEASE_PCC_ATTACH:
+ if (layout_file)
+ fput(layout_file);
+
+ if (ioc->lil_flags == LL_LEASE_PCC_ATTACH) {
if (!rc)
rc = rc2;
rc = pcc_readwrite_attach_fini(file, inode,
param.pa_layout_gen,
lease_broken, rc,
attached);
- break;
}
+ ll_layout_refresh(inode, &fd->fd_layout_version);
+
if (!rc)
rc = ll_lease_type_from_fmode(fmode);
RETURN(rc);
lo->ldo_layout_gen++;
lo->ldo_layout_gen |= preserve;
/* Zero is not a valid generation */
- if (unlikely((lo->ldo_layout_gen & LCME_ID_MASK) == 0))
+ if (unlikely((lo->ldo_layout_gen & LCME_ID_MASK) == 0)) {
lo->ldo_layout_gen++;
+ lo->ldo_layout_gen &= ~LU_LAYOUT_RESYNC;
+ }
}
struct lod_it {
RETURN(rc);
}
-static bool lod_obj_attr_set_comp_skip_cb(const struct lu_env *env,
- struct lod_object *lo, int comp_idx,
- struct lod_obj_stripe_cb_data *data)
-{
- struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
- bool skipped = false;
-
- if (!(data->locd_attr->la_valid & LA_LAYOUT_VERSION))
- return skipped;
-
- switch (lo->ldo_flr_state) {
- case LCM_FL_WRITE_PENDING: {
- int i;
-
- /* skip stale components */
- if (lod_comp->llc_flags & LCME_FL_STALE) {
- skipped = true;
- break;
- }
-
- /* skip valid and overlapping components, therefore any
- * attempts to write overlapped components will never succeed
- * because client will get EINPROGRESS. */
- for (i = 0; i < lo->ldo_comp_cnt; i++) {
- if (i == comp_idx)
- continue;
-
- if (lo->ldo_comp_entries[i].llc_flags & LCME_FL_STALE)
- continue;
-
- if (lu_extent_is_overlapped(&lod_comp->llc_extent,
- &lo->ldo_comp_entries[i].llc_extent)) {
- skipped = true;
- break;
- }
- }
- break;
- }
- case LCM_FL_RDONLY:
- case LCM_FL_SYNC_PENDING:
- break;
- default:
- LASSERTF(0, "impossible: %d\n", lo->ldo_flr_state);
- break;
- }
-
- CDEBUG(D_LAYOUT, DFID": %s to set component %x to version: %u\n",
- PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
- skipped ? "skipped" : "chose", lod_comp->llc_id,
- data->locd_attr->la_layout_version);
-
- return skipped;
-}
-
static inline int
lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
struct dt_object *dt, struct thandle *th,
data.locd_attr = attr;
data.locd_declare = false;
- data.locd_comp_skip_cb = lod_obj_attr_set_comp_skip_cb;
data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
rc = lod_obj_for_each_stripe(env, lo, th, &data);
}
/* fix on-disk layout gen */
lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
-
/* transfer layout version to OST objects. */
if (lo->ldo_mirror_count > 1) {
struct lod_obj_stripe_cb_data data = { {0} };
* This way it can make sure that the layout version is
* monotonously increased in this writing era. */
lod_obj_inc_layout_gen(lo);
- if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
- __u32 layout_version;
-
- get_random_bytes(&layout_version, sizeof(layout_version));
- lo->ldo_layout_gen = layout_version & 0xffff;
- }
rc = lod_declare_instantiate_components(env, lo, th, 0);
if (rc)
GOTO(out, rc);
layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
+ layout_attr->la_layout_version = 0;
if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
if (rc)
GOTO(out, rc);
+ lod_obj_inc_layout_gen(lo);
+
/* 3. transfer layout version to OST objects.
* transfer new layout version to OST objects so that stale writes
* can be denied. It also ends an era of writing by setting
* LU_LAYOUT_RESYNC. Normal client can never use this bit to
* send write RPC; only resync RPCs could do it. */
layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
+ layout_attr->la_layout_version = 0;
if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
if (rc)
GOTO(out, rc);
-
- lod_obj_inc_layout_gen(lo);
out:
if (rc)
lod_striping_free(env, lo);
lod_obj_inc_layout_gen(lo);
layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
+ layout_attr->la_layout_version = 0;
rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
if (rc)
GOTO(out, rc);
}
lod_comp->llc_stripe_count = 0;
} else {
+ lod_comp->llc_layout_gen = 0;
lod_comp->llc_stripe = stripe;
lod_comp->llc_ost_indices = ost_indices;
lod_comp->llc_stripes_allocated = stripe_len;
GOTO(out, rc);
rc = mdo_xattr_del(env, vic, XATTR_NAME_LOV, handle);
- if (rc) /* wtf? */
+ if (rc)
GOTO(out_restore, rc);
(void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle,
return 0;
}
+/**
+ * Check whether the requested IO carries an older layout version than the
+ * one on the disk.
+ *
+ * The LU_LAYOUT_RESYNC bit is masked off from both versions before they are
+ * compared.
+ *
+ * \param[in] req_version	layout version carried by the request
+ * \param[in] ondisk_version	layout version stored on the disk
+ *
+ * \retval true		the masked request version is smaller than the masked
+ *			on-disk version and both have the same
+ *			LU_LAYOUT_HIGEN bit
+ * \retval false	otherwise
+ */
+static inline bool ofd_layout_version_less(__u32 req_version,
+ __u32 ondisk_version)
+{
+ __u32 req = req_version & ~LU_LAYOUT_RESYNC;
+ __u32 ondisk = ondisk_version & ~LU_LAYOUT_RESYNC;
+
+ /**
+ * request layout version could be circularly increased to the smallest
+ * value, in that case @req < @ondisk but @req does not have the high
+ * end bit set while @ondisk does. Because the LU_LAYOUT_HIGEN bits
+ * then differ, such a request is not reported as older.
+ */
+ return (req < ondisk) &&
+ ((req & LU_LAYOUT_HIGEN) == (ondisk & LU_LAYOUT_HIGEN));
+}
+
#endif /* _OFD_INTERNAL_H */
int ofd_verify_layout_version(const struct lu_env *env,
struct ofd_object *fo, const struct obdo *oa)
{
- __u32 layout_version;
int rc;
ENTRY;
rc = ofd_object_ff_load(env, fo);
if (rc < 0) {
if (rc == -ENODATA)
- rc = -EINPROGRESS;
+ rc = 0;
GOTO(out, rc);
}
- layout_version = fo->ofo_ff.ff_layout_version;
- if (oa->o_layout_version >= layout_version &&
- oa->o_layout_version <= layout_version + fo->ofo_ff.ff_range)
- GOTO(out, rc = 0);
-
- /* normal traffic, decide if to return ESTALE or EINPROGRESS */
- layout_version &= ~LU_LAYOUT_RESYNC;
-
- /* this update is not legitimate */
- if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) <= layout_version)
+ /**
+ * reject the update with -ESTALE if its layout version is older
+ * than that on the disk; such an update is not legitimate.
+ */
+ if (ofd_layout_version_less(oa->o_layout_version,
+ fo->ofo_ff.ff_layout_version))
GOTO(out, rc = -ESTALE);
- /* layout version may not be transmitted yet */
- if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) > layout_version)
- GOTO(out, rc = -EINPROGRESS);
-
- EXIT;
-
out:
- CDEBUG(D_INODE, DFID " verify layout version: %u vs. %u/%u, rc: %d\n",
+ CDEBUG(D_INODE, DFID " verify layout version: %u vs. %u/%u: rc = %d\n",
PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
oa->o_layout_version, fo->ofo_ff.ff_layout_version,
fo->ofo_ff.ff_range, rc);
- return rc;
+ RETURN(rc);
}
rc = ofd_verify_layout_version(env, fo, oa);
if (rc)
GOTO(err, rc);
- oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
}
rc = dt_write_prep(env, ofd_object_child(fo), lnb, *nr_local);
PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
ff->ff_layout_version, oa->o_layout_version);
- /* only the MDS has the authority to update layout version */
- if (!(exp_connect_flags(ofd_info(env)->fti_exp) &
- OBD_CONNECT_MDS)) {
- CERROR(DFID": update layout version from client\n",
- PFID(&fo->ofo_ff.ff_parent));
-
- RETURN(-EPERM);
- }
-
+ /**
+ * A resync write from a client on non-primary objects and a
+ * resync start from the MDS on primary objects both carry the
+ * LU_LAYOUT_RESYNC flag in @oa.
+ *
+ * The layout version of a write/punch from a client is checked
+ * in ofd_verify_layout_version() before reaching this point, so
+ * a resync from a client with a smaller layout version is
+ * rejected there and the biggest resync version is the one
+ * recorded in the OFD objects.
+ */
if (ff->ff_layout_version & LU_LAYOUT_RESYNC) {
/* this opens a new era of writing */
ff->ff_layout_version = 0;
}
/* it's not allowed to change it to a smaller value */
- if (oa->o_layout_version < ff->ff_layout_version)
+ if (ofd_layout_version_less(oa->o_layout_version,
+ ff->ff_layout_version))
RETURN(-EINVAL);
if (ff->ff_layout_version == 0 ||
rc = ofd_verify_layout_version(env, fo, oa);
if (rc)
GOTO(unlock, rc);
-
- oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
}
if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags & LUSTRE_ENCRYPT_FL) {
else
body->oa.o_valid |= rec->lsr_valid;
- if (body->oa.o_valid & OBD_MD_LAYOUT_VERSION) {
- OBD_FAIL_TIMEOUT(OBD_FAIL_FLR_LV_DELAY, cfs_fail_val);
- if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_FLR_LV_INC)))
- body->oa.o_layout_version = LU_LAYOUT_RESYNC |
- (body->oa.o_layout_version + 1);
- }
-
osp_sync_send_new_rpc(d, llh, h, req);
RETURN(0);
}
}
run_test 35 "allow to write to mirrored files"
+get_file_layout_version() { # lcm_layout_gen value from "$LFS getstripe $1"
+ $LFS getstripe $1 | awk '/lcm_layout_gen/{print $2}'
+}
+
+get_ost_layout_version() { # ostlayoutversion value from "$MULTIOP $1 oXc"
+ $MULTIOP $1 oXc | awk '/ostlayoutversion/{print $2}'
+}
+
verify_ost_layout_version() {
local tf=$1
# get file layout version
- local flv=$($LFS getstripe $tf | awk '/lcm_layout_gen/{print $2}')
+ local flv=$(get_file_layout_version $tf)
# layout version from OST objects
- local olv=$($MULTIOP $tf oXc | awk '/ostlayoutversion/{print $2}')
+ local olv=$(get_ost_layout_version $tf)
- [ $flv -eq $olv ] || error "layout version mismatch: $flv vs. $olv"
+ (( flv >= olv )) || error "layout version mismatch: $flv vs. $olv"
}
create_file_36() {
done
}
-test_36() {
+test_36a() {
local tf=$DIR/$tfile
stack_trap "rm -f $tf $tf-2 $tf-3"
local st=$(date +%s)
$MULTIOP $tf-2 oO_WRONLY:w1024Yc || error "write mirrored file error"
- [ $(date +%s) -ge $((st+delay_sec)) ] ||
- error "write finished before layout version is transmitted"
-
# verify OST layout version
verify_ost_layout_version $tf
do_facet $mds_facet $LCTL set_param fail_loc=0
+}
+run_test 36a "write to mirrored files"
- # test case 3
- mds_idx=mds$(($($LFS getstripe -m $tf-3) + 1))
+test_36b() {
+ local tf=$DIR/$tfile
- #define OBD_FAIL_FLR_LV_INC 0x1A02
- do_facet $mds_facet $LCTL set_param fail_loc=0x1A02
+ (( OST1_VERSION >= $(version_code 2.15.50) )) ||
+ skip "Need OST version at least 2.15.50"
- # write open file should return error
- $MULTIOP $tf-3 oO_WRONLY:O_SYNC:w1024c &&
- error "write a mirrored file succeeded" || true
+ (( OSTCOUNT >= 2 )) || skip "need >= 2 OSTs"
- do_facet $mds_facet $LCTL set_param fail_loc=0
+ # create 2 mirrors using different OSTs
+ $LFS setstripe -N -c1 -i0 --flags=prefer -N -c1 -i1 $tf ||
+ error "create mirrored file"
+
+ # write 1M of data to the file and flush it with sync
+ dd if=/dev/zero of=$tf bs=1M count=1 || error "write file error"
+ sync
+
+ # move the prefer flag to the other mirror: clear it on component
+ # 0x10001, then set it on component 0x20002
+ $LFS setstripe --comp-set -I0x10001 --comp-flags=^prefer $tf ||
+ error "clear prefer mirror error"
+ $LFS setstripe --comp-set -I0x20002 --comp-flags=prefer $tf ||
+ error "set prefer mirror error"
+
+ # the second write should not hang
+ dd if=/dev/zero of=$tf bs=1M count=1 || error "write file error"
+}
+run_test 36b "write should not hung when prefered mirror is stale"
+
+test_36c() {
+ local tf=$DIR/$tfile
+
+ (( OST1_VERSION >= $(version_code 2.15.50) )) ||
+ skip "Need OST version at least 2.15.50"
+
+ (( OSTCOUNT >= 2 )) || skip "need >= 2 OSTs"
+
+ # create 2 mirrors using different OSTs
+ $LFS setstripe -N -c1 -i0 --flags=prefer -N -c1 -i1 $tf ||
+ error "create mirrored file"
+
+ # start a 600M write to the file in the background
+ dd if=/dev/zero of=$tf bs=1M count=600 &
+ local pid=$!
+
+ sleep 1
+
+ $LFS setstripe --comp-set -I0x10001 --comp-flags=^prefer $tf ||
+ error "clear prefer mirror error"
+ $LFS setstripe --comp-set -I0x20002 --comp-flags=prefer $tf ||
+ error "set prefer mirror error"
+
+ wait $pid # block until the background dd exits
+}
+run_test 36c "change prefer mirror during write shouldn't hung"
+
+# Create a mirrored file and print the file layout version (flv) and the
+# OST layout version (olv) after each mirror extend, write, resync and
+# truncate. After each write flv must equal olv; after each truncate flv
+# must equal olv or olv + 1.
+test_36d() {
+ local tf=$DIR/$tfile
+ # keep the scratch variables out of the global namespace of the script
+ local flv
+ local olv
+ local i
+
+ (( OST1_VERSION >= $(version_code 2.15.50) )) ||
+ skip "Need OST version at least 2.15.50"
+
+ echo " ** create $tf"
+ $LFS mirror create -N $tf || error "create $tf failed"
+
+ for i in 1 2; do
+ echo " ** mirror extend $tf ($i/2)"
+ $LFS mirror extend -N $tf || error "mirror extend $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ done
+
+ for i in 1 2; do
+ echo " ** write $tf ($i/2)"
+ dd if=/dev/zero of=$tf bs=1k count=1 || error "write $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ (( flv == olv )) ||
+ error "write update OST layout failed $flv/$olv"
+ done
+
+ echo " ** resync $tf"
+ $LFS mirror resync $tf || error "mirror resync $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+
+ for i in 1 2; do
+ echo " ** truncate $tf ($i/2)"
+ $TRUNCATE $tf $((1024 * 1024)) || error "truncate $tf fails"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ # the OST version is allowed to be one behind after a truncate
+ (( flv == olv || flv == olv + 1 )) ||
+ error "truncate update OST layout failed $flv/$olv"
+ done
+
+ echo " ** resync $tf"
+ $LFS mirror resync $tf || error "mirror resync $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+
+ for i in 1 2; do
+ echo " ** write $tf ($i/2)"
+ dd if=/dev/zero of=$tf bs=1k count=1 || error "write $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ (( flv == olv )) ||
+ error "write update OST layout failed $flv/$olv"
+ done
}
-run_test 36 "write to mirrored files"
+run_test 36d "write/punch FLR file update OST layout version"
create_files_37() {
local tf
test_50a() {
$LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' ||
skip "OST does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset
test_50b() {
$LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' ||
skip "OST does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset
skip "OST does not support SEEK_HOLE"
(( $LINUX_VERSION_CODE > $(version_code 3.0.0) )) ||
skip "client kernel does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset