/* FLR */
#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00
#define OBD_FAIL_FLR_LV_DELAY 0x1A01
-#define OBD_FAIL_FLR_LV_INC 0x1A02
+#define OBD_FAIL_FLR_LV_INC 0x1A02 /* not used */
#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03
/* DT */
#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \
LCME_FL_EXTENSION)
-/* the highest bit in obdo::o_layout_version is used to mark if the file is
- * being resynced. */
-#define LU_LAYOUT_RESYNC LCME_FL_NEG
-
/* lcme_id can be specified as certain flags, and the first
* bit of lcme_id is used to indicate that the ID is representing
* certain LCME_FL_* but not a real ID. Which implies we can have
LCME_ID_NOT_ID = LCME_FL_NEG
};
+/* A layout version is equal to lcme_id, except that some bits have special
+ * meanings. */
+enum layout_version_flags {
+ /* set when the layout version has reached the high water mark, after
+ * which it is increased circularly to reuse the smallest value */
+ LU_LAYOUT_HIGEN = 0x40000000,
+ /* the highest bit marks that the file is being resynced */
+ LU_LAYOUT_RESYNC = 0x80000000,
+};
+
#define LCME_ID_MASK LCME_ID_MAX
struct lov_comp_md_entry_v1 {
bool lease_broken = false;
fmode_t fmode = 0;
enum mds_op_bias bias = 0;
+ int fdv;
struct file *layout_file = NULL;
void *data = NULL;
size_t data_size = 0;
bias = MDS_CLOSE_RESYNC_DONE;
break;
case LL_LEASE_LAYOUT_MERGE: {
- int fd;
if (ioc->lil_count != 1)
GOTO(out_lease_close, rc = -EINVAL);
arg += sizeof(*ioc);
- if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
+ if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
GOTO(out_lease_close, rc = -EFAULT);
- layout_file = fget(fd);
+ layout_file = fget(fdv);
if (!layout_file)
GOTO(out_lease_close, rc = -EBADF);
break;
}
case LL_LEASE_LAYOUT_SPLIT: {
- int fdv;
int mirror_id;
if (ioc->lil_count != 2)
EXIT;
out:
- switch (ioc->lil_flags) {
- case LL_LEASE_RESYNC_DONE:
- if (data)
- OBD_FREE(data, data_size);
- break;
- case LL_LEASE_LAYOUT_MERGE:
- case LL_LEASE_LAYOUT_SPLIT:
- if (layout_file)
- fput(layout_file);
+ if (ioc->lil_flags == LL_LEASE_RESYNC_DONE && data)
+ OBD_FREE(data, data_size);
- ll_layout_refresh(inode, &fd->fd_layout_version);
- break;
- case LL_LEASE_PCC_ATTACH:
+ if (layout_file)
+ fput(layout_file);
+
+ if (ioc->lil_flags == LL_LEASE_PCC_ATTACH) {
if (!rc)
rc = rc2;
rc = pcc_readwrite_attach_fini(file, inode,
param.pa_layout_gen,
lease_broken, rc,
attached);
- break;
}
+ ll_layout_refresh(inode, &fd->fd_layout_version);
+
if (!rc)
rc = ll_lease_type_from_fmode(fmode);
RETURN(rc);
lo->ldo_layout_gen++;
lo->ldo_layout_gen |= preserve;
/* Zero is not a valid generation */
- if (unlikely((lo->ldo_layout_gen & LCME_ID_MASK) == 0))
+ if (unlikely((lo->ldo_layout_gen & LCME_ID_MASK) == 0)) {
lo->ldo_layout_gen++;
+ lo->ldo_layout_gen &= ~LU_LAYOUT_RESYNC;
+ }
}
struct lod_it {
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
struct lod_layout_component *lod_comp;
struct lu_extent extent = { 0 };
int rc;
* This way it can make sure that the layout version is
* monotonously increased in this writing era. */
lod_obj_inc_layout_gen(lo);
- if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
- __u32 layout_version;
-
- get_random_bytes(&layout_version, sizeof(layout_version));
- lo->ldo_layout_gen = layout_version & 0xffff;
- }
rc = lod_declare_instantiate_components(env, lo, th, 0);
if (rc)
GOTO(out, rc);
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
- if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
- layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
- rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
- if (rc)
- GOTO(out, rc);
-
out:
if (rc)
lod_striping_free(env, lo);
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
struct lod_layout_component *lod_comp;
struct lu_extent extent = { 0 };
int primary = -1;
if (rc)
GOTO(out, rc);
- /* 3. transfer layout version to OST objects.
- * transfer new layout version to OST objects so that stale writes
- * can be denied. It also ends an era of writing by setting
- * LU_LAYOUT_RESYNC. Normal client can never use this bit to
- * send write RPC; only resync RPCs could do it. */
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
- if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
- layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
- rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
- if (rc)
- GOTO(out, rc);
-
lod_obj_inc_layout_gen(lo);
out:
if (rc)
struct md_layout_change *mlc, struct thandle *th)
{
struct lu_attr *attr = &lod_env_info(env)->lti_attr;
- struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
- struct lod_object *lo = lod_dt_obj(dt);
int rc;
ENTRY;
}
rc = lod_striped_create(env, dt, attr, NULL, th);
- if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
- layout_attr->la_layout_version |= lo->ldo_layout_gen;
- rc = lod_attr_set(env, dt, layout_attr, th);
- }
RETURN(rc);
}
}
lod_comp->llc_stripe_count = 0;
} else {
+ lod_comp->llc_layout_gen = 0;
lod_comp->llc_stripe = stripe;
lod_comp->llc_ost_indices = ost_indices;
lod_comp->llc_stripes_allocated = stripe_len;
GOTO(out, rc);
rc = mdo_xattr_del(env, vic, XATTR_NAME_LOV, handle);
- if (rc) /* wtf? */
+ if (rc)
GOTO(out_restore, rc);
(void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle,
return 0;
}
+/**
+ * Check whether the layout version carried by an IO request is older than
+ * the layout version stored on disk.
+ *
+ * The LU_LAYOUT_RESYNC bit is masked out of both versions before they are
+ * compared.
+ *
+ * \param[in] req_version layout version from the request
+ * \param[in] ondisk_version layout version to compare against
+ *
+ * \retval true if the request version is numerically smaller and both
+ * versions have the same LU_LAYOUT_HIGEN bit
+ * \retval false otherwise
+ */
+static inline bool ofd_layout_version_less(__u32 req_version,
+ __u32 ondisk_version)
+{
+ __u32 req = req_version & ~LU_LAYOUT_RESYNC;
+ __u32 ondisk = ondisk_version & ~LU_LAYOUT_RESYNC;
+
+ /**
+ * The request layout version may have been circularly increased back
+ * to the smallest value. In that case @req < @ondisk, but @req does
+ * not have the LU_LAYOUT_HIGEN bit set while @ondisk does, so the
+ * request is not treated as older.
+ */
+ return (req < ondisk) &&
+ ((req & LU_LAYOUT_HIGEN) == (ondisk & LU_LAYOUT_HIGEN));
+}
+
#endif /* _OFD_INTERNAL_H */
int ofd_verify_layout_version(const struct lu_env *env,
struct ofd_object *fo, const struct obdo *oa)
{
- __u32 layout_version;
int rc;
ENTRY;
rc = ofd_object_ff_load(env, fo);
if (rc < 0) {
if (rc == -ENODATA)
- rc = -EINPROGRESS;
+ rc = 0;
GOTO(out, rc);
}
- layout_version = fo->ofo_ff.ff_layout_version;
- if (oa->o_layout_version >= layout_version &&
- oa->o_layout_version <= layout_version + fo->ofo_ff.ff_range)
- GOTO(out, rc = 0);
-
- /* normal traffic, decide if to return ESTALE or EINPROGRESS */
- layout_version &= ~LU_LAYOUT_RESYNC;
-
- /* this update is not legitimate */
- if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) <= layout_version)
- GOTO(out, rc = -ESTALE);
-
- /* layout version may not be transmitted yet */
- if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) > layout_version)
- GOTO(out, rc = -EINPROGRESS);
+ /**
+ * this update is not legitimate, whose layout version is older than
+ * that on the disk.
+ */
+ if (ofd_layout_version_less(oa->o_layout_version,
+ fo->ofo_ff.ff_layout_version +
+ fo->ofo_ff.ff_range))
+ RETURN(-ESTALE);
EXIT;
ofd_object_put(env, fo);
GOTO(out, rc);
}
-
- oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
}
if (ptlrpc_connection_is_local(exp->exp_connection))
PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
ff->ff_layout_version, oa->o_layout_version);
- /* only the MDS has the authority to update layout version */
- if (!(exp_connect_flags(ofd_info(env)->fti_exp) &
- OBD_CONNECT_MDS)) {
- CERROR(DFID": update layout version from client\n",
- PFID(&fo->ofo_ff.ff_parent));
-
- RETURN(-EPERM);
- }
-
if (ff->ff_layout_version & LU_LAYOUT_RESYNC) {
/* this opens a new era of writing */
ff->ff_layout_version = 0;
ff->ff_range = 0;
}
- /* it's not allowed to change it to a smaller value */
- if (oa->o_layout_version < ff->ff_layout_version)
+ /*it's not allowed to change it to a smaller value */
+ if (ofd_layout_version_less(oa->o_layout_version,
+ ff->ff_layout_version))
RETURN(-EINVAL);
if (ff->ff_layout_version == 0 ||
rc = ofd_verify_layout_version(env, fo, oa);
if (rc)
GOTO(unlock, rc);
-
- oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
}
if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags & LUSTRE_ENCRYPT_FL) {
else
body->oa.o_valid |= rec->lsr_valid;
- if (body->oa.o_valid & OBD_MD_LAYOUT_VERSION) {
- OBD_FAIL_TIMEOUT(OBD_FAIL_FLR_LV_DELAY, cfs_fail_val);
- if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_FLR_LV_INC)))
- ++body->oa.o_layout_version;
- }
-
osp_sync_send_new_rpc(d, llh, h, req);
RETURN(0);
}
}
run_test 35 "allow to write to mirrored files"
+# Print (without a trailing newline) the value that follows "lcm_layout_gen"
+# in the "$LFS getstripe" output for file $1.
+get_file_layout_version() {
+ local tf=$1
+ local flv=$($LFS getstripe $tf | awk '/lcm_layout_gen/{print $2}')
+
+ echo -n $flv
+}
+
+# Print (without a trailing newline) the layout version from the OST objects
+# of file $1, i.e. the value that follows "ostlayoutversion" in the output of
+# "$MULTIOP $1 oXc".
+get_ost_layout_version() {
+ local tf=$1
+ local olv=$($MULTIOP $tf oXc | awk '/ostlayoutversion/{print $2}')
+
+ # must echo olv, not the caller's flv: command substitution runs in a
+ # subshell that inherits the caller's variables, so echoing flv would
+ # make every flv/olv comparison pass trivially
+ echo -n $olv
+}
+
verify_ost_layout_version() {
local tf=$1
# get file layout version
- local flv=$($LFS getstripe $tf | awk '/lcm_layout_gen/{print $2}')
+ local flv=$(get_file_layout_version $tf)
# layout version from OST objects
- local olv=$($MULTIOP $tf oXc | awk '/ostlayoutversion/{print $2}')
+ local olv=$(get_ost_layout_version $tf)
- [ $flv -eq $olv ] || error "layout version mismatch: $flv vs. $olv"
+ (( flv >= olv )) || error "layout version mismatch: $flv vs. $olv"
}
create_file_36() {
done
}
-test_36() {
+test_36a() {
local tf=$DIR/$tfile
create_file_36 $tf $tf-2 $tf-3
local st=$(date +%s)
$MULTIOP $tf-2 oO_WRONLY:w1024Yc || error "write mirrored file error"
- [ $(date +%s) -ge $((st+delay_sec)) ] ||
- error "write finished before layout version is transmitted"
-
# verify OST layout version
verify_ost_layout_version $tf
do_facet $mds_facet $LCTL set_param fail_loc=0
+}
+run_test 36a "write to mirrored files"
- # test case 3
- mds_idx=mds$(($($LFS getstripe -m $tf-3) + 1))
+test_36b() {
+ local tf=$DIR/$tfile
- #define OBD_FAIL_FLR_LV_INC 0x1A02
- do_facet $mds_facet $LCTL set_param fail_loc=0x1A02
+ (( OSTCOUNT < 2 )) && skip "need >= 2 OSTs" && return
- # write open file should return error
- $MULTIOP $tf-3 oO_WRONLY:O_SYNC:w1024c &&
- error "write a mirrored file succeeded" || true
+ # create 2 mirrors using different OSTs
+ $LFS setstripe -N -c1 -i0 --flags=prefer -N -c1 -i1 $tf ||
+ error "create mirrored file"
- do_facet $mds_facet $LCTL set_param fail_loc=0
+ # write 1M data to one mirror
+ dd if=/dev/zero of=$tf bs=1M count=1 || error "write file error"
+ sync
+
+ # set prefer mirror to another mirror
+ $LFS setstripe --comp-set -I0x10001 --comp-flags=^prefer $tf ||
+ error "clear prefer mirror error"
+ $LFS setstripe --comp-set -I0x20002 --comp-flags=prefer $tf ||
+ error "set prefer mirror error"
+
+ # the second write should not hang
+ dd if=/dev/zero of=$tf bs=1M count=1 || error "write file error"
+}
+run_test 36b "write should not hung when prefered mirror is stale"
+
+# Move the prefer flag from one mirror to the other while a large write to
+# the file is running in the background, then wait for the writer to exit.
+test_36c() {
+ local tf=$DIR/$tfile
+
+ (( OSTCOUNT < 2 )) && skip "need >= 2 OSTs" && return
+
+ # create 2 mirrors using different OSTs
+ $LFS setstripe -N -c1 -i0 --flags=prefer -N -c1 -i1 $tf ||
+ error "create mirrored file"
+
+ # write it in the background
+ dd if=/dev/zero of=$tf bs=1M count=600 &
+ local pid=$!
+
+ sleep 1
+
+ # clear prefer on component 0x10001 and set it on component 0x20002
+ # while the background dd is still running
+ $LFS setstripe --comp-set -I0x10001 --comp-flags=^prefer $tf ||
+ error "clear prefer mirror error"
+ $LFS setstripe --comp-set -I0x20002 --comp-flags=prefer $tf ||
+ error "set prefer mirror error"
+
+ # block until the background writer has finished
+ wait $pid
+}
+run_test 36c "change prefer mirror during write shouldn't hung"
+
+# Create a mirrored file, then extend, write, resync, truncate and write it
+# again, printing the file layout version (flv) and the OST object layout
+# version (olv) after each step. After each write the two must be equal;
+# after each truncate olv may be equal to flv or one behind it.
+test_36d() {
+ local tf=$DIR/$tfile
+ # keep the loop index and version values out of the global scope that
+ # is shared by all tests in this script
+ local flv
+ local olv
+ local i
+
+ echo " ** create $tf"
+ $LFS mirror create -N $tf || error "create $tf failed"
+
+ for i in 1 2; do
+ echo " ** mirror extend $tf"
+ $LFS mirror extend -N $tf || error "mirror extend $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ done
+
+ for i in 1 2; do
+ echo " ** write $tf"
+ dd if=/dev/zero of=$tf bs=1k count=1 || error "write $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ (( flv == olv )) ||
+ error "write update OST layout failed $flv/$olv"
+ done
+
+ echo " ** resync $tf"
+ $LFS mirror resync $tf || error "mirror resync $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+
+ for i in 1 2; do
+ echo " ** truncate $tf"
+ $TRUNCATE $tf $((1024 * 1024)) || error "truncate $tf fails"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ (( flv == olv || flv == olv + 1 )) ||
+ error "truncate update OST layout failed $flv/$olv"
+ done
+
+ for i in 1 2; do
+ echo " ** write $tf"
+ dd if=/dev/zero of=$tf bs=1k count=1 || error "write $tf failed"
+ flv=$(get_file_layout_version $tf)
+ olv=$(get_ost_layout_version $tf)
+ echo " flv=$flv olv=$olv"
+ (( flv == olv )) ||
+ error "write update OST layout failed $flv/$olv"
+ done
}
-run_test 36 "write to mirrored files"
+run_test 36d "write/punch FLR file update OST layout version"
create_files_37() {
local tf
test_50a() {
$LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' ||
skip "OST does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset
test_50b() {
$LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' ||
skip "OST does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset