/* FLR */
#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00
#define OBD_FAIL_FLR_LV_DELAY 0x1A01
-#define OBD_FAIL_FLR_LV_INC 0x1A02
+#define OBD_FAIL_FLR_LV_INC 0x1A02 /* not used */
#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03
/* DT */
#define LCME_TEMPLATE_FLAGS (LCME_FL_PREF_RW | LCME_FL_NOSYNC | \
LCME_FL_EXTENSION)
-/* the highest bit in obdo::o_layout_version is used to mark if the file is
- * being resynced. */
-#define LU_LAYOUT_RESYNC LCME_FL_NEG
-
/* lcme_id can be specified as certain flags, and the the first
* bit of lcme_id is used to indicate that the ID is representing
* certain LCME_FL_* but not a real ID. Which implies we can have
LCME_ID_NOT_ID = LCME_FL_NEG
};
+/* layout version equals lcme_id, except that some bits have special meanings */
+enum layout_version_flags {
+ /* the layout version has reached the high water mark; it will next be
+ * increased by wrapping around to reuse the smallest values */
+ LU_LAYOUT_HIGEN = 0x40000000,
+ /* the highest bit marks that the file is being resynced */
+ LU_LAYOUT_RESYNC = 0x80000000,
+};
+
#define LCME_ID_MASK LCME_ID_MAX
struct lov_comp_md_entry_v1 {
bool lease_broken = false;
fmode_t fmode = 0;
enum mds_op_bias bias = 0;
+ int fdv;
struct file *layout_file = NULL;
void *data = NULL;
size_t data_size = 0;
bias = MDS_CLOSE_RESYNC_DONE;
break;
case LL_LEASE_LAYOUT_MERGE: {
- int fd;
if (ioc->lil_count != 1)
GOTO(out_lease_close, rc = -EINVAL);
arg += sizeof(*ioc);
- if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
+ if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
GOTO(out_lease_close, rc = -EFAULT);
- layout_file = fget(fd);
+ layout_file = fget(fdv);
if (!layout_file)
GOTO(out_lease_close, rc = -EBADF);
break;
}
case LL_LEASE_LAYOUT_SPLIT: {
- int fdv;
int mirror_id;
if (ioc->lil_count != 2)
EXIT;
out:
- switch (ioc->lil_flags) {
- case LL_LEASE_RESYNC_DONE:
- if (data)
- OBD_FREE(data, data_size);
- break;
- case LL_LEASE_LAYOUT_MERGE:
- case LL_LEASE_LAYOUT_SPLIT:
- if (layout_file)
- fput(layout_file);
+ if (ioc->lil_flags == LL_LEASE_RESYNC_DONE && data)
+ OBD_FREE(data, data_size);
- ll_layout_refresh(inode, &fd->fd_layout_version);
- break;
- case LL_LEASE_PCC_ATTACH:
+ if (layout_file)
+ fput(layout_file);
+
+ if (ioc->lil_flags == LL_LEASE_PCC_ATTACH) {
if (!rc)
rc = rc2;
rc = pcc_readwrite_attach_fini(file, inode,
param.pa_layout_gen,
lease_broken, rc,
attached);
- break;
}
+ ll_layout_refresh(inode, &fd->fd_layout_version);
+
if (!rc)
rc = ll_lease_type_from_fmode(fmode);
RETURN(rc);
lo->ldo_layout_gen++;
lo->ldo_layout_gen |= preserve;
/* Zero is not a valid generation */
- if (unlikely((lo->ldo_layout_gen & LCME_ID_MASK) == 0))
+ if (unlikely((lo->ldo_layout_gen & LCME_ID_MASK) == 0)) {
lo->ldo_layout_gen++;
+ lo->ldo_layout_gen &= ~LU_LAYOUT_RESYNC;
+ }
}
struct lod_it {
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
struct lu_buf *buf = &info->lti_buf;
struct lod_object *lo = lod_dt_obj(dt);
struct lov_comp_md_v1 *lcm;
}
/* fixup layout information */
+ lod_obj_inc_layout_gen(lo);
+ lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
lcm->lcm_size = cpu_to_le32(size);
lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
if (rc)
GOTO(out, rc);
- lod_obj_inc_layout_gen(lo);
- lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
-
- /* transfer layout version to OST objects. */
- if (lo->ldo_mirror_count > 1) {
- struct lod_obj_stripe_cb_data data = { {0} };
-
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0;
- data.locd_attr = layout_attr;
- data.locd_declare = true;
- data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
- rc = lod_obj_for_each_stripe(env, lo, th, &data);
- if (rc)
- GOTO(out, rc);
- }
-
rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
struct dt_object *dt, const struct lu_buf *mbuf,
struct thandle *th)
{
- struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
struct lod_object *lo = lod_dt_obj(dt);
struct lov_comp_md_v1 *lcm = mbuf->lb_buf;
int rc;
/* fix on-disk layout gen */
lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
-
- /* transfer layout version to OST objects. */
- if (lo->ldo_mirror_count > 1) {
- struct lod_obj_stripe_cb_data data = { {0} };
-
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0;
- data.locd_attr = layout_attr;
- data.locd_declare = true;
- data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
- rc = lod_obj_for_each_stripe(env, lo, th, &data);
- if (rc)
- RETURN(rc);
- }
-
rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), mbuf,
XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
RETURN(rc);
const char *name, int fl, struct thandle *th)
{
struct dt_object *next = dt_object_child(dt);
- struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
struct lod_object *lo = lod_dt_obj(dt);
- struct lod_obj_stripe_cb_data data = { {0} };
int rc = 0;
ENTRY;
rc = lod_striping_reload(env, lo, buf, LVF_ALL_STALE);
if (rc)
RETURN(rc);
-
- if (lo->ldo_mirror_count > 1 &&
- layout_attr->la_valid & LA_LAYOUT_VERSION) {
- /* mirror split */
- layout_attr->la_layout_version =
- lo->ldo_layout_gen;
- data.locd_attr = layout_attr;
- data.locd_declare = false;
- data.locd_stripe_cb =
- lod_obj_stripe_attr_set_cb;
- rc = lod_obj_for_each_stripe(env, lo, th,
- &data);
- if (rc)
- RETURN(rc);
- }
} else if (fl & LU_XATTR_PURGE) {
rc = lod_layout_purge(env, dt, buf, th);
} else if (dt_object_remote(dt)) {
rc = lod_striped_create(env, dt, NULL, NULL, th);
if (rc)
RETURN(rc);
-
- if (fl & LU_XATTR_MERGE && lo->ldo_mirror_count > 1 &&
- layout_attr->la_valid & LA_LAYOUT_VERSION) {
- /* mirror merge exec phase */
- layout_attr->la_layout_version =
- lo->ldo_layout_gen;
- data.locd_attr = layout_attr;
- data.locd_declare = false;
- data.locd_stripe_cb =
- lod_obj_stripe_attr_set_cb;
- rc = lod_obj_for_each_stripe(env, lo, th,
- &data);
- if (rc)
- RETURN(rc);
- }
}
RETURN(rc);
} else if (strcmp(name, XATTR_NAME_FID) == 0) {
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
struct lod_layout_component *lod_comp;
struct lu_extent extent = { 0 };
int rc;
* This way it can make sure that the layout version is
* monotonously increased in this writing era. */
lod_obj_inc_layout_gen(lo);
- if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
- __u32 layout_version;
-
- get_random_bytes(&layout_version, sizeof(layout_version));
- lo->ldo_layout_gen = layout_version & 0xffff;
- }
rc = lod_declare_instantiate_components(env, lo, th, 0);
if (rc)
GOTO(out, rc);
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
- if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
- layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
- rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
- if (rc)
- GOTO(out, rc);
-
out:
if (rc)
lod_striping_free(env, lo);
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
struct lod_layout_component *lod_comp;
struct lu_extent extent = { 0 };
int primary = -1;
if (rc)
GOTO(out, rc);
- /* 3. transfer layout version to OST objects.
- * transfer new layout version to OST objects so that stale writes
- * can be denied. It also ends an era of writing by setting
- * LU_LAYOUT_RESYNC. Normal client can never use this bit to
- * send write RPC; only resync RPCs could do it. */
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
- if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
- layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
- rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
- if (rc)
- GOTO(out, rc);
-
lod_obj_inc_layout_gen(lo);
out:
if (rc)
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
unsigned sync_components = 0;
unsigned resync_components = 0;
int i;
lo->ldo_flr_state = LCM_FL_RDONLY;
lod_obj_inc_layout_gen(lo);
- layout_attr->la_valid = LA_LAYOUT_VERSION;
- layout_attr->la_layout_version = 0; /* set current version */
- rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
- if (rc)
- GOTO(out, rc);
-
info->lti_buf.lb_len = lod_comp_md_size(lo, false);
rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
&info->lti_buf, XATTR_NAME_LOV, 0, th);
struct md_layout_change *mlc, struct thandle *th)
{
struct lu_attr *attr = &lod_env_info(env)->lti_attr;
- struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
- struct lod_object *lo = lod_dt_obj(dt);
int rc;
ENTRY;
}
rc = lod_striped_create(env, dt, attr, NULL, th);
- if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
- layout_attr->la_layout_version |= lo->ldo_layout_gen;
- rc = lod_attr_set(env, dt, layout_attr, th);
- }
RETURN(rc);
}
}
lod_comp->llc_stripe_count = 0;
} else {
+ lod_comp->llc_layout_gen = 0;
lod_comp->llc_stripe = stripe;
lod_comp->llc_ost_indices = ost_indices;
lod_comp->llc_stripes_allocated = stripe_len;
GOTO(out, rc);
rc = mdo_xattr_del(env, vic, XATTR_NAME_LOV, handle);
- if (rc) /* wtf? */
+ if (rc)
GOTO(out_restore, rc);
(void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle,
return 0;
}
+/* Whether the requesting IO carries an older layout version than the one on
+ * disk.
+ *
+ * LU_LAYOUT_RESYNC is masked off both versions before they are compared.
+ * Returns true only if @req_version is numerically smaller than
+ * @ondisk_version and both agree on the LU_LAYOUT_HIGEN bit; false
+ * otherwise. */
+static inline bool ofd_layout_version_less(__u32 req_version,
+ __u32 ondisk_version)
+{
+ __u32 req = req_version & ~LU_LAYOUT_RESYNC;
+ __u32 ondisk = ondisk_version & ~LU_LAYOUT_RESYNC;
+
+ /**
+ * The request layout version could have been circularly increased to
+ * the smallest value; in that case @req < @ondisk, but @req does not
+ * have the LU_LAYOUT_HIGEN bit set while @ondisk does, so @req is
+ * not treated as older.
+ */
+ return (req < ondisk) &&
+ ((req & LU_LAYOUT_HIGEN) == (ondisk & LU_LAYOUT_HIGEN));
+}
+
#endif /* _OFD_INTERNAL_H */
int ofd_verify_layout_version(const struct lu_env *env,
struct ofd_object *fo, const struct obdo *oa)
{
- __u32 layout_version;
int rc;
ENTRY;
rc = ofd_object_ff_load(env, fo);
if (rc < 0) {
if (rc == -ENODATA)
- rc = -EINPROGRESS;
+ rc = 0;
GOTO(out, rc);
}
- layout_version = fo->ofo_ff.ff_layout_version;
- if (oa->o_layout_version >= layout_version &&
- oa->o_layout_version <= layout_version + fo->ofo_ff.ff_range)
- GOTO(out, rc = 0);
-
- /* normal traffic, decide if to return ESTALE or EINPROGRESS */
- layout_version &= ~LU_LAYOUT_RESYNC;
-
- /* this update is not legitimate */
- if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) <= layout_version)
- GOTO(out, rc = -ESTALE);
-
- /* layout version may not be transmitted yet */
- if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) > layout_version)
- GOTO(out, rc = -EINPROGRESS);
+ /**
+ * this update is not legitimate: its layout version is older than
+ * the one on the disk.
+ */
+ if (ofd_layout_version_less(oa->o_layout_version,
+ fo->ofo_ff.ff_layout_version +
+ fo->ofo_ff.ff_range))
+ RETURN(-ESTALE);
EXIT;
ff->ff_range = 0;
}
- /* it's not allowed to change it to a smaller value */
- if (oa->o_layout_version < ff->ff_layout_version)
+ /* it's not allowed to change it to a smaller value */
+ if (ofd_layout_version_less(oa->o_layout_version,
+ ff->ff_layout_version))
RETURN(-EINVAL);
if (ff->ff_layout_version == 0 ||
else
body->oa.o_valid |= rec->lsr_valid;
- if (body->oa.o_valid & OBD_MD_LAYOUT_VERSION) {
- OBD_FAIL_TIMEOUT(OBD_FAIL_FLR_LV_DELAY, cfs_fail_val);
- if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_FLR_LV_INC)))
- body->oa.o_layout_version = LU_LAYOUT_RESYNC |
- (body->oa.o_layout_version + 1);
- }
-
osp_sync_send_new_rpc(d, llh, h, req);
RETURN(0);
}
# layout version from OST objects
local olv=$($MULTIOP $tf oXc | awk '/ostlayoutversion/{print $2}')
- [ $flv -eq $olv ] || error "layout version mismatch: $flv vs. $olv"
+ (( flv >= olv )) || error "layout version mismatch: $flv vs. $olv"
}
create_file_36() {
done
}
-test_36() {
+test_36a() {
local tf=$DIR/$tfile
stack_trap "rm -f $tf $tf-2 $tf-3"
local st=$(date +%s)
$MULTIOP $tf-2 oO_WRONLY:w1024Yc || error "write mirrored file error"
- [ $(date +%s) -ge $((st+delay_sec)) ] ||
- error "write finished before layout version is transmitted"
-
# verify OST layout version
verify_ost_layout_version $tf
do_facet $mds_facet $LCTL set_param fail_loc=0
+}
+run_test 36a "write to mirrored files"
- # test case 3
- mds_idx=mds$(($($LFS getstripe -m $tf-3) + 1))
+test_36b() {
+ local tf=$DIR/$tfile
- #define OBD_FAIL_FLR_LV_INC 0x1A02
- do_facet $mds_facet $LCTL set_param fail_loc=0x1A02
+ (( OSTCOUNT < 2 )) && skip "need >= 2 OSTs" && return
- # write open file should return error
- $MULTIOP $tf-3 oO_WRONLY:O_SYNC:w1024c &&
- error "write a mirrored file succeeded" || true
+ # create 2 mirrors using different OSTs
+ $LFS setstripe -N -c1 -i0 --flags=prefer -N -c1 -i1 $tf ||
+ error "create mirrored file"
- do_facet $mds_facet $LCTL set_param fail_loc=0
+ # write 1M data to one mirror
+ dd if=/dev/zero of=$tf bs=1M count=1 || error "write file error"
+ sync
+
+ # set prefer mirror to another mirror
+ $LFS setstripe --comp-set -I0x10001 --comp-flags=^prefer $tf ||
+ error "clear prefer mirror error"
+ $LFS setstripe --comp-set -I0x20002 --comp-flags=prefer $tf ||
+ error "set prefer mirror error"
+
+ # the second write should not hang
+ dd if=/dev/zero of=$tf bs=1M count=1 || error "write file error"
+}
+run_test 36b "write should not hung when prefered mirror is stale"
+
+test_36c() {
+ local tf=$DIR/$tfile
+
+ (( OSTCOUNT < 2 )) && skip "need >= 2 OSTs" && return
+
+ # create 2 single-stripe mirrors on different OSTs; the first one is flagged prefer
+ $LFS setstripe -N -c1 -i0 --flags=prefer -N -c1 -i1 $tf ||
+ error "create mirrored file"
+
+ # start a 600 x 1M write in the background; the prefer flag is switched below while it is presumably still running
+ dd if=/dev/zero of=$tf bs=1M count=600 &
+ local pid=$!
+
+ sleep 1
+
+ $LFS setstripe --comp-set -I0x10001 --comp-flags=^prefer $tf ||
+ error "clear prefer mirror error"
+ $LFS setstripe --comp-set -I0x20002 --comp-flags=prefer $tf ||
+ error "set prefer mirror error"
+
+ wait $pid
}
-run_test 36 "write to mirrored files"
+run_test 36c "change prefer mirror during write shouldn't hung"
create_files_37() {
local tf
test_50a() {
$LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' ||
skip "OST does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset
test_50b() {
$LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' ||
skip "OST does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset
skip "OST does not support SEEK_HOLE"
(( $LINUX_VERSION_CODE > $(version_code 3.0.0) )) ||
skip "client kernel does not support SEEK_HOLE"
+ [ "$FSTYPE" != "zfs" ] ||
+ skip "lseek for ZFS is not accurate if obj is not committed"
local file=$DIR/$tdir/$tfile
local offset