const struct lmv_stripe_md *lsm = &lsmo->lso_lsm;
int i;
- CDEBUG_LIMIT(mask,
+ CDEBUG(mask,
"dump LMV: refs %u magic=%#x count=%u index=%u hash=%s:%#x max_inherit=%hhu max_inherit_rr=%hhu version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",
lsm->lsm_md_magic, atomic_read(&lsmo->lso_refs),
lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index,
return;
for (i = 0; i < lsm->lsm_md_stripe_count; i++)
- CDEBUG(mask, "stripe[%d] "DFID"\n",
- i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid));
+ CDEBUG_LIMIT(mask, "stripe[%d] "DFID"\n",
+ i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid));
}
static inline bool
sp_permitted:1, /* do not check permission */
sp_migrate_close:1, /* close the file during migrate */
sp_migrate_nsonly:1, /* migrate dirent only */
- sp_dmv_imp_inherit:1; /* implicit default LMV inherit */
+ sp_dmv_imp_inherit:1, /* implicit default LMV inherit */
+ sp_replay:1; /* replay, op may be partially executed */
/** to create directory */
const struct dt_index_features *sp_feat;
MDS_OPEN_PCC | MDS_OP_WITH_FID | \
MDS_OPEN_DEFAULT_LMV)
+/* mkdir fetches LMV, reuse bit of MDS_OPEN_RESYNC */
+#define MDS_MKDIR_LMV MDS_OPEN_RESYNC
/********* Changelogs **********/
/** Changelog record types */
}
/* The slot has been occupied. */
- if (!fid_is_zero(&lmv1->lmv_stripe_fids[index])) {
+ if (!fid_is_zero(&lmv1->lmv_stripe_fids[index]) &&
+ !CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME)) {
struct lu_fid fid0;
fid_le_to_cpu(&fid0,
return 0;
}
+/**
+ * Prepare the stripes of a striped directory whose creation is replayed
+ * by a client request that carries the directory LMV.
+ *
+ * Under ldo_layout_mutex, parse \a lmv_buf into \a dt's striping, mark the
+ * striping as loaded and the directory as striped, then declare creation
+ * of the stripes.
+ *
+ * \param[in] env	execution environment
+ * \param[in] dt	directory object being created
+ * \param[in] attr	attributes passed on to stripe creation
+ * \param[in] lmv_buf	buffer holding the LMV sent with the replayed create
+ * \param[in] dof	object format passed on to stripe creation
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success
+ * \retval		error returned by lod_parse_dir_striping() or
+ *			lod_dir_declare_create_stripes() otherwise
+ */
+static int lod_prep_md_replayed_create(const struct lu_env *env,
+				       struct dt_object *dt,
+				       struct lu_attr *attr,
+				       const struct lu_buf *lmv_buf,
+				       struct dt_object_format *dof,
+				       struct thandle *th)
+{
+	struct lod_object *lo = lod_dt_obj(dt);
+	int rc;
+
+	ENTRY;
+
+	mutex_lock(&lo->ldo_layout_mutex);
+
+	rc = lod_parse_dir_striping(env, lo, lmv_buf);
+	if (rc != 0)
+		goto unlock;
+
+	/* striping now comes from the replayed LMV, not from the disk */
+	lo->ldo_dir_stripe_loaded = 1;
+	lo->ldo_dir_striped = 1;
+	rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
+
+unlock:
+	mutex_unlock(&lo->ldo_layout_mutex);
+
+	RETURN(rc);
+}
+
/**
*
* Free cached foreign LMV
if (lo->ldo_is_foreign) {
rc = lod_alloc_foreign_lmv(lo, lum_buf->lb_len);
if (rc != 0)
- GOTO(out, rc);
+ RETURN(rc);
memcpy(lo->ldo_foreign_lmv, lum, lum_buf->lb_len);
lo->ldo_dir_stripe_loaded = 1;
}
- GOTO(out, rc = 0);
+ RETURN(0);
}
- /* prepare dir striped objects */
- rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
- if (rc != 0) {
+ /* client replay striped directory creation with LMV, this happens when
+ * all involved MDTs were rebooted, or MDT recovery was aborted.
+ */
+ if (le32_to_cpu(lum->lum_magic) == LMV_MAGIC_V1)
+ rc = lod_prep_md_replayed_create(env, dt, attr, lum_buf, dof,
+ th);
+ else
+ rc = lod_prep_md_striped_create(env, dt, attr, lum, dof, th);
+ if (rc != 0)
/* failed to create striping, let's reset
* config so that others don't get confused */
lod_striping_free(env, lo);
- GOTO(out, rc);
- }
-out:
+
RETURN(rc);
}
*
* \param[in] env execution environment
* \param[in] dt the striped object
- * \param[in] buf not used currently
+ * \param[in] buf lmv_user_md for create, or lmv_mds_md for replay
* \param[in] name not used currently
* \param[in] fl xattr flag (see OSD API description)
* \param[in] th transaction handle
const struct lu_buf *buf, const char *name,
int fl, struct thandle *th)
{
- struct lod_object *lo = lod_dt_obj(dt);
- struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *attr = &info->lti_attr;
+ struct lod_object *lo = lod_dt_obj(dt);
+ struct lod_thread_info *info = lod_env_info(env);
+ struct lu_attr *attr = &info->lti_attr;
struct dt_object_format *dof = &info->lti_format;
- struct lu_buf lmv_buf;
- struct lu_buf slave_lmv_buf;
- struct lmv_mds_md_v1 *lmm;
- struct lmv_mds_md_v1 *slave_lmm = NULL;
- struct dt_insert_rec *rec = &info->lti_dt_rec;
- int i;
- int rc;
- ENTRY;
+ struct lu_buf lmv_buf;
+ struct lu_buf slave_lmv_buf;
+ struct lmv_user_md *lum = buf->lb_buf;
+ struct lmv_mds_md_v1 *lmm;
+ struct lmv_mds_md_v1 *slave_lmm = NULL;
+ struct dt_insert_rec *rec = &info->lti_dt_rec;
+ int i;
+ int rc;
+ ENTRY;
+ /* lum is used to know whether it's replay */
+ LASSERT(lum);
if (!S_ISDIR(dt->do_lu.lo_header->loh_attr))
RETURN(-ENOTDIR);
struct lu_name *sname;
struct linkea_data ldata = { NULL };
struct lu_buf linkea_buf;
+ bool stripe_created = false;
/* OBD_FAIL_MDS_STRIPE_FID may leave stripe uninitialized */
if (!dto)
if (i && CFS_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_CREATE))
continue;
+ /* if it's replay by client request, and stripe exists on remote
+ * MDT, it means mkdir was partially executed: stripe was
+ * created on remote MDT successfully, but target not in last
+ * run.
+ */
+ if (unlikely((le32_to_cpu(lum->lum_magic) == LMV_MAGIC_V1) &&
+ dt_object_exists(dto) && dt_object_remote(dto)))
+ stripe_created = true;
+
/* don't create stripe if:
* 1. it's source stripe of migrating directory
* 2. it's existed stripe of splitting directory
(lod_is_splitting(lo) && i < lo->ldo_dir_split_offset)) {
if (!dt_object_exists(dto))
GOTO(out, rc = -EINVAL);
- } else {
+ } else if (!stripe_created) {
dt_write_lock(env, dto, DT_TGT_CHILD);
rc = lod_sub_create(env, dto, attr, NULL, dof, th);
if (rc != 0) {
lo->ldo_dir_split_offset > i)
continue;
- rec->rec_fid = lu_object_fid(&dt->do_lu);
- rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
- (const struct dt_key *)dotdot, th);
- if (rc != 0)
- GOTO(out, rc);
-
if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_SLAVE_NAME) &&
cfs_fail_val == i)
snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
snprintf(stripe_name, sizeof(info->lti_key), DFID":%d",
PFID(lu_object_fid(&dto->do_lu)), i);
- sname = lod_name_get(env, stripe_name, strlen(stripe_name));
- rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
- sname, lu_object_fid(&dt->do_lu));
- if (rc != 0)
- GOTO(out, rc);
+ if (!stripe_created) {
+ rec->rec_fid = lu_object_fid(&dt->do_lu);
+ rc = lod_sub_insert(env, dto, (struct dt_rec *)rec,
+ (const struct dt_key *)dotdot, th);
+ if (rc != 0)
+ GOTO(out, rc);
- linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
- linkea_buf.lb_len = ldata.ld_leh->leh_len;
- rc = lod_sub_xattr_set(env, dto, &linkea_buf,
- XATTR_NAME_LINK, 0, th);
- if (rc != 0)
- GOTO(out, rc);
+ sname = lod_name_get(env, stripe_name,
+ strlen(stripe_name));
+ rc = linkea_links_new(&ldata, &info->lti_linkea_buf,
+ sname, lu_object_fid(&dt->do_lu));
+ if (rc != 0)
+ GOTO(out, rc);
+
+ linkea_buf.lb_buf = ldata.ld_buf->lb_buf;
+ linkea_buf.lb_len = ldata.ld_leh->leh_len;
+ rc = lod_sub_xattr_set(env, dto, &linkea_buf,
+ XATTR_NAME_LINK, 0, th);
+ if (rc != 0)
+ GOTO(out, rc);
+ }
rec->rec_fid = lu_object_fid(&dto->do_lu);
rc = lod_sub_insert(env, dt_object_child(dt),
LASSERT(ergo(lds != NULL,
lds->lds_def_striping_set ||
lds->lds_dir_def_striping_set));
+ LASSERT(lmu);
if (!LMVEA_DELETE_VALUES(lo->ldo_dir_stripe_count,
lo->ldo_dir_stripe_offset)) {
- if (!lmu) {
+ if (!lmu->lb_buf) {
+ /* mkdir by default LMV */
struct lmv_user_md_v1 *v1 = info->lti_ea_store;
int stripe_count = lo->ldo_dir_stripe_count;
th);
if (rc != 0)
RETURN(rc);
- } else {
+ } else if (lmu->lb_buf) {
/* foreign LMV EA case */
- if (lmu) {
+ if (declare) {
struct lmv_foreign_md *lfm = lmu->lb_buf;
- if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) {
+ if (lfm->lfm_magic == LMV_MAGIC_FOREIGN)
rc = lod_declare_xattr_set_lmv(env, dt, attr,
lmu, dof, th);
- }
- } else {
- if (lo->ldo_is_foreign) {
- LASSERT(lo->ldo_foreign_lmv != NULL &&
- lo->ldo_foreign_lmv_size > 0);
- info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
- info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
- lmu = &info->lti_buf;
- rc = lod_xattr_set_lmv(env, dt, lmu,
- XATTR_NAME_LMV, 0, th);
- }
+ } else if (lo->ldo_is_foreign) {
+ LASSERT(lo->ldo_foreign_lmv != NULL &&
+ lo->ldo_foreign_lmv_size > 0);
+ info->lti_buf.lb_buf = lo->ldo_foreign_lmv;
+ info->lti_buf.lb_len = lo->ldo_foreign_lmv_size;
+ lmu = &info->lti_buf;
+ rc = lod_xattr_set_lmv(env, dt, lmu, XATTR_NAME_LMV, 0,
+ th);
}
}
static int lod_dir_striping_create(const struct lu_env *env,
struct dt_object *dt,
struct lu_attr *attr,
+ const struct lu_buf *lmu,
struct dt_object_format *dof,
struct thandle *th)
{
- return lod_dir_striping_create_internal(env, dt, attr, NULL, dof, th,
+ return lod_dir_striping_create_internal(env, dt, attr, lmu, dof, th,
false);
}
!strcmp(name, XATTR_NAME_LMV)) {
switch (fl) {
case LU_XATTR_CREATE:
- rc = lod_dir_striping_create(env, dt, NULL, NULL, th);
+ rc = lod_dir_striping_create(env, dt, NULL, buf, NULL,
+ th);
break;
case 0:
case LU_XATTR_REPLACE:
const struct lmv_user_md_v1 *lum1 = ah->dah_eadata;
/* other default values are 0 */
- lc->ldo_dir_stripe_offset = -1;
+ lc->ldo_dir_stripe_offset = LMV_OFFSET_DEFAULT;
/* no default striping configuration is needed for
* foreign dirs
lod_get_default_striping(env, lp, ah, lds);
/* It should always honour the specified stripes */
- /* Note: old client (< 2.7)might also do lfs mkdir, whose EA
- * will have old magic. In this case, we should ignore the
- * stripe count and try to create dir by default stripe.
- */
if (ah->dah_eadata && ah->dah_eadata_len &&
!ah->dah_eadata_is_dmv &&
(le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC ||
- le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC)) {
+ le32_to_cpu(lum1->lum_magic) == LMV_USER_MAGIC_SPECIFIC ||
+ le32_to_cpu(lum1->lum_magic) == LMV_MAGIC_V1)) {
lc->ldo_dir_stripe_count =
le32_to_cpu(lum1->lum_stripe_count);
lc->ldo_dir_stripe_offset =
} else if (dof->dof_type == DFT_DIR) {
struct seq_server_site *ss;
struct lu_buf buf = { NULL };
- struct lu_buf *lmu = NULL;
ss = lu_site2seq(dt->do_lu.lo_dev->ld_site);
GOTO(out, rc = -EINVAL);
}
} else if (hint && hint->dah_eadata) {
- lmu = &buf;
- lmu->lb_buf = (void *)hint->dah_eadata;
- lmu->lb_len = hint->dah_eadata_len;
+ buf.lb_buf = (void *)hint->dah_eadata;
+ buf.lb_len = hint->dah_eadata_len;
}
- rc = lod_declare_dir_striping_create(env, dt, attr, lmu, dof,
+ rc = lod_declare_dir_striping_create(env, dt, attr, &buf, dof,
th);
}
out:
lo->ldo_dir_migrate_offset = lo->ldo_dir_stripe_count;
lo->ldo_dir_migrate_hash = le32_to_cpu(lmv->lmv_hash_type);
lo->ldo_dir_stripe_count += stripe_count;
+ lo->ldo_dir_layout_version++;
lo->ldo_dir_stripes_allocated += stripe_count;
/* plain directory split creates target as a plain directory, while
int mdc_set_open_replay_data(struct obd_export *exp,
struct obd_client_handle *och,
struct lookup_intent *it);
+int mdc_save_lmm(struct ptlrpc_request *req, void *data, u32 size);
void mdc_commit_open(struct ptlrpc_request *req);
void mdc_replay_open(struct ptlrpc_request *req);
rec->cr_suppgid1 = op_data->op_suppgids[0];
rec->cr_suppgid2 = op_data->op_suppgids[1];
flags = 0;
+
+ if (S_ISDIR(mode))
+ flags |= MDS_MKDIR_LMV;
if (op_data->op_bias & MDS_CREATE_VOLATILE)
flags |= MDS_OPEN_VOLATILE;
if (op_data->op_bias & MDS_SETSTRIPE_CREATE)
}
/**
- * Save a large LOV EA into the request buffer so that it is available
+ * Save a large LOV/LMV EA into the request buffer so that it is available
* for replay. We don't do this in the initial request because the
* original request doesn't need this buffer (at most it sends just the
* lov_mds_md) and it is a waste of RAM/bandwidth to send the empty
* but this is incredibly unlikely, and questionable whether the client
* could do MDS recovery under OOM anyways...
*/
-static int mdc_save_lovea(struct ptlrpc_request *req, void *data, u32 size)
+int mdc_save_lmm(struct ptlrpc_request *req, void *data, u32 size)
{
struct req_capsule *pill = &req->rq_pill;
- void *lovea;
+ void *lmm;
int rc = 0;
if (req_capsule_get_size(pill, &RMF_EADATA, RCL_CLIENT) < size) {
}
req_capsule_set_size(pill, &RMF_EADATA, RCL_CLIENT, size);
- lovea = req_capsule_client_get(pill, &RMF_EADATA);
- if (lovea) {
- memcpy(lovea, data, size);
- lov_fix_ea_for_replay(lovea);
+ lmm = req_capsule_client_get(pill, &RMF_EADATA);
+ if (lmm) {
+ memcpy(lmm, data, size);
+ lov_fix_ea_for_replay(lmm);
}
return rc;
* (for example error one).
*/
if ((it->it_op & IT_OPEN) && req->rq_replay) {
- rc = mdc_save_lovea(req, eadata,
- body->mbo_eadatasize);
+ rc = mdc_save_lmm(req, eadata,
+ body->mbo_eadatasize);
if (rc) {
body->mbo_valid &= ~OBD_MD_FLEASIZE;
body->mbo_eadatasize = 0;
* another set of OST objects).
*/
if (req->rq_transno)
- (void)mdc_save_lovea(req, lvb_data, lvb_len);
+ mdc_save_lmm(req, lvb_data, lvb_len);
}
}
kernel_cap_t cap_effective, __u64 rdev,
struct ptlrpc_request **request)
{
- struct ptlrpc_request *req;
- int level, rc;
- int count, resends = 0;
- struct obd_import *import = exp->exp_obd->u.cli.cl_import;
- int generation = import->imp_generation;
+ struct ptlrpc_request *req;
+ int level, rc;
+ int count, resends = 0;
+ struct obd_import *import = exp->exp_obd->u.cli.cl_import;
+ int generation = import->imp_generation;
LIST_HEAD(cancels);
- ENTRY;
+
+ ENTRY;
/* For case if upper layer did not alloc fid, do it now. */
if (!fid_is_sane(&op_data->op_fid2)) {
}
rebuild:
- count = 0;
- if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
- (fid_is_sane(&op_data->op_fid1)))
- count = mdc_resource_get_unused(exp, &op_data->op_fid1,
- &cancels, LCK_EX,
- MDS_INODELOCK_UPDATE);
+ count = 0;
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+ (fid_is_sane(&op_data->op_fid1)))
+ count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
- req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
&RQF_MDS_REINT_CREATE_ACL);
- if (req == NULL) {
- ldlm_lock_list_put(&cancels, l_bl_ast, count);
- RETURN(-ENOMEM);
- }
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ RETURN(-ENOMEM);
+ }
- req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
- op_data->op_namelen + 1);
- req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
- data && datalen ? datalen : 0);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+ data && datalen ? datalen : 0);
req_capsule_set_size(&req->rq_pill, &RMF_FILE_SECCTX_NAME,
RCL_CLIENT, op_data->op_file_secctx_name != NULL ?
RETURN(rc);
}
- /*
- * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
- * tgt, for symlinks or lov MD data.
- */
+ /*
+ * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
+ * tgt, for symlinks or lov MD data.
+ */
mdc_create_pack(&req->rq_pill, op_data, data, datalen, mode, uid,
gid, cap_effective, rdev);
- ptlrpc_request_set_replen(req);
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ exp->exp_obd->u.cli.cl_default_mds_easize);
+ ptlrpc_request_set_replen(req);
/* ask ptlrpc not to resend on EINPROGRESS since we have our own retry
* logic here */
req->rq_no_retry_einprogress = 1;
- if (resends) {
- req->rq_generation_set = 1;
- req->rq_import_generation = generation;
+ if (resends) {
+ req->rq_generation_set = 1;
+ req->rq_import_generation = generation;
req->rq_sent = ktime_get_real_seconds() + resends;
- }
- level = LUSTRE_IMP_FULL;
+ }
+ level = LUSTRE_IMP_FULL;
resend:
rc = mdc_reint(req, level);
- /* Resend if we were told to. */
- if (rc == -ERESTARTSYS) {
- level = LUSTRE_IMP_RECOVER;
- goto resend;
- } else if (rc == -EINPROGRESS) {
+ /* Resend if we were told to. */
+ if (rc == -ERESTARTSYS) {
+ level = LUSTRE_IMP_RECOVER;
+ goto resend;
+ } else if (rc == -EINPROGRESS) {
/* Retry create infinitely until succeed or get other
* error code or interrupted. */
ptlrpc_req_finished(req);
PFID(&op_data->op_fid1),
PFID(&op_data->op_fid2));
goto rebuild;
- } else {
- CDEBUG(D_HA, "resend cross eviction\n");
- RETURN(-EIO);
- }
- }
+ } else {
+ CDEBUG(D_HA, "resend cross eviction\n");
+ RETURN(-EIO);
+ }
+ } else if (rc == 0 && S_ISDIR(mode)) {
+		struct mdt_body *body;
+
+		/* mkdir succeeded: if the reply carries the directory LMV,
+		 * stash it in the request so that a later replay sends it.
+		 *
+		 * NOTE: no early RETURN() in this branch. On every path the
+		 * request must reach the common exit below, where it is
+		 * handed back through *request; returning here would leave
+		 * the request neither returned nor finished.
+		 */
+		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+		if (body == NULL) {
+			rc = -EPROTO;
+			CERROR("%s: cannot swab mdt_body: rc = %d\n",
+			       exp->exp_obd->obd_name, rc);
+		} else if ((body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_MEA)) ==
+			   (OBD_MD_FLDIREA | OBD_MD_MEA)) {
+			void *eadata;
+
+			/* clear valid, because mkdir doesn't need to initialize
+			 * LMV, which will be delayed to lookup.
+			 */
+			body->mbo_valid &= ~(OBD_MD_FLDIREA | OBD_MD_MEA);
+			mdc_update_max_ea_from_body(exp, body);
+			/* The eadata is opaque; just check that it is there.
+			 * Eventually, obd_unpackmd() will check the contents.
+			 */
+			eadata = req_capsule_server_sized_get(&req->rq_pill,
+							&RMF_MDT_MD,
+							body->mbo_eadatasize);
+			if (eadata == NULL)
+				rc = -EPROTO;
+			else
+				/* save the reply LMV EA in case we have to
+				 * replay a create for recovery.
+				 */
+				rc = mdc_save_lmm(req, eadata,
+						  body->mbo_eadatasize);
+		}
+ }
- *request = req;
- RETURN(rc);
+ *request = req;
+ RETURN(rc);
}
int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
const struct lmv_user_md *lum = spec->u.sp_ea.eadata;
if (!lmv_user_magic_supported(le32_to_cpu(lum->lum_magic)) &&
- le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC_V0) {
+ !(spec->sp_replay &&
+ lum->lum_magic == cpu_to_le32(LMV_MAGIC_V1))) {
rc = -EINVAL;
CERROR("%s: invalid lmv_user_md: magic=%x hash=%x stripe_offset=%d stripe_count=%u: rc = %d\n",
mdd2obd_dev(m)->obd_name,
le32_to_cpu(lum->lum_hash_type),
(int)le32_to_cpu(lum->lum_stripe_offset),
le32_to_cpu(lum->lum_stripe_count), rc);
-
RETURN(rc);
}
}
jobid_len = strnlen(jobid, LUSTRE_JOBID_SIZE);
buf = mdd_buf_get_const(env, jobid, jobid_len);
- rc = mdo_xattr_set(env, son, buf, spec->sp_cr_job_xattr,
- LU_XATTR_CREATE, handle);
-
+ rc = mdo_xattr_set(env, son, buf, spec->sp_cr_job_xattr, 0,
+ handle);
/* this xattr is nonessential, so ignore errors. */
if (rc != 0) {
CDEBUG(D_INODE,
const char *name = lname->ln_name;
struct dt_allocation_hint *hint = &mdd_env_info(env)->mdi_hint;
int acl_size = LUSTRE_POSIX_ACL_MAX_SIZE_OLD;
+ bool name_inserted = false;
int rc, rc2;
ENTRY;
/* Sanity checks before big job. */
rc = mdd_create_sanity_check(env, pobj, pattr, lname, attr, spec);
- if (rc)
+ if (unlikely(rc == -EEXIST && S_ISDIR(attr->la_mode) &&
+ spec->sp_replay && mdd_object_remote(mdd_pobj)))
+ /* if it's replay by client request, and name is found in
+ * parent directory on remote MDT, it means mkdir was partially
+ * executed: name was successfully added, but target not.
+ */
+ name_inserted = true;
+ else if (rc)
RETURN(rc);
if (CFS_FAIL_CHECK(OBD_FAIL_MDS_DQACQ_NET))
/* migrate may create 1-stripe directory, adjust stripe count
* before lod_ah_init().
*/
- if (lmu && lmu->lum_stripe_count == cpu_to_le32(1))
+ if (lmu && lmu->lum_magic == cpu_to_le32(LMV_USER_MAGIC) &&
+ lmu->lum_stripe_count == cpu_to_le32(1))
lmu->lum_stripe_count = 0;
}
rc = mdd_orphan_insert(env, son, handle);
GOTO(out_volatile, rc);
} else {
- rc = __mdd_index_insert(env, mdd_pobj, mdd_object_fid(son),
- attr->la_mode, name, handle);
- if (rc != 0)
- GOTO(err_created, rc);
+ if (likely(!name_inserted)) {
+ rc = __mdd_index_insert(env, mdd_pobj,
+ mdd_object_fid(son),
+ attr->la_mode, name, handle);
+ if (rc != 0)
+ GOTO(err_created, rc);
+ }
mdd_links_add(env, son, mdd_object_fid(mdd_pobj), lname,
handle, ldata, 1);
rc = mdd_dir_split_plain(env, mdd, pobj, obj, tobj, &xattrs,
mlc, hint, handle);
} else {
+ struct lu_buf *buf = &info->mdi_buf[0];
+
+ buf->lb_buf = mlc->mlc_spec->u.sp_ea.eadata;
+ buf->lb_len = mlc->mlc_spec->u.sp_ea.eadatalen;
+
mdd_write_lock(env, obj, DT_TGT_CHILD);
- rc = mdo_xattr_set(env, obj, NULL, XATTR_NAME_LMV,
+ rc = mdo_xattr_set(env, obj, buf, XATTR_NAME_LMV,
LU_XATTR_CREATE, handle);
mdd_write_unlock(env, obj);
if (rc)
struct mdt_object *child);
int mdt_pack_encctx_in_reply(struct mdt_thread_info *info,
struct mdt_object *child);
+void mdt_prep_ma_buf_from_rep(struct mdt_thread_info *info,
+ struct mdt_object *obj, struct md_attr *ma,
+ __u64 open_flags);
static inline struct mdt_device *mdt_dev(struct lu_device *d)
{
/**
* prep ma_lmm/ma_lmv for md_attr from reply
*/
-static void mdt_prep_ma_buf_from_rep(struct mdt_thread_info *info,
- struct mdt_object *obj,
- struct md_attr *ma, __u64 open_flags)
+void mdt_prep_ma_buf_from_rep(struct mdt_thread_info *info,
+ struct mdt_object *obj, struct md_attr *ma,
+ __u64 open_flags)
{
struct req_capsule *pill;
struct ptlrpc_request *req = mdt_info_req(mti);
struct obd_export *exp = req->rq_export;
struct mdt_device *mdt = mti->mti_mdt;
+ struct md_attr *ma = &mti->mti_attr;
struct mdt_object *child;
struct mdt_body *body;
int rc;
}
body = req_capsule_server_get(mti->mti_pill, &RMF_MDT_BODY);
- mti->mti_attr.ma_need = MA_INODE;
- mti->mti_attr.ma_valid = 0;
- rc = mdt_attr_get_complex(mti, child, &mti->mti_attr);
+ ma->ma_need = MA_INODE;
+ if (S_ISDIR(ma->ma_attr.la_mode) &&
+ (mti->mti_spec.sp_cr_flags & MDS_MKDIR_LMV))
+ mdt_prep_ma_buf_from_rep(mti, child, ma, 0);
+ ma->ma_valid = 0;
+ rc = mdt_attr_get_complex(mti, child, ma);
if (rc == -ENOENT) {
- mdt_fake_ma(&mti->mti_attr);
+ mdt_fake_ma(ma);
} else if (rc == -EREMOTE) {
/* object was created on remote server */
if (!mdt_is_dne_client(exp))
req->rq_status = rc;
body->mbo_valid |= OBD_MD_MDS;
}
- mdt_pack_attr2body(mti, body, &mti->mti_attr.ma_attr,
- mdt_object_fid(child));
+ if (ma->ma_valid & MA_LMV) {
+ body->mbo_eadatasize = ma->ma_lmv_size;
+ body->mbo_valid |= (OBD_MD_FLDIREA|OBD_MD_MEA);
+ }
+ mdt_pack_attr2body(mti, body, &ma->ma_attr, mdt_object_fid(child));
mdt_object_put(mti->mti_env, child);
}
struct md_op_spec *spec = &info->mti_spec;
struct lu_ucred *uc = mdt_ucred(info);
bool restripe = false;
+ bool recreate_obj = false;
int rc;
ENTRY;
parent->mot_obj.lo_header->loh_attr & LOHA_FSCRYPT_MD)
GOTO(put_parent, rc = -EPERM);
+ info->mti_spec.sp_replay = req_is_replay(mdt_info_req(info));
+
/*
* LU-10235: check if name exists locklessly first to avoid massive
* lock recalls on existing directories.
*/
- rc = mdt_lookup_version_check(info, parent, &rr->rr_name,
- &info->mti_tmp_fid1, 1);
+ rc = mdo_lookup(info->mti_env, mdt_object_child(parent), &rr->rr_name,
+ &info->mti_tmp_fid1, &info->mti_spec);
if (rc == 0) {
- if (!restripe)
+ /* mkdir may be partially executed: name entry was successfully
+ * inserted into parent directory on remote MDT, while target not
+ * created on local MDT. This happens when update log recovery
+ * is aborted, and mkdir is replayed by client request.
+ */
+ if (unlikely(!(info->mti_spec.sp_replay &&
+ mdt_object_remote(parent)) &&
+ !restripe))
GOTO(put_parent, rc = -EEXIST);
- rc = mdt_restripe(info, parent, &rr->rr_name, rr->rr_fid2, spec,
- ma);
- }
+ child = mdt_object_find(info->mti_env, info->mti_mdt,
+ &info->mti_tmp_fid1);
+ if (unlikely(IS_ERR(child)))
+ GOTO(put_parent, rc = PTR_ERR(child));
- /* -ENOENT is expected here */
- if (rc != -ENOENT)
+ if (mdt_object_exists(child)) {
+ mdt_object_put(info->mti_env, child);
+ rc = -EEXIST;
+ if (restripe)
+ rc = mdt_restripe(info, parent, &rr->rr_name,
+ rr->rr_fid2, spec, ma);
+ GOTO(put_parent, rc);
+ }
+ mdt_object_put(info->mti_env, child);
+ recreate_obj = true;
+ } else if (rc != -ENOENT) {
GOTO(put_parent, rc);
+ }
- OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP, cfs_fail_val);
+ if (unlikely(info->mti_spec.sp_replay)) {
+ /* check version only during replay */
+ rc = mdt_version_check(mdt_info_req(info), ENOENT_VERSION, 1);
+ if (rc)
+ GOTO(put_parent, rc);
+ } else {
+ CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP,
+ cfs_fail_val);
- /* save version of file name for replay, it must be ENOENT here */
- mdt_enoent_version_save(info, 1);
+ /* save version of file name for replay, must be ENOENT here */
+ mdt_enoent_version_save(info, 1);
+ }
CFS_RACE(OBD_FAIL_MDS_CREATE_RACE);
*/
rc = mdo_lookup(info->mti_env, mdt_object_child(parent), &rr->rr_name,
&info->mti_tmp_fid1, &info->mti_spec);
- if (unlikely(rc == 0))
+ if (unlikely(rc == 0 && !recreate_obj))
GOTO(unlock_parent, rc = -EEXIST);
child = mdt_object_new(info->mti_env, mdt, rr->rr_fid2);
rc = mdo_create(info->mti_env, mdt_object_child(parent), &rr->rr_name,
mdt_object_child(child), &info->mti_spec, ma);
- if (rc == 0)
- rc = mdt_attr_get_complex(info, child, ma);
+ if (rc < 0)
+ GOTO(put_child, rc);
+ if (S_ISDIR(ma->ma_attr.la_mode) &&
+ (info->mti_spec.sp_cr_flags & MDS_MKDIR_LMV))
+ mdt_prep_ma_buf_from_rep(info, child, ma, 0);
+
+ rc = mdt_attr_get_complex(info, child, ma);
if (rc < 0)
GOTO(put_child, rc);
+ if (ma->ma_valid & MA_LMV) {
+ mdt_dump_lmv(D_INFO, ma->ma_lmv);
+ repbody->mbo_eadatasize = ma->ma_lmv_size;
+ repbody->mbo_valid |= (OBD_MD_FLDIREA|OBD_MD_MEA);
+ }
+
/* save child locks to eliminate dependey between 'mkdir a' and
* 'mkdir a/b' if b is a remote directory
*/
*/
LASSERT(buf->lb_len == sizeof(dt_obj_version_t));
- CDEBUG(D_INODE, "Set version %#llx (old %#llx) for inode %lu\n",
- *version, LDISKFS_I(inode)->i_fs_version, inode->i_ino);
+ CDEBUG(D_INODE,
+ DFID" set version %#llx (old %#llx) for inode %lu\n",
+ PFID(lu_object_fid(&dt->do_lu)), *version,
+ LDISKFS_I(inode)->i_fs_version, inode->i_ino);
LDISKFS_I(inode)->i_fs_version = *version;
/*
&RMF_FILE_ENCCTX,
};
+/* Server (reply) side fields of RQF_MDS_REINT_CREATE_ACL. RMF_MDT_MD is the
+ * buffer from which mdc_create() fetches the LMV EA returned for mkdir.
+ */
+static const struct req_msg_field *mds_reint_create_acl_server[] = {
+	&RMF_PTLRPC_BODY,
+	&RMF_MDT_BODY,
+	&RMF_CAPA1,
+	&RMF_MDT_MD,
+};
+
static const struct req_msg_field *mds_reint_open_client[] = {
&RMF_PTLRPC_BODY,
&RMF_REC_REINT,
struct req_format RQF_MDS_REINT_CREATE_ACL =
DEFINE_REQ_FMT0("MDS_REINT_CREATE_ACL",
- mds_reint_create_acl_client, mdt_body_capa);
+ mds_reint_create_acl_client,
+ mds_reint_create_acl_server);
EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_ACL);
struct req_format RQF_MDS_REINT_CREATE_SLAVE =
T130_PID=0
test_130_base() {
- test_mkdir -p $DIR/$tdir
+ test_mkdir -p -c1 $DIR/$tdir
# Prevent interference from layout intent RPCs due to
# asynchronous writeback. These will be tested in 130c below.
replay_barrier mds2
$LFS mkdir -i1 -c2 $striped_dir
- stack_trap fail_abort_cleanup RETURN
fail_abort mds2 abort_recov_mdt
- createmany -o $striped_dir/f-%d 20 &&
- error "createmany -o $DIR/$tfile should fail"
+ if (( $MDS1_VERSION >= $(version_code 2.15.54.138) )); then
+ # after 2.15.54.138 striped mkdir can replay by client request
+ createmany -o $striped_dir/f-%d 20 ||
+ error "createmany -o $DIR/$tfile failed"
+ fi
fail mds2
# LU-16159 abort_recovery will cancel update logs, the second recovery
# won't replay $striped_dir creation
- (( $MDS1_VERSION >= $(version_code 2.15.52) )) ||
- striped_dir_check_100 || error "striped dir check failed"
+ (( $MDS1_VERSION >= $(version_code 2.15.52) &&
+ $MDS1_VERSION < $(version_code 2.15.54.138) )) &&
+ fail_abort_cleanup && return 0
+
+ striped_dir_check_100 || error "striped dir check failed"
}
run_test 100c "DNE: create striped dir, abort_recov_mdt mds2"
}
run_test 100d "DNE: cancel update logs upon recovery abort"
+# Create a directory striped over MDT0 and MDT1, fail both MDTs, and verify
+# that the layout reported by "lfs getdirstripe" is unchanged afterwards.
+test_100e() {
+	(( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
+	(( MDS1_VERSION >= $(version_code 2.15.54.79) )) ||
+		skip "Need MDS version 2.15.54.79+"
+	[[ $FAILURE_MODE != "HARD" ||
+		"$(facet_host mds1)" != "$(facet_host mds2)" ]] ||
+		skip "MDTs needs to be on diff hosts for HARD fail mode"
+
+	local old
+	local new
+	local striped_dir=$DIR/$tdir/striped_dir
+
+	mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+
+	replay_barrier mds1
+	replay_barrier mds2
+
+	# Every step is checked: if the mkdir or a getdirstripe failed
+	# silently, $old and $new would both be empty and compare equal.
+	$LFS mkdir -i 0,1 $striped_dir ||
+		error "$LFS mkdir -i 0,1 $striped_dir failed"
+	old=$($LFS getdirstripe $striped_dir) ||
+		error "getdirstripe $striped_dir failed before failover"
+	# quoted so the multi-line layout is logged as printed
+	echo "$old"
+
+	fail mds1,mds2
+
+	new=$($LFS getdirstripe $striped_dir) ||
+		error "getdirstripe $striped_dir failed after failover"
+	echo "$new"
+	[[ "$old" == "$new" ]] ||
+		error "$striped_dir layout mismatch"
+	rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 100e "DNE: create striped dir on MDT0 and MDT1, fail MDT0, MDT1"
+
test_101() { #LU-5648
mkdir -p $DIR/$tdir/d1
mkdir -p $DIR/$tdir/d2