+/**
+ * Swap the layouts (LOV EA) of two Lustre objects and, when requested, their
+ * HSM extended attributes.
+ *
+ * The two objects are ordered by FID before being write-locked so that two
+ * concurrent swaps on the same pair always take the locks in the same order.
+ * The layout generation of each layout is incremented, and the object ids
+ * stored in the two layouts are exchanged so that each layout keeps referring
+ * to the file it ends up attached to.  If only one object has a layout, the
+ * layout is moved: it is created on the object without one and deleted from
+ * the other.
+ *
+ * If updating the second object fails after the first one was updated, the
+ * first object is rolled back; a failed rollback triggers LBUG().
+ *
+ * \param[in] env	execution environment
+ * \param[in] obj1	first object
+ * \param[in] obj2	second object
+ * \param[in] flags	swap flags; SWAP_LAYOUTS_MDS_HSM additionally swaps the
+ *			XATTR_NAME_HSM attribute of the two objects
+ *
+ * \retval 0		on success, or if neither object has a layout
+ * \retval -EPERM	if both objects have the same FID
+ * \retval -EPROTO	if a layout has to be moved to an object without one
+ *			and its magic is neither LOV_MAGIC_V1 nor LOV_MAGIC_V3
+ * \retval -ENOMEM	if the HSM attribute buffers cannot be allocated
+ * \retval negative	any other error returned by the called helpers
+ */
+static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1,
+			    struct md_object *obj2, __u64 flags)
+{
+	struct mdd_thread_info	*info = mdd_env_info(env);
+	struct mdd_object	*fst_o = md2mdd_obj(obj1);
+	struct mdd_object	*snd_o = md2mdd_obj(obj2);
+	struct mdd_device	*mdd = mdo2mdd(obj1);
+	struct lov_mds_md	*fst_lmm, *snd_lmm;
+	struct lu_buf		*fst_buf = &info->mti_buf[0];
+	struct lu_buf		*snd_buf = &info->mti_buf[1];
+	struct lu_buf		*fst_hsm_buf = &info->mti_buf[2];
+	struct lu_buf		*snd_hsm_buf = &info->mti_buf[3];
+	struct ost_id		*saved_oi = NULL;
+	struct thandle		*handle;
+	__u16			 fst_gen, snd_gen;
+	int			 fst_fl;
+	int			 rc;
+	int			 rc2;
+	ENTRY;
+
+	/* four per-thread buffers are used: two layouts, two HSM xattrs.
+	 * They start zeroed so that lu_buf_free() at "stop" is always safe
+	 * and a NULL lb_buf means "nothing was read/allocated". */
+	CLASSERT(ARRAY_SIZE(info->mti_buf) >= 4);
+	memset(info->mti_buf, 0, sizeof(info->mti_buf));
+
+	/* we have to sort the 2 obj, so locking will always
+	 * be in the same order, even in case of 2 concurrent swaps */
+	rc = lu_fid_cmp(mdo2fid(fst_o), mdo2fid(snd_o));
+	if (rc == 0) /* same fid ? */
+		RETURN(-EPERM);
+
+	if (rc < 0)
+		swap(fst_o, snd_o);
+
+	/* check if layout swapping is allowed */
+	rc = mdd_layout_swap_allowed(env, fst_o, snd_o);
+	if (rc != 0)
+		RETURN(rc);
+
+	handle = mdd_trans_create(env, mdd);
+	if (IS_ERR(handle))
+		RETURN(PTR_ERR(handle));
+
+	/* objects are already sorted */
+	mdd_write_lock(env, fst_o, MOR_TGT_CHILD);
+	mdd_write_lock(env, snd_o, MOR_TGT_CHILD);
+
+	/* -ENODATA is tolerated: the object simply has no layout and its
+	 * buffer keeps a NULL lb_buf */
+	rc = mdd_get_lov_ea(env, fst_o, fst_buf);
+	if (rc < 0 && rc != -ENODATA)
+		GOTO(stop, rc);
+
+	rc = mdd_get_lov_ea(env, snd_o, snd_buf);
+	if (rc < 0 && rc != -ENODATA)
+		GOTO(stop, rc);
+
+	/* swapping 2 nonexistent layouts is a success */
+	if (fst_buf->lb_buf == NULL && snd_buf->lb_buf == NULL)
+		GOTO(stop, rc = 0);
+
+	/* to help inode migration between MDT, it is better to
+	 * start by the no layout file (if one), so we order the swap.
+	 * After this point snd_buf always holds a layout. */
+	if (snd_buf->lb_buf == NULL) {
+		swap(fst_o, snd_o);
+		swap(fst_buf, snd_buf);
+	}
+
+	/* lmm and generation layout initialization */
+	if (fst_buf->lb_buf != NULL) {
+		fst_lmm = fst_buf->lb_buf;
+		fst_gen = le16_to_cpu(fst_lmm->lmm_layout_gen);
+		fst_fl  = LU_XATTR_REPLACE;
+	} else {
+		fst_lmm = NULL;
+		fst_gen = 0;
+		fst_fl  = LU_XATTR_CREATE;
+	}
+
+	snd_lmm = snd_buf->lb_buf;
+	snd_gen = le16_to_cpu(snd_lmm->lmm_layout_gen);
+
+	/* increase the generation layout numbers */
+	snd_gen++;
+	fst_gen++;
+
+	/* set the file specific informations in lmm */
+	if (fst_lmm != NULL) {
+		/* exchange the object ids of the two layouts; the original
+		 * id of the first layout is kept for a possible rollback */
+		saved_oi = &info->mti_oa.o_oi;
+
+		*saved_oi = fst_lmm->lmm_oi;
+		fst_lmm->lmm_layout_gen = cpu_to_le16(snd_gen);
+		fst_lmm->lmm_oi = snd_lmm->lmm_oi;
+		snd_lmm->lmm_oi = *saved_oi;
+	} else {
+		/* the layout is moved to an object without one: switch its
+		 * magic to the matching *_DEF value */
+		if (snd_lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1))
+			snd_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
+		else if (snd_lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3))
+			snd_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V3_DEF);
+		else
+			GOTO(stop, rc = -EPROTO);
+	}
+	snd_lmm->lmm_layout_gen = cpu_to_le16(fst_gen);
+
+	/* Prepare HSM attribute if it's required */
+	if (flags & SWAP_LAYOUTS_MDS_HSM) {
+		const int buflen = sizeof(struct hsm_attrs);
+
+		lu_buf_alloc(fst_hsm_buf, buflen);
+		lu_buf_alloc(snd_hsm_buf, buflen);
+		if (fst_hsm_buf->lb_buf == NULL || snd_hsm_buf->lb_buf == NULL)
+			GOTO(stop, rc = -ENOMEM);
+
+		/* Read HSM attribute */
+		rc = mdo_xattr_get(env, fst_o, fst_hsm_buf, XATTR_NAME_HSM,
+				   BYPASS_CAPA);
+		if (rc < 0)
+			GOTO(stop, rc);
+
+		rc = mdo_xattr_get(env, snd_o, snd_hsm_buf, XATTR_NAME_HSM,
+				   BYPASS_CAPA);
+		if (rc < 0)
+			GOTO(stop, rc);
+
+		/* each object receives the HSM attribute of the other one */
+		rc = mdd_declare_xattr_set(env, mdd, fst_o, snd_hsm_buf,
+					   XATTR_NAME_HSM, LU_XATTR_REPLACE,
+					   handle);
+		if (rc < 0)
+			GOTO(stop, rc);
+
+		rc = mdd_declare_xattr_set(env, mdd, snd_o, fst_hsm_buf,
+					   XATTR_NAME_HSM, LU_XATTR_REPLACE,
+					   handle);
+		if (rc < 0)
+			GOTO(stop, rc);
+	}
+
+	/* prepare transaction */
+	rc = mdd_declare_xattr_set(env, mdd, fst_o, snd_buf, XATTR_NAME_LOV,
+				   fst_fl, handle);
+	if (rc != 0)
+		GOTO(stop, rc);
+
+	if (fst_buf->lb_buf != NULL)
+		rc = mdd_declare_xattr_set(env, mdd, snd_o, fst_buf,
+					   XATTR_NAME_LOV, LU_XATTR_REPLACE,
+					   handle);
+	else
+		rc = mdd_declare_xattr_del(env, mdd, snd_o, XATTR_NAME_LOV,
+					   handle);
+	if (rc != 0)
+		GOTO(stop, rc);
+
+	rc = mdd_trans_start(env, mdd, handle);
+	if (rc != 0)
+		GOTO(stop, rc);
+
+	if (flags & SWAP_LAYOUTS_MDS_HSM) {
+		rc = mdd_xattr_hsm_replace(env, fst_o, snd_hsm_buf, handle);
+		if (rc < 0)
+			GOTO(stop, rc);
+
+		rc = mdd_xattr_hsm_replace(env, snd_o, fst_hsm_buf, handle);
+		if (rc < 0) {
+			/* second HSM update failed: put the original HSM
+			 * attribute back on the first object */
+			rc2 = mdd_xattr_hsm_replace(env, fst_o, fst_hsm_buf,
+						    handle);
+			if (rc2 < 0)
+				CERROR("%s: restore "DFID" HSM error: %d/%d\n",
+				       mdd_obj_dev_name(fst_o),
+				       PFID(mdo2fid(fst_o)), rc, rc2);
+			GOTO(stop, rc);
+		}
+	}
+
+	/* NOTE(review): if the HSM attributes were swapped just above and
+	 * this first layout update fails, nothing in this function restores
+	 * them - confirm whether that is acceptable. */
+	rc = mdo_xattr_set(env, fst_o, snd_buf, XATTR_NAME_LOV, fst_fl, handle,
+			   mdd_object_capa(env, fst_o));
+	if (rc != 0)
+		GOTO(stop, rc);
+
+	if (fst_buf->lb_buf != NULL)
+		rc = mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV,
+				   LU_XATTR_REPLACE, handle,
+				   mdd_object_capa(env, snd_o));
+	else
+		rc = mdo_xattr_del(env, snd_o, XATTR_NAME_LOV, handle,
+				   mdd_object_capa(env, snd_o));
+	if (rc != 0) {
+		/* number of rollback steps completed, reported on failure */
+		int steps = 0;
+
+		/* failure on second file, but first was done, so we have
+		 * to roll back first. */
+		if (fst_buf->lb_buf != NULL) {
+			/* undo the in-memory changes made to the first
+			 * layout before writing it back */
+			fst_lmm->lmm_oi = *saved_oi;
+			fst_lmm->lmm_layout_gen = cpu_to_le16(fst_gen - 1);
+			rc2 = mdo_xattr_set(env, fst_o, fst_buf, XATTR_NAME_LOV,
+					    LU_XATTR_REPLACE, handle,
+					    mdd_object_capa(env, fst_o));
+		} else {
+			rc2 = mdo_xattr_del(env, fst_o, XATTR_NAME_LOV, handle,
+					    mdd_object_capa(env, fst_o));
+		}
+		if (rc2 < 0)
+			goto do_lbug;
+
+		/* the HSM attributes were only read and swapped when
+		 * SWAP_LAYOUTS_MDS_HSM was requested; otherwise the HSM
+		 * buffers are still empty and there is nothing to restore */
+		if (flags & SWAP_LAYOUTS_MDS_HSM) {
+			++steps;
+			rc2 = mdd_xattr_hsm_replace(env, fst_o, fst_hsm_buf,
+						    handle);
+			if (rc2 < 0)
+				goto do_lbug;
+
+			++steps;
+			rc2 = mdd_xattr_hsm_replace(env, snd_o, snd_hsm_buf,
+						    handle);
+		}
+
+	do_lbug:
+		if (rc2 < 0) {
+			/* very bad day */
+			CERROR("%s: unable to roll back layout swap. FIDs: "
+			       DFID" and "DFID" error: %d/%d, steps: %d\n",
+			       mdd_obj_dev_name(fst_o),
+			       PFID(mdo2fid(snd_o)), PFID(mdo2fid(fst_o)),
+			       rc, rc2, steps);
+			/* a solution to avoid journal commit is to panic,
+			 * but it has strong consequences so we use LBUG to
+			 * allow sysadmin to choose to panic or not
+			 */
+			LBUG();
+		}
+		GOTO(stop, rc);
+	}
+
+	/* Issue one changelog record per file */
+	rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, fst_o, handle);
+	if (rc)
+		GOTO(stop, rc);
+
+	rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, snd_o, handle);
+	if (rc)
+		GOTO(stop, rc);
+	EXIT;
+
+stop:
+	/* common exit: close the transaction, drop both locks and release
+	 * whatever buffers were filled in */
+	mdd_trans_stop(env, mdd, rc, handle);
+	mdd_write_unlock(env, snd_o);
+	mdd_write_unlock(env, fst_o);
+
+	lu_buf_free(fst_buf);
+	lu_buf_free(snd_buf);
+	lu_buf_free(fst_hsm_buf);
+	lu_buf_free(snd_hsm_buf);
+	return rc;
+}
+
+void mdd_object_make_hint(const struct lu_env *env, struct mdd_object *parent,
+ struct mdd_object *child, struct lu_attr *attr)
+{
+ struct dt_allocation_hint *hint = &mdd_env_info(env)->mti_hint;
+ struct dt_object *np = parent ? mdd_object_child(parent) : NULL;
+ struct dt_object *nc = mdd_object_child(child);
+
+ /* @hint will be initialized by underlying device. */
+ nc->do_ops->do_ah_init(env, hint, np, nc, attr->la_mode & S_IFMT);