+ struct mdd_object *o1, *o2, *fst_o, *snd_o;
+ struct lu_buf *lmm1_buf = NULL, *lmm2_buf = NULL;
+ struct lu_buf *fst_buf, *snd_buf;
+ struct lov_mds_md *fst_lmm, *snd_lmm, *old_fst_lmm = NULL;
+ struct thandle *handle;
+ struct mdd_device *mdd = mdo2mdd(obj1);
+ int rc;
+ __u16 fst_gen, snd_gen;
+ ENTRY;
+
+ /* we have to sort the 2 obj, so locking will always
+ * be in the same order, even in case of 2 concurrent swaps */
+ rc = lu_fid_cmp(mdo2fid(md2mdd_obj(obj1)),
+ mdo2fid(md2mdd_obj(obj2)));
+ /* same fid ? */
+ if (rc == 0)
+ RETURN(-EPERM);
+
+ if (rc > 0) {
+ o1 = md2mdd_obj(obj1);
+ o2 = md2mdd_obj(obj2);
+ } else {
+ o1 = md2mdd_obj(obj2);
+ o2 = md2mdd_obj(obj1);
+ }
+
+ /* check if layout swapping is allowed */
+ rc = mdd_layout_swap_allowed(env, o1, o2);
+ if (rc)
+ RETURN(rc);
+
+ handle = mdd_trans_create(env, mdd);
+ if (IS_ERR(handle))
+ RETURN(PTR_ERR(handle));
+
+ /* objects are already sorted */
+ mdd_write_lock(env, o1, MOR_TGT_CHILD);
+ mdd_write_lock(env, o2, MOR_TGT_CHILD);
+
+ lmm1_buf = mdd_get_lov_ea(env, o1);
+ if (IS_ERR(lmm1_buf)) {
+ rc = PTR_ERR(lmm1_buf);
+ lmm1_buf = NULL;
+ if (rc != -ENODATA)
+ GOTO(unlock, rc);
+ }
+
+ lmm2_buf = mdd_get_lov_ea(env, o2);
+ if (IS_ERR(lmm2_buf)) {
+ rc = PTR_ERR(lmm2_buf);
+ lmm2_buf = NULL;
+ if (rc != -ENODATA)
+ GOTO(unlock, rc);
+ }
+
+ /* swapping 2 non-existent layouts is a success */
+ if ((lmm1_buf == NULL) && (lmm2_buf == NULL))
+ GOTO(unlock, rc = 0);
+
+ /* to help inode migration between MDTs, it is better to start
+ * with the file that has no layout (if any), so we order the swap */
+ if (lmm1_buf == NULL) {
+ fst_o = o1;
+ fst_buf = lmm1_buf;
+ snd_o = o2;
+ snd_buf = lmm2_buf;
+ } else {
+ fst_o = o2;
+ fst_buf = lmm2_buf;
+ snd_o = o1;
+ snd_buf = lmm1_buf;
+ }
+
+ /* lmm and generation layout initialization */
+ if (fst_buf) {
+ fst_lmm = fst_buf->lb_buf;
+ fst_gen = le16_to_cpu(fst_lmm->lmm_layout_gen);
+ } else {
+ fst_lmm = NULL;
+ fst_gen = 0;
+ }
+
+ if (snd_buf) {
+ snd_lmm = snd_buf->lb_buf;
+ snd_gen = le16_to_cpu(snd_lmm->lmm_layout_gen);
+ } else {
+ snd_lmm = NULL;
+ snd_gen = 0;
+ }
+
+ /* save the original lmm common header of first file
+ * to be able to roll back */
+ OBD_ALLOC_PTR(old_fst_lmm);
+ if (old_fst_lmm == NULL)
+ GOTO(unlock, rc = -ENOMEM);
+
+ memcpy(old_fst_lmm, fst_lmm, sizeof(*old_fst_lmm));
+
+ /* increase the generation layout numbers */
+ snd_gen++;
+ fst_gen++;
+
+ /* set the file specific information in lmm */
+ if (fst_lmm) {
+ fst_lmm->lmm_layout_gen = cpu_to_le16(snd_gen);
+ fst_lmm->lmm_object_seq = snd_lmm->lmm_object_seq;
+ fst_lmm->lmm_object_id = snd_lmm->lmm_object_id;
+ }
+
+ if (snd_lmm) {
+ snd_lmm->lmm_layout_gen = cpu_to_le16(fst_gen);
+ snd_lmm->lmm_object_seq = old_fst_lmm->lmm_object_seq;
+ snd_lmm->lmm_object_id = old_fst_lmm->lmm_object_id;
+ }
+
+ /* prepare transaction */
+ rc = mdd_declare_xattr_set(env, mdd, fst_o, snd_buf, XATTR_NAME_LOV,
+ LU_XATTR_REPLACE, handle);
+ if (rc)
+ GOTO(stop, rc);
+
+ rc = mdd_declare_xattr_set(env, mdd, snd_o, fst_buf, XATTR_NAME_LOV,
+ LU_XATTR_REPLACE, handle);
+ if (rc)
+ GOTO(stop, rc);
+
+ rc = mdd_trans_start(env, mdd, handle);
+ if (rc)
+ GOTO(stop, rc);
+
+ rc = mdo_xattr_set(env, fst_o, snd_buf, XATTR_NAME_LOV,
+ LU_XATTR_REPLACE, handle,
+ mdd_object_capa(env, fst_o));
+ if (rc)
+ GOTO(stop, rc);
+
+ rc = mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV,
+ LU_XATTR_REPLACE, handle,
+ mdd_object_capa(env, snd_o));
+ if (rc) {
+ int rc2;
+
+ /* failure on second file, but first was done, so we have
+ * to roll back first */
+ /* restore object_id, object_seq and generation number
+ * on first file */
+ if (fst_lmm) {
+ fst_lmm->lmm_object_id = old_fst_lmm->lmm_object_id;
+ fst_lmm->lmm_object_seq = old_fst_lmm->lmm_object_seq;
+ fst_lmm->lmm_layout_gen = old_fst_lmm->lmm_layout_gen;
+ }
+
+ rc2 = mdo_xattr_set(env, fst_o, fst_buf, XATTR_NAME_LOV,
+ LU_XATTR_REPLACE, handle,
+ mdd_object_capa(env, fst_o));
+ if (rc2) {
+ /* very bad day */
+ CERROR("%s: unable to roll back after swap layouts"
+ " failure between "DFID" and "DFID
+ " rc2 = %d rc = %d)\n",
+ mdd2obd_dev(mdd)->obd_name,
+ PFID(mdo2fid(snd_o)), PFID(mdo2fid(fst_o)),
+ rc2, rc);
+ /* a solution to avoid journal commit is to panic,
+ * but it has strong consequences so we use LBUG to
+ * allow sysadmin to choose to panic or not
+ */
+ LBUG();
+ }
+ GOTO(stop, rc);
+ }
+ EXIT;
+
+stop:
+ mdd_trans_stop(env, mdd, rc, handle);
+unlock:
+ mdd_write_unlock(env, o2);
+ mdd_write_unlock(env, o1);