+ mdt_object_unlock(info, NULL, lh, 1);
+
+ return 0;
+}
+
+/*
+ * operation may takes locks of linkea, or directory stripes, group them in
+ * different list.
+ */
+struct mdt_sub_lock {
+ struct mdt_object *msl_obj;
+ struct mdt_lock_handle msl_lh;
+ struct list_head msl_linkage;
+};
+
+ /*
+ * Release every sub-lock queued on \a list and free its tracking entry.
+ *
+ * \param[in] info	thread info handed to mdt_object_unlock_put()
+ * \param[in] list	list of struct mdt_sub_lock, linked through msl_linkage
+ * \param[in] decref	passed unchanged to mdt_object_unlock_put()
+ */
+ static void mdt_unlock_list(struct mdt_thread_info *info,
+ 			    struct list_head *list, int decref)
+ {
+ 	struct mdt_sub_lock *cur;
+ 	struct mdt_sub_lock *next;
+
+ 	/* entries are unlinked and freed while walking, hence the _safe form */
+ 	list_for_each_entry_safe(cur, next, list, msl_linkage) {
+ 		struct mdt_object *locked = cur->msl_obj;
+
+ 		mdt_object_unlock_put(info, locked, &cur->msl_lh, decref);
+ 		list_del(&cur->msl_linkage);
+ 		OBD_FREE_PTR(cur);
+ 	}
+ }
+
+/*
+ * lock parents of links, and also check whether total locks don't exceed
+ * RS_MAX_LOCKS.
+ *
+ * \retval 0 on success, and locks can be saved in ptlrpc_reply_stat
+ * \retval 1 on success, but total lock count may exceed RS_MAX_LOCKS
+ * \retval -ev negative errno upon error
+ */
+static int mdt_lock_links(struct mdt_thread_info *info,
+ struct mdt_object *pobj,
+ const struct md_attr *ma,
+ struct mdt_object *obj,
+ struct list_head *link_locks)
+{
+ struct mdt_device *mdt = info->mti_mdt;
+ struct lu_buf *buf = &info->mti_big_buf;
+ struct lu_name *lname = &info->mti_name;
+ struct linkea_data ldata = { NULL };
+ bool blocked = false;
+ int retries = 5;
+ int local_lnkp_cnt = 0;
+ int rc;
+
+ ENTRY;
+
+ if (S_ISDIR(lu_object_attr(&obj->mot_obj)))
+ RETURN(0);
+
+ buf = lu_buf_check_and_alloc(buf, MAX_LINKEA_SIZE);
+ if (buf->lb_buf == NULL)
+ RETURN(-ENOMEM);
+
+ ldata.ld_buf = buf;
+ rc = mdt_links_read(info, obj, &ldata);
+ if (rc) {
+ if (rc == -ENOENT || rc == -ENODATA)
+ rc = 0;
+ RETURN(rc);
+ }
+
+repeat:
+ for (linkea_first_entry(&ldata); ldata.ld_lee && !rc;
+ linkea_next_entry(&ldata)) {
+ struct mdt_object *lnkp;
+ struct mdt_sub_lock *msl;
+ struct lu_fid fid;
+ __u64 ibits;
+
+ linkea_entry_unpack(ldata.ld_lee, &ldata.ld_reclen, lname,
+ &fid);
+
+ /* check if it's also linked to parent */
+ if (lu_fid_eq(mdt_object_fid(pobj), &fid)) {
+ CDEBUG(D_INFO, "skip parent "DFID", reovke "DNAME"\n",
+ PFID(&fid), PNAME(lname));
+ /* in case link is remote object, revoke LOOKUP lock */
+ rc = mdt_revoke_remote_lookup_lock(info, pobj, obj);
+ continue;
+ }
+
+ lnkp = NULL;
+
+ /* check if it's linked to a stripe of parent */
+ if (ma->ma_valid & MA_LMV) {
+ struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1;
+ struct lu_fid *stripe_fid = &info->mti_tmp_fid1;
+ int j = 0;
+
+ for (; j < le32_to_cpu(lmv->lmv_stripe_count); j++) {
+ fid_le_to_cpu(stripe_fid,
+ &lmv->lmv_stripe_fids[j]);
+ if (lu_fid_eq(stripe_fid, &fid)) {
+ CDEBUG(D_INFO, "skip stripe "DFID
+ ", reovke "DNAME"\n",
+ PFID(&fid), PNAME(lname));
+ lnkp = mdt_object_find(info->mti_env,
+ mdt, &fid);
+ if (IS_ERR(lnkp))
+ GOTO(out, rc = PTR_ERR(lnkp));
+ break;
+ }
+ }
+
+ if (lnkp) {
+ rc = mdt_revoke_remote_lookup_lock(info, lnkp,
+ obj);
+ mdt_object_put(info->mti_env, lnkp);
+ continue;
+ }
+ }
+
+ /* Check if it's already locked */
+ list_for_each_entry(msl, link_locks, msl_linkage) {
+ if (lu_fid_eq(mdt_object_fid(msl->msl_obj), &fid)) {
+ CDEBUG(D_INFO,
+ DFID" was locked, revoke "DNAME"\n",
+ PFID(&fid), PNAME(lname));
+ lnkp = msl->msl_obj;
+ break;
+ }
+ }
+
+ if (lnkp) {
+ rc = mdt_revoke_remote_lookup_lock(info, lnkp, obj);
+ continue;
+ }
+
+ CDEBUG(D_INFO, "lock "DFID":"DNAME"\n",
+ PFID(&fid), PNAME(lname));
+
+ lnkp = mdt_object_find(info->mti_env, mdt, &fid);
+ if (IS_ERR(lnkp)) {
+ CWARN("%s: cannot find obj "DFID": %ld\n",
+ mdt_obd_name(mdt), PFID(&fid), PTR_ERR(lnkp));
+ continue;
+ }
+
+ if (!mdt_object_exists(lnkp)) {
+ CDEBUG(D_INFO, DFID" doesn't exist, skip "DNAME"\n",
+ PFID(&fid), PNAME(lname));
+ mdt_object_put(info->mti_env, lnkp);
+ continue;
+ }
+
+ if (!mdt_object_remote(lnkp))
+ local_lnkp_cnt++;
+
+ OBD_ALLOC_PTR(msl);
+ if (msl == NULL)
+ GOTO(out, rc = -ENOMEM);
+
+ /*
+ * we can't follow parent-child lock order like other MD
+ * operations, use lock_try here to avoid deadlock, if the lock
+ * cannot be taken, drop all locks taken, revoke the blocked
+ * one, and continue processing the remaining entries, and in
+ * the end of the loop restart from beginning.
+ */
+ mdt_lock_pdo_init(&msl->msl_lh, LCK_PW, lname);
+ ibits = 0;
+ rc = mdt_object_lock_try(info, lnkp, &msl->msl_lh, &ibits,
+ MDS_INODELOCK_UPDATE, true);
+ if (!(ibits & MDS_INODELOCK_UPDATE)) {
+ blocked = true;
+
+ CDEBUG(D_INFO, "busy lock on "DFID" "DNAME" retry %d\n",
+ PFID(&fid), PNAME(lname), retries);
+
+ mdt_unlock_list(info, link_locks, 1);
+
+ mdt_lock_pdo_init(&msl->msl_lh, LCK_PW, lname);
+ rc = mdt_object_lock(info, lnkp, &msl->msl_lh,
+ MDS_INODELOCK_UPDATE);
+ if (rc) {
+ mdt_object_put(info->mti_env, lnkp);
+ OBD_FREE_PTR(msl);
+ GOTO(out, rc);
+ }
+
+ if (mdt_object_remote(lnkp)) {
+ struct ldlm_lock *lock;
+
+ /*
+ * for remote object, set lock cb_atomic,
+ * so lock can be released in blocking_ast()
+ * immediately, then the next lock_try will
+ * have better chance of success.
+ */
+ lock = ldlm_handle2lock(
+ &msl->msl_lh.mlh_rreg_lh);
+ LASSERT(lock != NULL);
+ lock_res_and_lock(lock);
+ ldlm_set_atomic_cb(lock);
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+ }
+
+ mdt_object_unlock_put(info, lnkp, &msl->msl_lh, 1);
+ OBD_FREE_PTR(msl);
+ continue;
+ }
+
+ INIT_LIST_HEAD(&msl->msl_linkage);
+ msl->msl_obj = lnkp;
+ list_add_tail(&msl->msl_linkage, link_locks);
+
+ rc = mdt_revoke_remote_lookup_lock(info, lnkp, obj);
+ }
+
+ if (blocked) {
+ rc = -EBUSY;
+ if (--retries > 0) {
+ mdt_unlock_list(info, link_locks, rc);
+ blocked = false;
+ local_lnkp_cnt = 0;
+ goto repeat;
+ }
+ }
+
+ EXIT;
+out:
+ if (rc)
+ mdt_unlock_list(info, link_locks, rc);
+ else if (local_lnkp_cnt > RS_MAX_LOCKS - 6)
+ /*
+ * parent may have 3 local objects: master object and 2 stripes
+ * (if it's being migrated too); source may have 2 local
+ * objects: master and 1 stripe; target has 1 local object.
+ */
+ rc = 1;
+ return rc;
+}
+
+ /*
+ * Lock every stripe listed in the LMV of remote object \a obj with an EX
+ * UPDATE lock and queue the locks on \a slave_locks.
+ *
+ * \param[in] info		thread info
+ * \param[in] obj		remote object (asserted)
+ * \param[in] ma		attributes with a valid LMV (asserted)
+ * \param[in,out] slave_locks	list receiving one mdt_sub_lock per stripe
+ *
+ * \retval 0		on success, also when the LMV lists no stripe
+ * \retval -EINVAL	if the LMV magic is not LMV_MAGIC_V1
+ * \retval -ev		other negative errno; locks on \a slave_locks are
+ *			released
+ */
+ static int mdt_lock_remote_slaves(struct mdt_thread_info *info,
+ 				  struct mdt_object *obj,
+ 				  const struct md_attr *ma,
+ 				  struct list_head *slave_locks)
+ {
+ 	const struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1;
+ 	struct mdt_device *mdt = info->mti_mdt;
+ 	struct lu_fid *stripe_fid = &info->mti_tmp_fid1;
+ 	int idx;
+ 	int rc;
+
+ 	ENTRY;
+
+ 	LASSERT(mdt_object_remote(obj));
+ 	LASSERT(ma->ma_valid & MA_LMV);
+ 	LASSERT(lmv);
+
+ 	if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
+ 		RETURN(-EINVAL);
+
+ 	/* no stripe, no lock to take */
+ 	if (le32_to_cpu(lmv->lmv_stripe_count) == 0)
+ 		RETURN(0);
+
+ 	for (idx = 0; idx < le32_to_cpu(lmv->lmv_stripe_count); idx++) {
+ 		struct mdt_object *slave;
+ 		struct mdt_sub_lock *entry;
+
+ 		fid_le_to_cpu(stripe_fid, &lmv->lmv_stripe_fids[idx]);
+
+ 		slave = mdt_object_find(info->mti_env, mdt, stripe_fid);
+ 		if (IS_ERR(slave))
+ 			GOTO(out, rc = PTR_ERR(slave));
+
+ 		OBD_ALLOC_PTR(entry);
+ 		if (entry == NULL) {
+ 			mdt_object_put(info->mti_env, slave);
+ 			GOTO(out, rc = -ENOMEM);
+ 		}
+
+ 		mdt_lock_reg_init(&entry->msl_lh, LCK_EX);
+ 		rc = mdt_reint_object_lock(info, slave, &entry->msl_lh,
+ 					   MDS_INODELOCK_UPDATE, true);
+ 		if (rc) {
+ 			/* not queued yet: undo allocation and reference here */
+ 			OBD_FREE_PTR(entry);
+ 			mdt_object_put(info->mti_env, slave);
+ 			GOTO(out, rc);
+ 		}
+
+ 		/* from here on the list entry owns the stripe and its lock */
+ 		INIT_LIST_HEAD(&entry->msl_linkage);
+ 		entry->msl_obj = slave;
+ 		list_add_tail(&entry->msl_linkage, slave_locks);
+ 	}
+ 	EXIT;
+
+ out:
+ 	if (rc)
+ 		mdt_unlock_list(info, slave_locks, rc);
+ 	return rc;
+ }
+
+ /*
+ * Unlock \a obj together with its stripes.
+ *
+ * A local object is released through mdt_reint_striped_unlock(). For a remote
+ * object the stripe locks queued on \a slave_locks are released first, then
+ * the lock in \a lh.
+ */
+ static inline void mdt_migrate_object_unlock(struct mdt_thread_info *info,
+ 					     struct mdt_object *obj,
+ 					     struct mdt_lock_handle *lh,
+ 					     struct ldlm_enqueue_info *einfo,
+ 					     struct list_head *slave_locks,
+ 					     int decref)
+ {
+ 	if (!mdt_object_remote(obj)) {
+ 		mdt_reint_striped_unlock(info, obj, lh, einfo, decref);
+ 		return;
+ 	}
+
+ 	mdt_unlock_list(info, slave_locks, decref);
+ 	mdt_object_unlock(info, obj, lh, decref);
+ }
+
+ /*
+ * Lock parent \a obj and its stripes with UPDATE locks.
+ *
+ * A local parent is locked through mdt_reint_striped_lock(). A remote parent
+ * gets a PW remote lock in \a lh, and when \a ma carries an LMV its stripes
+ * are locked one by one onto \a slave_locks; if that fails the remote lock is
+ * released again before returning.
+ *
+ * \retval 0	on success
+ * \retval -ev	error returned by the failing lock call
+ */
+ static int mdt_migrate_parent_lock(struct mdt_thread_info *info,
+ 				   struct mdt_object *obj,
+ 				   const struct md_attr *ma,
+ 				   struct mdt_lock_handle *lh,
+ 				   struct ldlm_enqueue_info *einfo,
+ 				   struct list_head *slave_locks)
+ {
+ 	int rc;
+
+ 	if (!mdt_object_remote(obj))
+ 		return mdt_reint_striped_lock(info, obj, lh,
+ 					      MDS_INODELOCK_UPDATE, einfo,
+ 					      true);
+
+ 	rc = mdt_remote_object_lock(info, obj, mdt_object_fid(obj),
+ 				    &lh->mlh_rreg_lh, LCK_PW,
+ 				    MDS_INODELOCK_UPDATE, false);
+ 	if (rc != ELDLM_OK)
+ 		return rc;
+
+ 	/* not striped: the remote lock alone is enough */
+ 	if (!(ma->ma_valid & MA_LMV))
+ 		return rc;
+
+ 	/*
+ 	 * obj is remote and striped, lock its stripes explicitly because it's
+ 	 * not striped in LOD layer on this MDT.
+ 	 */
+ 	rc = mdt_lock_remote_slaves(info, obj, ma, slave_locks);
+ 	if (rc)
+ 		mdt_object_unlock(info, obj, lh, rc);
+
+ 	return rc;
+ }
+
+/*
+ * in migration, object may be remote, and we need take full lock of it and its
+ * stripes if it's directory, besides, object may be a remote object on its
+ * parent, revoke its LOOKUP lock on where its parent is located.
+ */
+static int mdt_migrate_object_lock(struct mdt_thread_info *info,
+ struct mdt_object *pobj,
+ struct mdt_object *obj,
+ struct mdt_lock_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ struct list_head *slave_locks)
+{
+ int rc;
+
+ if (mdt_object_remote(obj)) {
+ /* don't bother to check if pobj and obj are on the same MDT. */
+ rc = mdt_revoke_remote_lookup_lock(info, pobj, obj);
+ if (rc)
+ return rc;
+
+ rc = mdt_remote_object_lock(info, obj, mdt_object_fid(obj),
+ &lh->mlh_rreg_lh, LCK_EX,
+ MDS_INODELOCK_FULL, false);
+ if (rc != ELDLM_OK)
+ return rc;
+
+ /*
+ * if obj is remote and striped, lock its stripes explicitly
+ * because it's not striped in LOD layer on this MDT.
+ */
+ if (S_ISDIR(lu_object_attr(&obj->mot_obj))) {
+ struct md_attr *ma = &info->mti_attr;
+
+ ma->ma_lmv = info->mti_big_lmm;
+ ma->ma_lmv_size = info->mti_big_lmmsize;
+ ma->ma_valid = 0;
+ rc = mdt_stripe_get(info, obj, ma, XATTR_NAME_LMV);
+ if (rc) {
+ mdt_object_unlock(info, obj, lh, rc);
+ return rc;
+ }
+
+ if (ma->ma_valid & MA_LMV) {
+ rc = mdt_lock_remote_slaves(info, obj, ma,
+ slave_locks);
+ if (rc)
+ mdt_object_unlock(info, obj, lh, rc);
+ }
+ }
+ } else {
+ if (mdt_object_remote(pobj)) {
+ rc = mdt_revoke_remote_lookup_lock(info, pobj, obj);
+ if (rc)
+ return rc;
+ }
+
+ rc = mdt_reint_striped_lock(info, obj, lh, MDS_INODELOCK_FULL,
+ einfo, true);
+ }
+
+ return rc;
+}
+
+/*
+ * lookup source by name, if parent is striped directory, we need to find the
+ * corresponding stripe where source is located, and then lookup there.
+ *
+ * besides, if parent is migrating too, and file is already in target stripe,
+ * this should be a redo of 'lfs migrate' on client side.
+ */
+static int mdt_migrate_lookup(struct mdt_thread_info *info,
+ struct mdt_object *pobj,
+ const struct md_attr *ma,
+ const struct lu_name *lname,
+ struct mdt_object **spobj,
+ struct mdt_object **sobj)
+{
+ const struct lu_env *env = info->mti_env;
+ struct lu_fid *fid = &info->mti_tmp_fid1;
+ struct mdt_object *stripe;
+ int rc;
+
+ if (ma->ma_valid & MA_LMV) {
+ /* if parent is striped, lookup on corresponding stripe */
+ struct lmv_mds_md_v1 *lmv = &ma->ma_lmv->lmv_md_v1;
+ __u32 hash_type = le32_to_cpu(lmv->lmv_hash_type);
+ __u32 stripe_count = le32_to_cpu(lmv->lmv_stripe_count);
+ bool is_migrating = le32_to_cpu(lmv->lmv_hash_type) &
+ LMV_HASH_FLAG_MIGRATION;
+
+ if (is_migrating) {
+ hash_type = le32_to_cpu(lmv->lmv_migrate_hash);
+ stripe_count -= le32_to_cpu(lmv->lmv_migrate_offset);
+ }
+
+ rc = lmv_name_to_stripe_index(hash_type, stripe_count,
+ lname->ln_name,
+ lname->ln_namelen);
+ if (rc < 0)
+ return rc;
+
+ if (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_FLAG_MIGRATION)
+ rc += le32_to_cpu(lmv->lmv_migrate_offset);
+
+ fid_le_to_cpu(fid, &lmv->lmv_stripe_fids[rc]);
+
+ stripe = mdt_object_find(env, info->mti_mdt, fid);
+ if (IS_ERR(stripe))
+ return PTR_ERR(stripe);
+
+ fid_zero(fid);
+ rc = mdo_lookup(env, mdt_object_child(stripe), lname, fid,
+ &info->mti_spec);
+ if (rc == -ENOENT && is_migrating) {
+ /*
+ * if parent is migrating, and lookup child failed on
+ * source stripe, lookup again on target stripe, if it
+ * exists, it means previous migration was interrupted,
+ * and current file was migrated already.
+ */
+ mdt_object_put(env, stripe);
+
+ hash_type = le32_to_cpu(lmv->lmv_hash_type);
+ stripe_count = le32_to_cpu(lmv->lmv_migrate_offset);
+
+ rc = lmv_name_to_stripe_index(hash_type, stripe_count,
+ lname->ln_name,
+ lname->ln_namelen);
+ if (rc < 0)
+ return rc;
+
+ fid_le_to_cpu(fid, &lmv->lmv_stripe_fids[rc]);
+
+ stripe = mdt_object_find(env, info->mti_mdt, fid);
+ if (IS_ERR(stripe))
+ return PTR_ERR(stripe);
+
+ fid_zero(fid);
+ rc = mdo_lookup(env, mdt_object_child(stripe), lname,
+ fid, &info->mti_spec);
+ mdt_object_put(env, stripe);
+ return rc ?: -EALREADY;
+ } else if (rc) {
+ mdt_object_put(env, stripe);
+ return rc;
+ }
+ } else {
+ fid_zero(fid);
+ rc = mdo_lookup(env, mdt_object_child(pobj), lname, fid,
+ &info->mti_spec);
+ if (rc)
+ return rc;
+
+ stripe = pobj;
+ mdt_object_get(env, stripe);
+ }
+
+ *spobj = stripe;
+
+ *sobj = mdt_object_find(env, info->mti_mdt, fid);
+ if (IS_ERR(*sobj)) {
+ mdt_object_put(env, stripe);
+ rc = PTR_ERR(*sobj);
+ *spobj = NULL;
+ *sobj = NULL;
+ }
+
+ return rc;
+}
+
+/* end lease and close file for regular file */
+static int mdd_migrate_close(struct mdt_thread_info *info,
+ struct mdt_object *obj)
+{
+ struct close_data *data;
+ struct mdt_body *repbody;
+ struct ldlm_lock *lease;
+ int rc;
+ int rc2;
+
+ rc = -EPROTO;
+ if (!req_capsule_field_present(info->mti_pill, &RMF_MDT_EPOCH,
+ RCL_CLIENT) ||
+ !req_capsule_field_present(info->mti_pill, &RMF_CLOSE_DATA,
+ RCL_CLIENT))
+ goto close;
+
+ data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+ if (!data)
+ goto close;
+
+ rc = -ESTALE;
+ lease = ldlm_handle2lock(&data->cd_handle);
+ if (!lease)
+ goto close;
+
+ /* check if the lease was already canceled */
+ lock_res_and_lock(lease);
+ rc = ldlm_is_cancel(lease);
+ unlock_res_and_lock(lease);
+
+ if (rc) {
+ rc = -EAGAIN;
+ LDLM_DEBUG(lease, DFID" lease broken",
+ PFID(mdt_object_fid(obj)));
+ }
+