Whamcloud - gitweb
LU-11642 mdt: revoke remote LOOKUP lock in dir layout shrink
[fs/lustre-release.git] / lustre / mdt / mdt_xattr.c
index c25d471..e5f70e3 100644 (file)
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
@@ -27,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2015, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 /* return EADATA length to the caller. negative value means error */
 static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
 {
-        struct req_capsule     *pill = info->mti_pill ;
-        struct ptlrpc_request  *req = mdt_info_req(info);
-        char                   *xattr_name;
-        __u64                   valid;
-        static const char       user_string[] = "user.";
-        int                     size, rc;
-        ENTRY;
+       struct req_capsule *pill = info->mti_pill;
+       struct ptlrpc_request *req = mdt_info_req(info);
+       const char *xattr_name;
+       u64 valid;
+       static const char user_string[] = "user.";
+       int size;
+       int rc = 0;
+       int rc2;
+       ENTRY;
 
        valid = info->mti_body->mbo_valid & (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS);
 
-        /* Determine how many bytes we need */
+       /* Determine how many bytes we need */
         if (valid == OBD_MD_FLXATTR) {
-                xattr_name = req_capsule_client_get(pill, &RMF_NAME);
-                if (!xattr_name)
-                        RETURN(-EFAULT);
+               xattr_name = req_capsule_client_get(pill, &RMF_NAME);
+               if (!xattr_name)
+                       RETURN(-EFAULT);
 
                if (!(exp_connect_flags(req->rq_export) & OBD_CONNECT_XATTR) &&
                    !strncmp(xattr_name, user_string, sizeof(user_string) - 1))
                        RETURN(-EOPNOTSUPP);
 
-                size = mo_xattr_get(info->mti_env,
-                                    mdt_object_child(info->mti_object),
-                                    &LU_BUF_NULL, xattr_name);
-        } else if (valid == OBD_MD_FLXATTRLS) {
-                size = mo_xattr_list(info->mti_env,
-                                     mdt_object_child(info->mti_object),
-                                     &LU_BUF_NULL);
+               size = mo_xattr_get(info->mti_env,
+                                   mdt_object_child(info->mti_object),
+                                   &LU_BUF_NULL, xattr_name);
+               if (size == -ENODATA) {
+                       /* XXX: Some client code will not handle -ENODATA
+                        * for XATTR_NAME_LOV (trusted.lov) properly. */
+                       if (strcmp(xattr_name, XATTR_NAME_LOV) == 0)
+                               rc = 0;
+                       else
+                               rc = -ENODATA;
+
+                       size = 0;
+               }
+       } else if (valid == OBD_MD_FLXATTRLS) {
+               xattr_name = "list";
+               size = mo_xattr_list(info->mti_env,
+                                    mdt_object_child(info->mti_object),
+                                    &LU_BUF_NULL);
        } else if (valid == OBD_MD_FLXATTRALL) {
+               xattr_name = "all";
                /* N.B. eadatasize = 0 is not valid for FLXATTRALL */
                /* We could calculate accurate sizes, but this would
                 * introduce a lot of overhead, let's do it later... */
@@ -90,31 +100,63 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
                req_capsule_set_size(pill, &RMF_EAVALS, RCL_SERVER, size);
                req_capsule_set_size(pill, &RMF_EAVALS_LENS, RCL_SERVER, size);
        } else {
-               CDEBUG(D_INFO, "Valid bits: "LPX64"\n",
+               CDEBUG(D_INFO, "Valid bits: %#llx\n",
                       info->mti_body->mbo_valid);
-                RETURN(-EINVAL);
-        }
-
-       if (size == -ENODATA) {
-               size = 0;
-       } else if (size < 0) {
-               if (size != -EOPNOTSUPP)
-                       CERROR("Error geting EA size: %d\n", size);
+               RETURN(-EINVAL);
+       }
+
+       if (size < 0) {
+               if (size != -EOPNOTSUPP && size != -ENOENT)
+                       CERROR("%s: error geting EA size for '%s': rc = %d\n",
+                              mdt_obd_name(info->mti_mdt), xattr_name, size);
                RETURN(size);
        }
 
-        req_capsule_set_size(pill, &RMF_EADATA, RCL_SERVER,
+       if (req_capsule_has_field(pill, &RMF_ACL, RCL_SERVER))
+               req_capsule_set_size(pill, &RMF_ACL, RCL_SERVER,
+                                    LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
+
+       req_capsule_set_size(pill, &RMF_EADATA, RCL_SERVER,
                             info->mti_body->mbo_eadatasize == 0 ? 0 : size);
-        rc = req_capsule_server_pack(pill);
-        if (rc) {
-                LASSERT(rc < 0);
-                RETURN(rc);
-        }
 
-        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETXATTR_PACK))
-                RETURN(-ENOMEM);
+       rc2 = req_capsule_server_pack(pill);
+       if (rc2 < 0)
+               RETURN(rc2);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETXATTR_PACK))
+               RETURN(-ENOMEM);
+
+       RETURN(rc < 0 ? rc : size);
+}
 
-        RETURN(size);
+/*
+ * Run an ACL xattr value through the nodemap of the requesting export.
+ *
+ * Only XATTR_NAME_ACL_ACCESS and XATTR_NAME_ACL_DEFAULT are processed; for
+ * any other name the function returns size unchanged.  An ACL value is
+ * rejected with -ERANGE when it is larger than mdt_max_ea_size, or larger
+ * than LUSTRE_POSIX_ACL_MAX_SIZE_OLD for an export without large ACL
+ * support.  Otherwise buf is handed to nodemap_map_acl() in the direction
+ * given by tree_type and that call's result is returned.
+ *
+ * Returns a negative errno on failure.
+ */
+static int mdt_nodemap_map_acl(struct mdt_thread_info *info, void *buf,
+                              size_t size, const char *name,
+                              enum nodemap_tree_type tree_type)
+{
+       struct obd_export *export = info->mti_exp;
+       struct lu_nodemap *map;
+       int result = size;
+
+       ENTRY;
+
+       /* anything that is not a POSIX ACL passes through untouched */
+       if (strcmp(name, XATTR_NAME_ACL_ACCESS) != 0 &&
+           strcmp(name, XATTR_NAME_ACL_DEFAULT) != 0)
+               goto out;
+
+       /* hard limit of this MDT */
+       if (size > info->mti_mdt->mdt_max_ea_size)
+               GOTO(out, result = -ERANGE);
+
+       /* exports without large ACL support are held to the old limit */
+       if (!exp_connect_large_acl(export) &&
+           size > LUSTRE_POSIX_ACL_MAX_SIZE_OLD)
+               GOTO(out, result = -ERANGE);
+
+       map = nodemap_get_from_exp(export);
+       if (IS_ERR(map))
+               GOTO(out, result = PTR_ERR(map));
+
+       result = nodemap_map_acl(map, buf, size, tree_type);
+       /* the reference is dropped whether or not the mapping succeeded */
+       nodemap_putref(map);
+       if (result < 0)
+               GOTO(out, result);
+
+out:
+       RETURN(result);
 }
 
 static int mdt_getxattr_all(struct mdt_thread_info *info,
@@ -155,7 +197,10 @@ static int mdt_getxattr_all(struct mdt_thread_info *info,
                rc = mo_xattr_get(env, next, buf, b);
                if (rc < 0)
                        GOTO(out_shrink, rc);
-
+               rc = mdt_nodemap_map_acl(info, buf->lb_buf, rc, b,
+                                        NODEMAP_FS_TO_CLIENT);
+               if (rc < 0)
+                       GOTO(out_shrink, rc);
                sizes[eavallens] = rc;
                eavallens++;
                eavallen += rc;
@@ -206,8 +251,10 @@ int mdt_getxattr(struct mdt_thread_info *info)
 
         next = mdt_object_child(info->mti_object);
         easize = mdt_getxattr_pack_reply(info);
-        if (easize < 0)
-                GOTO(out, rc = err_serious(easize));
+       if (easize == -ENODATA)
+               GOTO(out, rc = easize);
+       else if (easize < 0)
+               GOTO(out, rc = err_serious(easize));
 
         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
         LASSERT(repbody != NULL);
@@ -223,9 +270,14 @@ int mdt_getxattr(struct mdt_thread_info *info)
        valid = info->mti_body->mbo_valid & (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS);
 
        if (valid == OBD_MD_FLXATTR) {
-               char *xattr_name = req_capsule_client_get(info->mti_pill,
-                                                         &RMF_NAME);
+               const char *xattr_name = req_capsule_client_get(info->mti_pill,
+                                                               &RMF_NAME);
                rc = mo_xattr_get(info->mti_env, next, buf, xattr_name);
+               if (rc < 0)
+                       GOTO(out, rc);
+
+               rc = mdt_nodemap_map_acl(info, buf->lb_buf, rc, xattr_name,
+                                        NODEMAP_FS_TO_CLIENT);
        } else if (valid == OBD_MD_FLXATTRLS) {
                CDEBUG(D_INODE, "listxattr\n");
 
@@ -242,6 +294,10 @@ int mdt_getxattr(struct mdt_thread_info *info)
 out:
        if (rc >= 0) {
                mdt_counter_incr(req, LPROC_MDT_GETXATTR);
+               /* LU-11109: Set OBD_MD_FLXATTR on success so that
+                * newer clients can distinguish between nonexistent
+                * xattrs and zero length values. */
+               repbody->mbo_valid |= OBD_MD_FLXATTR;
                repbody->mbo_eadatasize = rc;
                rc = 0;
        }
@@ -249,6 +305,147 @@ out:
        return rc;
 }
 
+/*
+ * Shrink the layout of a directory after migration.
+ *
+ * The client sends the target layout as a little-endian lmv_user_md in
+ * rr_eadata.  It is checked against the LMV stored on the directory named by
+ * rr_fid1 (stripe count vs. lmv_migrate_offset, master MDT index, and hash
+ * type when more than one stripe remains) and, if everything matches, is
+ * written to XATTR_NAME_LMV.
+ *
+ * Returns 0 on success, -EPERM if directory migration is disabled or the
+ * caller is not permitted, -EINVAL if the supplied layout is missing, too
+ * short or does not match the stored LMV, -EALREADY if the directory has no
+ * LMV or is not flagged as migrating, -ENOMEM if the LMV buffer cannot be
+ * allocated, or the error returned by the lookup, lock or xattr calls.
+ */
+static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
+{
+       const struct lu_env *env = info->mti_env;
+       struct mdt_device *mdt = info->mti_mdt;
+       struct lu_ucred *uc = mdt_ucred(info);
+       struct mdt_reint_record *rr = &info->mti_rr;
+       struct lmv_user_md *lmu = rr->rr_eadata;
+       __u32 lum_stripe_count;
+       struct lu_buf *buf = &info->mti_buf;
+       struct lmv_mds_md_v1 *lmv;
+       struct md_attr *ma = &info->mti_attr;
+       struct ldlm_enqueue_info *einfo = &info->mti_einfo[0];
+       struct mdt_object *pobj = NULL;
+       struct mdt_object *obj;
+       struct mdt_lock_handle *lhp = NULL;
+       struct mdt_lock_handle *lhc;
+       int rc;
+
+       ENTRY;
+
+       if (!mdt->mdt_enable_dir_migration)
+               RETURN(-EPERM);
+
+       if (!md_capable(uc, CFS_CAP_SYS_ADMIN) &&
+           uc->uc_gid != mdt->mdt_enable_remote_dir_gid &&
+           mdt->mdt_enable_remote_dir_gid != -1)
+               RETURN(-EPERM);
+
+       /*
+        * The layout comes straight from the client request: make sure the
+        * fixed part of lmv_user_md is really there before reading it.
+        */
+       if (!lmu || rr->rr_eadatalen < (int)sizeof(*lmu))
+               RETURN(-EINVAL);
+
+       /*
+        * Keep the requested stripe count in CPU byte order so that it can
+        * be compared and printed directly; 0 is treated as 1 stripe.
+        */
+       lum_stripe_count = le32_to_cpu(lmu->lum_stripe_count);
+       if (!lum_stripe_count)
+               lum_stripe_count = 1;
+
+       /* mti_big_lmm is used to save LMV, but it may be uninitialized. */
+       if (unlikely(!info->mti_big_lmm)) {
+               info->mti_big_lmmsize = lmv_mds_md_size(64, LMV_MAGIC);
+               OBD_ALLOC(info->mti_big_lmm, info->mti_big_lmmsize);
+               if (!info->mti_big_lmm)
+                       RETURN(-ENOMEM);
+       }
+
+       obj = mdt_object_find(env, mdt, rr->rr_fid1);
+       if (IS_ERR(obj))
+               RETURN(PTR_ERR(obj));
+
+       /* get parent from PFID */
+       rc = mdt_attr_get_pfid(info, obj, &ma->ma_pfid);
+       if (rc)
+               GOTO(put_obj, rc);
+
+       pobj = mdt_object_find(env, mdt, &ma->ma_pfid);
+       if (IS_ERR(pobj))
+               GOTO(put_obj, rc = PTR_ERR(pobj));
+
+       /* revoke object remote LOOKUP lock */
+       if (mdt_object_remote(pobj)) {
+               rc = mdt_revoke_remote_lookup_lock(info, pobj, obj);
+               if (rc)
+                       GOTO(put_pobj, rc);
+       }
+
+       /*
+        * lock parent if dir will be shrunk to 1 stripe, because dir will be
+        * converted to normal directory, as will change dir fid and update
+        * namespace of parent.
+        */
+       lhp = &info->mti_lh[MDT_LH_PARENT];
+       mdt_lock_reg_init(lhp, LCK_PW);
+
+       if (lum_stripe_count < 2) {
+               rc = mdt_reint_object_lock(info, pobj, lhp,
+                                          MDS_INODELOCK_UPDATE, true);
+               if (rc)
+                       GOTO(put_pobj, rc);
+       }
+
+       /* lock object */
+       lhc = &info->mti_lh[MDT_LH_CHILD];
+       mdt_lock_reg_init(lhc, LCK_EX);
+       rc = mdt_reint_striped_lock(info, obj, lhc, MDS_INODELOCK_FULL, einfo,
+                                   true);
+       if (rc)
+               GOTO(unlock_pobj, rc);
+
+       ma->ma_lmv = info->mti_big_lmm;
+       ma->ma_lmv_size = info->mti_big_lmmsize;
+       ma->ma_valid = 0;
+       rc = mdt_stripe_get(info, obj, ma, XATTR_NAME_LMV);
+       if (rc)
+               GOTO(unlock_obj, rc);
+
+       /* user may run 'lfs migrate' multiple times, so it's shrunk already */
+       if (!(ma->ma_valid & MA_LMV))
+               GOTO(unlock_obj, rc = -EALREADY);
+
+       lmv = &ma->ma_lmv->lmv_md_v1;
+
+       /* ditto */
+       if (!(le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_FLAG_MIGRATION))
+               GOTO(unlock_obj, rc = -EALREADY);
+
+       /* all comparisons below are done on CPU byte order values */
+       if (le32_to_cpu(lmv->lmv_migrate_offset) != lum_stripe_count) {
+               CERROR("%s: "DFID" migrate mdt count mismatch %u != %u\n",
+                       mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                       le32_to_cpu(lmv->lmv_migrate_offset),
+                       lum_stripe_count);
+               GOTO(unlock_obj, rc = -EINVAL);
+       }
+
+       if (le32_to_cpu(lmv->lmv_master_mdt_index) !=
+           le32_to_cpu(lmu->lum_stripe_offset)) {
+               CERROR("%s: "DFID" migrate mdt index mismatch %u != %u\n",
+                       mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                       le32_to_cpu(lmv->lmv_master_mdt_index),
+                       le32_to_cpu(lmu->lum_stripe_offset));
+               GOTO(unlock_obj, rc = -EINVAL);
+       }
+
+       /* the hash type only matters while the dir stays striped */
+       if (lum_stripe_count > 1 &&
+           (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) !=
+           le32_to_cpu(lmu->lum_hash_type)) {
+               CERROR("%s: "DFID" migrate mdt hash mismatch %u != %u\n",
+                       mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                       le32_to_cpu(lmv->lmv_hash_type),
+                       le32_to_cpu(lmu->lum_hash_type));
+               GOTO(unlock_obj, rc = -EINVAL);
+       }
+
+       /* store the client layout as sent, still in wire byte order */
+       buf->lb_buf = rr->rr_eadata;
+       buf->lb_len = rr->rr_eadatalen;
+       rc = mo_xattr_set(env, mdt_object_child(obj), buf, XATTR_NAME_LMV, 0);
+       GOTO(unlock_obj, rc);
+
+unlock_obj:
+       mdt_reint_striped_unlock(info, obj, lhc, einfo, rc);
+unlock_pobj:
+       /*
+        * NOTE(review): when more than one stripe remains the parent was
+        * never locked and lhp is only initialized here; this relies on
+        * mdt_object_unlock() tolerating an unused handle - confirm.
+        */
+       mdt_object_unlock(info, pobj, lhp, rc);
+put_pobj:
+       mdt_object_put(env, pobj);
+put_obj:
+       mdt_object_put(env, obj);
+
+       return rc;
+}
+
 int mdt_reint_setxattr(struct mdt_thread_info *info,
                        struct mdt_lock_handle *unused)
 {
@@ -261,11 +458,10 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
        struct lu_attr          *attr = &info->mti_attr.ma_attr;
        struct mdt_object       *obj;
        struct md_object        *child;
-       struct obd_export       *exp = info->mti_exp;
        __u64                    valid = attr->la_valid;
        const char              *xattr_name = rr->rr_name.ln_name;
        int                      xattr_len = rr->rr_eadatalen;
-       __u64                    lockpart;
+       __u64                    lockpart = MDS_INODELOCK_UPDATE;
        int                      rc;
        ENTRY;
 
@@ -289,6 +485,21 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
        } else if (strncmp(xattr_name, XATTR_TRUSTED_PREFIX,
                    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0) {
 
+               /* setxattr(LMV) with lum is used to shrink dir layout */
+               if (strcmp(xattr_name, XATTR_NAME_LMV) == 0) {
+                       __u32 *magic = rr->rr_eadata;
+
+                       /* we don't let to remove LMV? */
+                       if (!rr->rr_eadata)
+                               GOTO(out, rc = 0);
+
+                       if (le32_to_cpu(*magic) == LMV_USER_MAGIC ||
+                           le32_to_cpu(*magic) == LMV_USER_MAGIC_SPECIFIC) {
+                               rc = mdt_dir_layout_shrink(info);
+                               GOTO(out, rc);
+                       }
+               }
+
                if (!md_capable(mdt_ucred(info), CFS_CAP_SYS_ADMIN))
                        GOTO(out, rc = -EPERM);
 
@@ -305,28 +516,31 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
        } else if ((valid & OBD_MD_FLXATTR) &&
                   (strcmp(xattr_name, XATTR_NAME_ACL_ACCESS) == 0 ||
                    strcmp(xattr_name, XATTR_NAME_ACL_DEFAULT) == 0)) {
-               struct lu_nodemap *nodemap;
-
-               /* currently lustre limit acl access size */
-               if (xattr_len > LUSTRE_POSIX_ACL_MAX_SIZE)
-                       GOTO(out, rc = -ERANGE);
-
-               nodemap = nodemap_get_from_exp(exp);
-               if (IS_ERR(nodemap))
-                       GOTO(out, rc = PTR_ERR(nodemap));
-
-               rc = nodemap_map_acl(nodemap, rr->rr_eadata, xattr_len,
-                                    NODEMAP_CLIENT_TO_FS);
-               nodemap_putref(nodemap);
+               rc = mdt_nodemap_map_acl(info, rr->rr_eadata, xattr_len,
+                                        xattr_name, NODEMAP_CLIENT_TO_FS);
                if (rc < 0)
                        GOTO(out, rc);
-
                /* ACLs were mapped out, return an error so the user knows */
                if (rc != xattr_len)
                        GOTO(out, rc = -EPERM);
+       } else if ((strlen(xattr_name) > strlen(XATTR_LUSTRE_LOV) + 1) &&
+                  strncmp(xattr_name, XATTR_LUSTRE_LOV,
+                          strlen(XATTR_LUSTRE_LOV)) == 0) {
+
+               if (strncmp(xattr_name, XATTR_LUSTRE_LOV".add",
+                           strlen(XATTR_LUSTRE_LOV".add")) &&
+                   strncmp(xattr_name, XATTR_LUSTRE_LOV".set",
+                           strlen(XATTR_LUSTRE_LOV".set")) &&
+                   strncmp(xattr_name, XATTR_LUSTRE_LOV".del",
+                           strlen(XATTR_LUSTRE_LOV".del"))) {
+                       CERROR("%s: invalid xattr name: %s\n",
+                              mdt_obd_name(info->mti_mdt), xattr_name);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               lockpart |= MDS_INODELOCK_LAYOUT;
        }
 
-        lockpart = MDS_INODELOCK_UPDATE;
         /* Revoke all clients' lookup lock, since the access
          * permissions for this inode is changed when ACL_ACCESS is
          * set. This isn't needed for ACL_DEFAULT, since that does
@@ -358,10 +572,10 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                /* This isn't strictly an error, but all current clients
                 * should set OBD_MD_FLCTIME when setting attributes. */
                CWARN("%s: client miss to set OBD_MD_FLCTIME when "
-                     "setxattr %s: [object "DFID"] [valid "LPU64"]\n",
+                     "setxattr %s: [object "DFID"] [valid %llu]\n",
                      mdt_obd_name(info->mti_mdt), xattr_name,
                      PFID(rr->rr_fid1), valid);
-               attr->la_ctime = cfs_time_current_sec();
+               attr->la_ctime = ktime_get_real_seconds();
        }
        attr->la_valid = LA_CTIME;
        child = mdt_object_child(obj);
@@ -392,11 +606,12 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                         ma->ma_attr_flags |= MDS_PERM_BYPASS;
                         mo_attr_set(env, child, ma);
                 }
-        } else {
-                CDEBUG(D_INFO, "valid bits: "LPX64"\n", valid);
-                rc = -EINVAL;
-        }
-        if (rc == 0)
+       } else {
+               CDEBUG(D_INFO, "valid bits: %#llx\n", valid);
+               rc = -EINVAL;
+       }
+
+       if (rc == 0)
                mdt_counter_incr(req, LPROC_MDT_SETXATTR);
 
         EXIT;