Whamcloud - gitweb
LU-14459 mdt: support fixed directory layout
[fs/lustre-release.git] / lustre / mdt / mdt_xattr.c
index 29271e9..077363d 100644 (file)
@@ -27,7 +27,6 @@
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lustre/mdt/mdt_xattr.c
  *
 #include <obd_class.h>
 #include <lustre_nodemap.h>
 #include <lustre_acl.h>
+#include <lustre_lmv.h>
 #include "mdt_internal.h"
 
 
 /* return EADATA length to the caller. negative value means error */
-static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
+static int mdt_getxattr_pack_reply(struct mdt_thread_info *info)
 {
        struct req_capsule *pill = info->mti_pill;
        struct ptlrpc_request *req = mdt_info_req(info);
@@ -64,7 +64,7 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
        valid = info->mti_body->mbo_valid & (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS);
 
        /* Determine how many bytes we need */
-        if (valid == OBD_MD_FLXATTR) {
+       if (valid == OBD_MD_FLXATTR) {
                xattr_name = req_capsule_client_get(pill, &RMF_NAME);
                if (!xattr_name)
                        RETURN(-EFAULT);
@@ -78,7 +78,8 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
                                    &LU_BUF_NULL, xattr_name);
                if (size == -ENODATA) {
                        /* XXX: Some client code will not handle -ENODATA
-                        * for XATTR_NAME_LOV (trusted.lov) properly. */
+                        * for XATTR_NAME_LOV (trusted.lov) properly.
+                        */
                        if (strcmp(xattr_name, XATTR_NAME_LOV) == 0)
                                rc = 0;
                        else
@@ -95,8 +96,16 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
                xattr_name = "all";
                /* N.B. eadatasize = 0 is not valid for FLXATTRALL */
                /* We could calculate accurate sizes, but this would
-                * introduce a lot of overhead, let's do it later... */
+                * introduce a lot of overhead, let's do it later...
+                */
                size = info->mti_body->mbo_eadatasize;
+               if (size <= 0 || size > info->mti_mdt->mdt_max_ea_size ||
+                   size & (sizeof(__u32) - 1)) {
+                       DEBUG_REQ(D_ERROR, req,
+                                 "%s: invalid EA size(%d) for FLXATTRALL\n",
+                                 mdt_obd_name(info->mti_mdt), size);
+                       RETURN(-EINVAL);
+               }
                req_capsule_set_size(pill, &RMF_EAVALS, RCL_SERVER, size);
                req_capsule_set_size(pill, &RMF_EAVALS_LENS, RCL_SERVER, size);
        } else {
@@ -228,38 +237,43 @@ out_shrink:
 int mdt_getxattr(struct mdt_thread_info *info)
 {
        struct ptlrpc_request  *req = mdt_info_req(info);
-        struct mdt_body        *reqbody;
-        struct mdt_body        *repbody = NULL;
-        struct md_object       *next;
-        struct lu_buf          *buf;
-        int                     easize, rc;
+       struct mdt_body        *reqbody;
+       struct mdt_body        *repbody = NULL;
+       struct md_object       *next;
+       struct lu_buf          *buf;
+       int                     easize, rc;
        u64                     valid;
-        ENTRY;
+       ktime_t                 kstart = ktime_get();
+       ENTRY;
 
-        LASSERT(info->mti_object != NULL);
+       LASSERT(info->mti_object != NULL);
        LASSERT(lu_object_assert_exists(&info->mti_object->mot_obj));
 
        CDEBUG(D_INODE, "getxattr "DFID"\n", PFID(&info->mti_body->mbo_fid1));
 
-        reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
-        if (reqbody == NULL)
-                RETURN(err_serious(-EFAULT));
+       rc = req_check_sepol(info->mti_pill);
+       if (rc)
+               RETURN(err_serious(rc));
+
+       reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
+       if (reqbody == NULL)
+               RETURN(err_serious(-EFAULT));
 
        rc = mdt_init_ucred(info, reqbody);
-        if (rc)
-                RETURN(err_serious(rc));
+       if (rc)
+               RETURN(err_serious(rc));
 
-        next = mdt_object_child(info->mti_object);
-        easize = mdt_getxattr_pack_reply(info);
+       next = mdt_object_child(info->mti_object);
+       easize = mdt_getxattr_pack_reply(info);
        if (easize == -ENODATA)
                GOTO(out, rc = easize);
        else if (easize < 0)
                GOTO(out, rc = err_serious(easize));
 
-        repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
-        LASSERT(repbody != NULL);
+       repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       LASSERT(repbody != NULL);
 
-        /* No need further getxattr. */
+       /* No need further getxattr. */
        if (easize == 0 || reqbody->mbo_eadatasize == 0)
                GOTO(out, rc = easize);
 
@@ -293,10 +307,12 @@ int mdt_getxattr(struct mdt_thread_info *info)
        EXIT;
 out:
        if (rc >= 0) {
-               mdt_counter_incr(req, LPROC_MDT_GETXATTR);
+               mdt_counter_incr(req, LPROC_MDT_GETXATTR,
+                                ktime_us_delta(ktime_get(), kstart));
                /* LU-11109: Set OBD_MD_FLXATTR on success so that
                 * newer clients can distinguish between nonexistent
-                * xattrs and zero length values. */
+                * xattrs and zero length values.
+                */
                repbody->mbo_valid |= OBD_MD_FLXATTR;
                repbody->mbo_eadatasize = rc;
                rc = 0;
@@ -305,8 +321,210 @@ out:
        return rc;
 }
 
+/**
+ * Update the LMV layout of a directory after migration/restripe.
+ *
+ * The request carries a struct lmv_user_md (little-endian) in
+ * info->mti_rr.rr_eadata describing the layout the directory is expected
+ * to end up with.  The on-disk LMV must be sane and flagged as "layout
+ * changing"; the requested stripe count/offset/hash are cross-checked
+ * against it.  Depending on the state, the directory is either shrunk via
+ * mo_layout_change(MD_LAYOUT_SHRINK), or the layout-change flags are
+ * cleared and the LMV xattr is rewritten with a bumped layout version.
+ *
+ * \param[in] info     thread info; mti_rr.rr_fid1 is the directory FID and
+ *                     mti_rr.rr_eadata/rr_eadatalen the lmv_user_md buffer
+ *
+ * \retval 0           on success
+ * \retval -EPERM      dir migration disabled or caller not permitted
+ * \retval -EINVAL     EA buffer too short, or request does not match the
+ *                     on-disk LMV
+ * \retval -EALREADY   directory has no LMV or its layout is not changing
+ * \retval -EBADF      on-disk LMV is not sane
+ * \retval negative    other errno returned by the called helpers
+ */
+int mdt_dir_layout_update(struct mdt_thread_info *info)
+{
+       const struct lu_env *env = info->mti_env;
+       struct mdt_device *mdt = info->mti_mdt;
+       struct lu_ucred *uc = mdt_ucred(info);
+       struct mdt_reint_record *rr = &info->mti_rr;
+       struct lmv_user_md *lmu = rr->rr_eadata;
+       __u32 lum_stripe_count;
+       struct md_layout_change *mlc = &info->mti_mlc;
+       struct lmv_mds_md_v1 *lmv;
+       struct md_attr *ma = &info->mti_attr;
+       struct ldlm_enqueue_info *einfo = &info->mti_einfo[0];
+       struct mdt_object *pobj = NULL;
+       struct mdt_object *obj;
+       struct mdt_lock_handle *lhp = NULL;
+       struct mdt_lock_handle *lhc;
+       bool shrink = false;
+       int rc;
+
+       ENTRY;
+
+       if (!mdt->mdt_enable_dir_migration)
+               RETURN(-EPERM);
+
+       if (!md_capable(uc, CAP_SYS_ADMIN) &&
+           uc->uc_gid != mdt->mdt_enable_remote_dir_gid &&
+           mdt->mdt_enable_remote_dir_gid != -1)
+               RETURN(-EPERM);
+
+       /*
+        * The EA buffer comes from the client: make sure it is present and
+        * large enough before any lmv_user_md field is read.
+        */
+       if (!lmu || rr->rr_eadatalen < (int)sizeof(*lmu))
+               RETURN(-EINVAL);
+
+       obj = mdt_object_find(env, mdt, rr->rr_fid1);
+       if (IS_ERR(obj))
+               RETURN(PTR_ERR(obj));
+
+       /* get parent from PFID */
+       rc = mdt_attr_get_pfid(info, obj, &ma->ma_pfid);
+       if (rc)
+               GOTO(put_obj, rc);
+
+       pobj = mdt_object_find(env, mdt, &ma->ma_pfid);
+       if (IS_ERR(pobj))
+               GOTO(put_obj, rc = PTR_ERR(pobj));
+
+       /* revoke object remote LOOKUP lock */
+       if (mdt_object_remote(pobj)) {
+               rc = mdt_revoke_remote_lookup_lock(info, pobj, obj);
+               if (rc)
+                       GOTO(put_pobj, rc);
+       }
+
+       /*
+        * lock parent if dir will be shrunk to 1 stripe, because dir will be
+        * converted to normal directory, as will change dir FID and update
+        * namespace of parent.
+        */
+       lhp = &info->mti_lh[MDT_LH_PARENT];
+       mdt_lock_reg_init(lhp, LCK_PW);
+
+       if (le32_to_cpu(lmu->lum_stripe_count) < 2) {
+               rc = mdt_reint_object_lock(info, pobj, lhp,
+                                          MDS_INODELOCK_UPDATE, true);
+               if (rc)
+                       GOTO(put_pobj, rc);
+       }
+
+       /* lock object */
+       lhc = &info->mti_lh[MDT_LH_CHILD];
+       mdt_lock_reg_init(lhc, LCK_EX);
+       rc = mdt_reint_striped_lock(info, obj, lhc, MDS_INODELOCK_FULL, einfo,
+                                   true);
+       if (rc)
+               GOTO(unlock_pobj, rc);
+
+       rc = mdt_stripe_get(info, obj, ma, XATTR_NAME_LMV);
+       if (rc)
+               GOTO(unlock_obj, rc);
+
+       /* user may run 'lfs migrate' multiple times, so it's shrunk already */
+       if (!(ma->ma_valid & MA_LMV))
+               GOTO(unlock_obj, rc = -EALREADY);
+
+       lmv = &ma->ma_lmv->lmv_md_v1;
+       if (!lmv_is_sane(lmv))
+               GOTO(unlock_obj, rc = -EBADF);
+
+       /* ditto */
+       if (!lmv_is_layout_changing(lmv))
+               GOTO(unlock_obj, rc = -EALREADY);
+
+       /*
+        * Both lmu and lmv fields are little-endian, so they are compared
+        * directly; a requested stripe count of 0 is treated as 1.
+        */
+       lum_stripe_count = lmu->lum_stripe_count;
+       if (!lum_stripe_count)
+               lum_stripe_count = cpu_to_le32(1);
+
+       if (lmv_is_migrating(lmv)) {
+               if (lmv->lmv_migrate_offset != lum_stripe_count) {
+                       CERROR("%s: "DFID" migrate mdt count mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_migrate_offset),
+                               le32_to_cpu(lmu->lum_stripe_count));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_stripe_offset != lmv->lmv_master_mdt_index) {
+                       CERROR("%s: "DFID" migrate mdt index mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_master_mdt_index),
+                               le32_to_cpu(lmu->lum_stripe_offset));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lum_stripe_count > 1 && lmu->lum_hash_type &&
+                   lmu->lum_hash_type !=
+                   (lmv->lmv_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK))) {
+                       CERROR("%s: "DFID" migrate mdt hash mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_hash_type),
+                               le32_to_cpu(lmu->lum_hash_type));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               /* finishing a migration always goes through layout shrink */
+               shrink = true;
+       } else if (lmv_is_splitting(lmv)) {
+               if (lmv->lmv_stripe_count != lum_stripe_count) {
+                       CERROR("%s: "DFID" stripe count mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_stripe_count),
+                               le32_to_cpu(lmu->lum_stripe_count));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_stripe_offset != LMV_OFFSET_DEFAULT) {
+                       CERROR("%s: "DFID" dir split offset %u != -1\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmu->lum_stripe_offset));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_hash_type &&
+                   lmu->lum_hash_type !=
+                   (lmv->lmv_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK))) {
+                       CERROR("%s: "DFID" split hash mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_hash_type),
+                               le32_to_cpu(lmu->lum_hash_type));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+       } else if (lmv_is_merging(lmv)) {
+               if (lmv->lmv_merge_offset != lum_stripe_count) {
+                       CERROR("%s: "DFID" stripe count mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_merge_offset),
+                               le32_to_cpu(lmu->lum_stripe_count));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_stripe_offset != LMV_OFFSET_DEFAULT) {
+                       CERROR("%s: "DFID" dir merge offset %u != -1\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmu->lum_stripe_offset));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_hash_type &&
+                   (lmu->lum_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK)) !=
+                   (lmv->lmv_merge_hash & cpu_to_le32(LMV_HASH_TYPE_MASK))) {
+                       CERROR("%s: "DFID" merge hash mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               le32_to_cpu(lmv->lmv_merge_hash),
+                               le32_to_cpu(lmu->lum_hash_type));
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               /*
+                * NOTE(review): raw little-endian values are ordered here as
+                * in the original code; this is only a true numeric
+                * comparison on little-endian hosts - confirm intent.
+                */
+               if (lum_stripe_count < lmv->lmv_stripe_count)
+                       shrink = true;
+       }
+
+       if (shrink) {
+               mlc->mlc_opc = MD_LAYOUT_SHRINK;
+               mlc->mlc_buf.lb_buf = rr->rr_eadata;
+               mlc->mlc_buf.lb_len = rr->rr_eadatalen;
+               rc = mo_layout_change(env, mdt_object_child(obj), mlc);
+       } else {
+               struct lu_buf *buf = &info->mti_buf;
+               u32 version = le32_to_cpu(lmv->lmv_layout_version);
+
+               /*
+                * lmv_hash_type is little-endian, so the flag mask must be
+                * converted as well before it is cleared.
+                */
+               lmv->lmv_hash_type &= ~cpu_to_le32(LMV_HASH_FLAG_LAYOUT_CHANGE);
+               lmv->lmv_layout_version = cpu_to_le32(++version);
+               lmv->lmv_migrate_offset = 0;
+               lmv->lmv_migrate_hash = 0;
+               buf->lb_buf = lmv;
+               buf->lb_len = sizeof(*lmv);
+               rc = mo_xattr_set(env, mdt_object_child(obj), buf,
+                                 XATTR_NAME_LMV, LU_XATTR_REPLACE);
+       }
+       GOTO(unlock_obj, rc);
+
+unlock_obj:
+       mdt_reint_striped_unlock(info, obj, lhc, einfo, rc);
+unlock_pobj:
+       /* called even when the parent lock was not taken (stripe count >= 2) */
+       mdt_object_unlock(info, pobj, lhp, rc);
+put_pobj:
+       mdt_object_put(env, pobj);
+put_obj:
+       mdt_object_put(env, obj);
+
+       return rc;
+}
+
 int mdt_reint_setxattr(struct mdt_thread_info *info,
-                       struct mdt_lock_handle *unused)
+                      struct mdt_lock_handle *unused)
 {
        struct ptlrpc_request   *req = mdt_info_req(info);
        struct mdt_lock_handle  *lh;
@@ -321,6 +539,7 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
        const char              *xattr_name = rr->rr_name.ln_name;
        int                      xattr_len = rr->rr_eadatalen;
        __u64                    lockpart = MDS_INODELOCK_UPDATE;
+       ktime_t                  kstart = ktime_get();
        int                      rc;
        ENTRY;
 
@@ -330,12 +549,12 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
        if (info->mti_dlm_req)
                ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
 
-        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SETXATTR))
-                RETURN(err_serious(-ENOMEM));
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SETXATTR))
+               RETURN(err_serious(-ENOMEM));
 
        rc = mdt_init_ucred_reint(info);
-        if (rc != 0)
-                RETURN(rc);
+       if (rc != 0)
+               RETURN(rc);
 
        if (strncmp(xattr_name, XATTR_USER_PREFIX,
                    sizeof(XATTR_USER_PREFIX) - 1) == 0) {
@@ -344,7 +563,22 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
        } else if (strncmp(xattr_name, XATTR_TRUSTED_PREFIX,
                    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0) {
 
-               if (!md_capable(mdt_ucred(info), CFS_CAP_SYS_ADMIN))
+               /* setxattr(LMV) with lum is used to shrink dir layout */
+               if (strcmp(xattr_name, XATTR_NAME_LMV) == 0) {
+                       __u32 *magic = rr->rr_eadata;
+
+                       /* we don't let to remove LMV? */
+                       if (!rr->rr_eadata)
+                               GOTO(out, rc = 0);
+
+                       if (le32_to_cpu(*magic) == LMV_USER_MAGIC ||
+                           le32_to_cpu(*magic) == LMV_USER_MAGIC_SPECIFIC) {
+                               rc = mdt_dir_layout_update(info);
+                               GOTO(out, rc);
+                       }
+               }
+
+               if (!md_capable(mdt_ucred(info), CAP_SYS_ADMIN))
                        GOTO(out, rc = -EPERM);
 
                if (strcmp(xattr_name, XATTR_NAME_LOV) == 0 ||
@@ -367,16 +601,11 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                /* ACLs were mapped out, return an error so the user knows */
                if (rc != xattr_len)
                        GOTO(out, rc = -EPERM);
-       } else if ((strlen(xattr_name) > strlen(XATTR_LUSTRE_LOV) + 1) &&
+       } else if ((strlen(xattr_name) > sizeof(XATTR_LUSTRE_LOV)) &&
                   strncmp(xattr_name, XATTR_LUSTRE_LOV,
                           strlen(XATTR_LUSTRE_LOV)) == 0) {
 
-               if (strncmp(xattr_name, XATTR_LUSTRE_LOV".add",
-                           strlen(XATTR_LUSTRE_LOV".add")) &&
-                   strncmp(xattr_name, XATTR_LUSTRE_LOV".set",
-                           strlen(XATTR_LUSTRE_LOV".set")) &&
-                   strncmp(xattr_name, XATTR_LUSTRE_LOV".del",
-                           strlen(XATTR_LUSTRE_LOV".del"))) {
+               if (!allowed_lustre_lov(xattr_name)) {
                        CERROR("%s: invalid xattr name: %s\n",
                               mdt_obd_name(info->mti_mdt), xattr_name);
                        GOTO(out, rc = -EINVAL);
@@ -385,25 +614,28 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                lockpart |= MDS_INODELOCK_LAYOUT;
        }
 
-        /* Revoke all clients' lookup lock, since the access
-         * permissions for this inode is changed when ACL_ACCESS is
-         * set. This isn't needed for ACL_DEFAULT, since that does
-         * not change the access permissions of this inode, nor any
-         * other existing inodes. It is setting the ACLs inherited
-         * by new directories/files at create time. */
+       /* Revoke all clients' lookup lock, since the access
+        * permissions for this inode is changed when ACL_ACCESS is
+        * set. This isn't needed for ACL_DEFAULT, since that does
+        * not change the access permissions of this inode, nor any
+        * other existing inodes. It is setting the ACLs inherited
+        * by new directories/files at create time.
+        */
        /* We need revoke both LOOKUP|PERM lock here, see mdt_attr_set. */
-        if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
+       if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
                lockpart |= MDS_INODELOCK_PERM | MDS_INODELOCK_LOOKUP;
        /* We need to take the lock on behalf of old clients so that newer
-        * clients flush their xattr caches */
+        * clients flush their xattr caches
+        */
        else
                lockpart |= MDS_INODELOCK_XATTR;
 
-        lh = &info->mti_lh[MDT_LH_PARENT];
-        /* ACLs were sent to clients under LCK_CR locks, so taking LCK_EX
-         * to cancel them. */
-        mdt_lock_reg_init(lh, LCK_EX);
-        obj = mdt_object_find_lock(info, rr->rr_fid1, lh, lockpart);
+       lh = &info->mti_lh[MDT_LH_PARENT];
+       /* ACLs were sent to clients under LCK_CR locks, so taking LCK_EX
+        * to cancel them.
+        */
+       mdt_lock_reg_init(lh, LCK_EX);
+       obj = mdt_object_find_lock(info, rr->rr_fid1, lh, lockpart);
        if (IS_ERR(obj))
                GOTO(out, rc = PTR_ERR(obj));
 
@@ -414,9 +646,9 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
 
        if (unlikely(!(valid & OBD_MD_FLCTIME))) {
                /* This isn't strictly an error, but all current clients
-                * should set OBD_MD_FLCTIME when setting attributes. */
-               CWARN("%s: client miss to set OBD_MD_FLCTIME when "
-                     "setxattr %s: [object "DFID"] [valid %llu]\n",
+                * should set OBD_MD_FLCTIME when setting attributes.
+                */
+               CWARN("%s: client miss to set OBD_MD_FLCTIME when setxattr %s: [object "DFID"] [valid %llu]\n",
                      mdt_obd_name(info->mti_mdt), xattr_name,
                      PFID(rr->rr_fid1), valid);
                attr->la_ctime = ktime_get_real_seconds();
@@ -443,24 +675,25 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                        ma->ma_attr_flags |= MDS_PERM_BYPASS;
                        mo_attr_set(env, child, ma);
                }
-        } else if (valid & OBD_MD_FLXATTRRM) {
-                rc = mo_xattr_del(env, child, xattr_name);
-                /* update ctime after xattr changed */
-                if (rc == 0) {
-                        ma->ma_attr_flags |= MDS_PERM_BYPASS;
-                        mo_attr_set(env, child, ma);
-                }
+       } else if (valid & OBD_MD_FLXATTRRM) {
+               rc = mo_xattr_del(env, child, xattr_name);
+               /* update ctime after xattr changed */
+               if (rc == 0) {
+                       ma->ma_attr_flags |= MDS_PERM_BYPASS;
+                       mo_attr_set(env, child, ma);
+               }
        } else {
                CDEBUG(D_INFO, "valid bits: %#llx\n", valid);
                rc = -EINVAL;
        }
 
        if (rc == 0)
-               mdt_counter_incr(req, LPROC_MDT_SETXATTR);
+               mdt_counter_incr(req, LPROC_MDT_SETXATTR,
+                                ktime_us_delta(ktime_get(), kstart));
 
-        EXIT;
+       EXIT;
 out_unlock:
-        mdt_object_unlock_put(info, obj, lh, rc);
+       mdt_object_unlock_put(info, obj, lh, rc);
 out:
        mdt_exit_ucred(info);
        return rc;