LU-14459 mdt: support fixed directory layout

[fs/lustre-release.git] / lustre / mdt / mdt_xattr.c
diff --git a/lustre/mdt/mdt_xattr.c b/lustre/mdt/mdt_xattr.c

index 483cd45..077363d 100644 (file)
--- a/lustre/mdt/mdt_xattr.c
+++ b/lustre/mdt/mdt_xattr.c
@@ -27,7 +27,6 @@
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
   *
   * lustre/mdt/mdt_xattr.c
   *
@@ -45,11 +44,12 @@
  #include <obd_class.h>
  #include <lustre_nodemap.h>
  #include <lustre_acl.h>
+#include <lustre_lmv.h>
  #include "mdt_internal.h"
  
  
  /* return EADATA length to the caller. negative value means error */
-static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
+static int mdt_getxattr_pack_reply(struct mdt_thread_info *info)
  {
         struct req_capsule *pill = info->mti_pill;
         struct ptlrpc_request *req = mdt_info_req(info);
@@ -64,7 +64,7 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
         valid = info->mti_body->mbo_valid & (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS);
  
         /* Determine how many bytes we need */
-        if (valid == OBD_MD_FLXATTR) {
+       if (valid == OBD_MD_FLXATTR) {
                 xattr_name = req_capsule_client_get(pill, &RMF_NAME);
                 if (!xattr_name)
                         RETURN(-EFAULT);
@@ -78,7 +78,8 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
                                     &LU_BUF_NULL, xattr_name);
                 if (size == -ENODATA) {
                         /* XXX: Some client code will not handle -ENODATA
-                        * for XATTR_NAME_LOV (trusted.lov) properly. */
+                        * for XATTR_NAME_LOV (trusted.lov) properly.
+                        */
                         if (strcmp(xattr_name, XATTR_NAME_LOV) == 0)
                                 rc = 0;
                         else
@@ -95,7 +96,8 @@ static int mdt_getxattr_pack_reply(struct mdt_thread_info * info)
                 xattr_name = "all";
                 /* N.B. eadatasize = 0 is not valid for FLXATTRALL */
                 /* We could calculate accurate sizes, but this would
-                * introduce a lot of overhead, let's do it later... */
+                * introduce a lot of overhead, let's do it later...
+                */
                 size = info->mti_body->mbo_eadatasize;
                 if (size <= 0 || size > info->mti_mdt->mdt_max_ea_size ||
                     size & (sizeof(__u32) - 1)) {
@@ -235,15 +237,16 @@ out_shrink:
  int mdt_getxattr(struct mdt_thread_info *info)
  {
         struct ptlrpc_request  *req = mdt_info_req(info);
-        struct mdt_body        *reqbody;
-        struct mdt_body        *repbody = NULL;
-        struct md_object       *next;
-        struct lu_buf          *buf;
-        int                     easize, rc;
+       struct mdt_body        *reqbody;
+       struct mdt_body        *repbody = NULL;
+       struct md_object       *next;
+       struct lu_buf          *buf;
+       int                     easize, rc;
         u64                     valid;
-        ENTRY;
+       ktime_t                 kstart = ktime_get();
+       ENTRY;
  
-        LASSERT(info->mti_object != NULL);
+       LASSERT(info->mti_object != NULL);
         LASSERT(lu_object_assert_exists(&info->mti_object->mot_obj));
  
         CDEBUG(D_INODE, "getxattr "DFID"\n", PFID(&info->mti_body->mbo_fid1));
@@ -252,25 +255,25 @@ int mdt_getxattr(struct mdt_thread_info *info)
         if (rc)
                 RETURN(err_serious(rc));
  
-        reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
-        if (reqbody == NULL)
-                RETURN(err_serious(-EFAULT));
+       reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
+       if (reqbody == NULL)
+               RETURN(err_serious(-EFAULT));
  
         rc = mdt_init_ucred(info, reqbody);
-        if (rc)
-                RETURN(err_serious(rc));
+       if (rc)
+               RETURN(err_serious(rc));
  
-        next = mdt_object_child(info->mti_object);
-        easize = mdt_getxattr_pack_reply(info);
+       next = mdt_object_child(info->mti_object);
+       easize = mdt_getxattr_pack_reply(info);
         if (easize == -ENODATA)
                 GOTO(out, rc = easize);
         else if (easize < 0)
                 GOTO(out, rc = err_serious(easize));
  
-        repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
-        LASSERT(repbody != NULL);
+       repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       LASSERT(repbody != NULL);
  
-        /* No need further getxattr. */
+       /* No further getxattr is needed. */
         if (easize == 0 || reqbody->mbo_eadatasize == 0)
                 GOTO(out, rc = easize);
  
@@ -304,10 +307,12 @@ int mdt_getxattr(struct mdt_thread_info *info)
         EXIT;
  out:
         if (rc >= 0) {
-               mdt_counter_incr(req, LPROC_MDT_GETXATTR);
+               mdt_counter_incr(req, LPROC_MDT_GETXATTR,
+                                ktime_us_delta(ktime_get(), kstart));
                 /* LU-11109: Set OBD_MD_FLXATTR on success so that
                  * newer clients can distinguish between nonexistent
-                * xattrs and zero length values. */
+                * xattrs and zero length values.
+                */
                 repbody->mbo_valid |= OBD_MD_FLXATTR;
                 repbody->mbo_eadatasize = rc;
                 rc = 0;
@@ -316,8 +321,8 @@ out:
         return rc;
  }
  
-/* shrink dir layout after migration */
-static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
+/* update dir layout after migration/restripe */
+int mdt_dir_layout_update(struct mdt_thread_info *info)
  {
         const struct lu_env *env = info->mti_env;
         struct mdt_device *mdt = info->mti_mdt;
@@ -325,7 +330,7 @@ static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
         struct mdt_reint_record *rr = &info->mti_rr;
         struct lmv_user_md *lmu = rr->rr_eadata;
         __u32 lum_stripe_count = lmu->lum_stripe_count;
-       struct lu_buf *buf = &info->mti_buf;
+       struct md_layout_change *mlc = &info->mti_mlc;
         struct lmv_mds_md_v1 *lmv;
         struct md_attr *ma = &info->mti_attr;
         struct ldlm_enqueue_info *einfo = &info->mti_einfo[0];
@@ -333,6 +338,7 @@ static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
         struct mdt_object *obj;
         struct mdt_lock_handle *lhp = NULL;
         struct mdt_lock_handle *lhc;
+       bool shrink = false;
         int rc;
  
         ENTRY;
@@ -340,19 +346,11 @@ static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
         if (!mdt->mdt_enable_dir_migration)
                 RETURN(-EPERM);
  
-       if (!md_capable(uc, CFS_CAP_SYS_ADMIN) &&
+       if (!md_capable(uc, CAP_SYS_ADMIN) &&
             uc->uc_gid != mdt->mdt_enable_remote_dir_gid &&
             mdt->mdt_enable_remote_dir_gid != -1)
                 RETURN(-EPERM);
  
-       /* mti_big_lmm is used to save LMV, but it may be uninitialized. */
-       if (unlikely(!info->mti_big_lmm)) {
-               info->mti_big_lmmsize = lmv_mds_md_size(64, LMV_MAGIC);
-               OBD_ALLOC(info->mti_big_lmm, info->mti_big_lmmsize);
-               if (!info->mti_big_lmm)
-                       RETURN(-ENOMEM);
-       }
-
         obj = mdt_object_find(env, mdt, rr->rr_fid1);
         if (IS_ERR(obj))
                 RETURN(PTR_ERR(obj));
@@ -375,7 +373,7 @@ static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
  
         /*
          * lock parent if dir will be shrunk to 1 stripe, because dir will be
-        * converted to normal directory, as will change dir fid and update
+        * converted to normal directory, as will change dir FID and update
          * namespace of parent.
          */
         lhp = &info->mti_lh[MDT_LH_PARENT];
@@ -396,9 +394,6 @@ static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
         if (rc)
                 GOTO(unlock_pobj, rc);
  
-       ma->ma_lmv = info->mti_big_lmm;
-       ma->ma_lmv_size = info->mti_big_lmmsize;
-       ma->ma_valid = 0;
         rc = mdt_stripe_get(info, obj, ma, XATTR_NAME_LMV);
         if (rc)
                 GOTO(unlock_obj, rc);
@@ -408,41 +403,112 @@ static int mdt_dir_layout_shrink(struct mdt_thread_info *info)
                 GOTO(unlock_obj, rc = -EALREADY);
  
         lmv = &ma->ma_lmv->lmv_md_v1;
+       if (!lmv_is_sane(lmv))
+               GOTO(unlock_obj, rc = -EBADF);
  
         /* ditto */
-       if (!(le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_FLAG_MIGRATION))
+       if (!lmv_is_layout_changing(lmv))
                 GOTO(unlock_obj, rc = -EALREADY);
  
         lum_stripe_count = lmu->lum_stripe_count;
         if (!lum_stripe_count)
                 lum_stripe_count = cpu_to_le32(1);
  
-       if (lmv->lmv_migrate_offset != lum_stripe_count) {
-               CERROR("%s: "DFID" migrate mdt count mismatch %u != %u\n",
-                       mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
-                       lmv->lmv_migrate_offset, lmu->lum_stripe_count);
-               GOTO(unlock_obj, rc = -EINVAL);
-       }
+       if (lmv_is_migrating(lmv)) {
+               if (lmv->lmv_migrate_offset != lum_stripe_count) {
+                       CERROR("%s: "DFID" migrate mdt count mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_migrate_offset, lmu->lum_stripe_count);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
  
-       if (lmv->lmv_master_mdt_index != lmu->lum_stripe_offset) {
-               CERROR("%s: "DFID" migrate mdt index mismatch %u != %u\n",
-                       mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
-                       lmv->lmv_master_mdt_index, lmu->lum_stripe_offset);
-               GOTO(unlock_obj, rc = -EINVAL);
-       }
+               if (lmu->lum_stripe_offset != lmv->lmv_master_mdt_index) {
+                       CERROR("%s: "DFID" migrate mdt index mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_master_mdt_index,
+                               lmu->lum_stripe_offset);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lum_stripe_count > 1 && lmu->lum_hash_type &&
+                   lmu->lum_hash_type !=
+                   (lmv->lmv_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK))) {
+                       CERROR("%s: "DFID" migrate mdt hash mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_hash_type, lmu->lum_hash_type);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               shrink = true;
+       } else if (lmv_is_splitting(lmv)) {
+               if (lmv->lmv_stripe_count != lum_stripe_count) {
+                       CERROR("%s: "DFID" stripe count mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_stripe_count, lmu->lum_stripe_count);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_stripe_offset != LMV_OFFSET_DEFAULT) {
+                       CERROR("%s: "DFID" dir split offset %u != -1\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmu->lum_stripe_offset);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_hash_type &&
+                   lmu->lum_hash_type !=
+                   (lmv->lmv_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK))) {
+                       CERROR("%s: "DFID" split hash mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_hash_type, lmu->lum_hash_type);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+       } else if (lmv_is_merging(lmv)) {
+               if (lmv->lmv_merge_offset != lum_stripe_count) {
+                       CERROR("%s: "DFID" stripe count mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_merge_offset, lmu->lum_stripe_count);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
  
-       if (lum_stripe_count > 1 &&
-           (lmv->lmv_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK)) !=
-           lmu->lum_hash_type) {
-               CERROR("%s: "DFID" migrate mdt hash mismatch %u != %u\n",
-                       mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
-                       lmv->lmv_hash_type, lmu->lum_hash_type);
-               GOTO(unlock_obj, rc = -EINVAL);
+               if (lmu->lum_stripe_offset != LMV_OFFSET_DEFAULT) {
+                       CERROR("%s: "DFID" dir merge offset %u != -1\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmu->lum_stripe_offset);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lmu->lum_hash_type &&
+                   (lmu->lum_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK)) !=
+                   (lmv->lmv_merge_hash & cpu_to_le32(LMV_HASH_TYPE_MASK))) {
+                       CERROR("%s: "DFID" merge hash mismatch %u != %u\n",
+                               mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1),
+                               lmv->lmv_merge_hash, lmu->lum_hash_type);
+                       GOTO(unlock_obj, rc = -EINVAL);
+               }
+
+               if (lum_stripe_count < lmv->lmv_stripe_count)
+                       shrink = true;
         }
  
-       buf->lb_buf = rr->rr_eadata;
-       buf->lb_len = rr->rr_eadatalen;
-       rc = mo_xattr_set(env, mdt_object_child(obj), buf, XATTR_NAME_LMV, 0);
+       if (shrink) {
+               mlc->mlc_opc = MD_LAYOUT_SHRINK;
+               mlc->mlc_buf.lb_buf = rr->rr_eadata;
+               mlc->mlc_buf.lb_len = rr->rr_eadatalen;
+               rc = mo_layout_change(env, mdt_object_child(obj), mlc);
+       } else {
+               struct lu_buf *buf = &info->mti_buf;
+               u32 version = le32_to_cpu(lmv->lmv_layout_version);
+
+               lmv->lmv_hash_type &= ~LMV_HASH_FLAG_LAYOUT_CHANGE;
+               lmv->lmv_layout_version = cpu_to_le32(++version);
+               lmv->lmv_migrate_offset = 0;
+               lmv->lmv_migrate_hash = 0;
+               buf->lb_buf = lmv;
+               buf->lb_len = sizeof(*lmv);
+               rc = mo_xattr_set(env, mdt_object_child(obj), buf,
+                                 XATTR_NAME_LMV, LU_XATTR_REPLACE);
+       }
         GOTO(unlock_obj, rc);
  
  unlock_obj:
@@ -458,7 +524,7 @@ put_obj:
  }
  
  int mdt_reint_setxattr(struct mdt_thread_info *info,
-                       struct mdt_lock_handle *unused)
+                      struct mdt_lock_handle *unused)
  {
         struct ptlrpc_request   *req = mdt_info_req(info);
         struct mdt_lock_handle  *lh;
@@ -473,6 +539,7 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
         const char              *xattr_name = rr->rr_name.ln_name;
         int                      xattr_len = rr->rr_eadatalen;
         __u64                    lockpart = MDS_INODELOCK_UPDATE;
+       ktime_t                  kstart = ktime_get();
         int                      rc;
         ENTRY;
  
@@ -482,12 +549,12 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
         if (info->mti_dlm_req)
                 ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
  
-        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SETXATTR))
-                RETURN(err_serious(-ENOMEM));
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SETXATTR))
+               RETURN(err_serious(-ENOMEM));
  
         rc = mdt_init_ucred_reint(info);
-        if (rc != 0)
-                RETURN(rc);
+       if (rc != 0)
+               RETURN(rc);
  
         if (strncmp(xattr_name, XATTR_USER_PREFIX,
                     sizeof(XATTR_USER_PREFIX) - 1) == 0) {
@@ -506,12 +573,12 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
  
                         if (le32_to_cpu(*magic) == LMV_USER_MAGIC ||
                             le32_to_cpu(*magic) == LMV_USER_MAGIC_SPECIFIC) {
-                               rc = mdt_dir_layout_shrink(info);
+                               rc = mdt_dir_layout_update(info);
                                 GOTO(out, rc);
                         }
                 }
  
-               if (!md_capable(mdt_ucred(info), CFS_CAP_SYS_ADMIN))
+               if (!md_capable(mdt_ucred(info), CAP_SYS_ADMIN))
                         GOTO(out, rc = -EPERM);
  
                 if (strcmp(xattr_name, XATTR_NAME_LOV) == 0 ||
@@ -547,25 +614,28 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                 lockpart |= MDS_INODELOCK_LAYOUT;
         }
  
-        /* Revoke all clients' lookup lock, since the access
-         * permissions for this inode is changed when ACL_ACCESS is
-         * set. This isn't needed for ACL_DEFAULT, since that does
-         * not change the access permissions of this inode, nor any
-         * other existing inodes. It is setting the ACLs inherited
-         * by new directories/files at create time. */
+       /* Revoke all clients' lookup lock, since the access
+        * permissions for this inode are changed when ACL_ACCESS is
+        * set. This isn't needed for ACL_DEFAULT, since that does
+        * not change the access permissions of this inode, nor any
+        * other existing inodes. It is setting the ACLs inherited
+        * by new directories/files at create time.
+        */
         /* We need revoke both LOOKUP|PERM lock here, see mdt_attr_set. */
-        if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
+       if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
                 lockpart |= MDS_INODELOCK_PERM | MDS_INODELOCK_LOOKUP;
         /* We need to take the lock on behalf of old clients so that newer
-        * clients flush their xattr caches */
+        * clients flush their xattr caches
+        */
         else
                 lockpart |= MDS_INODELOCK_XATTR;
  
-        lh = &info->mti_lh[MDT_LH_PARENT];
-        /* ACLs were sent to clients under LCK_CR locks, so taking LCK_EX
-         * to cancel them. */
-        mdt_lock_reg_init(lh, LCK_EX);
-        obj = mdt_object_find_lock(info, rr->rr_fid1, lh, lockpart);
+       lh = &info->mti_lh[MDT_LH_PARENT];
+       /* ACLs were sent to clients under LCK_CR locks, so taking LCK_EX
+        * to cancel them.
+        */
+       mdt_lock_reg_init(lh, LCK_EX);
+       obj = mdt_object_find_lock(info, rr->rr_fid1, lh, lockpart);
         if (IS_ERR(obj))
                 GOTO(out, rc = PTR_ERR(obj));
  
@@ -576,9 +646,9 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
  
         if (unlikely(!(valid & OBD_MD_FLCTIME))) {
                 /* This isn't strictly an error, but all current clients
-                * should set OBD_MD_FLCTIME when setting attributes. */
-               CWARN("%s: client miss to set OBD_MD_FLCTIME when "
-                     "setxattr %s: [object "DFID"] [valid %llu]\n",
+                * should set OBD_MD_FLCTIME when setting attributes.
+                */
+               CWARN("%s: client miss to set OBD_MD_FLCTIME when setxattr %s: [object "DFID"] [valid %llu]\n",
                       mdt_obd_name(info->mti_mdt), xattr_name,
                       PFID(rr->rr_fid1), valid);
                 attr->la_ctime = ktime_get_real_seconds();
@@ -605,24 +675,25 @@ int mdt_reint_setxattr(struct mdt_thread_info *info,
                         ma->ma_attr_flags |= MDS_PERM_BYPASS;
                         mo_attr_set(env, child, ma);
                 }
-        } else if (valid & OBD_MD_FLXATTRRM) {
-                rc = mo_xattr_del(env, child, xattr_name);
-                /* update ctime after xattr changed */
-                if (rc == 0) {
-                        ma->ma_attr_flags |= MDS_PERM_BYPASS;
-                        mo_attr_set(env, child, ma);
-                }
+       } else if (valid & OBD_MD_FLXATTRRM) {
+               rc = mo_xattr_del(env, child, xattr_name);
+               /* update ctime after xattr changed */
+               if (rc == 0) {
+                       ma->ma_attr_flags |= MDS_PERM_BYPASS;
+                       mo_attr_set(env, child, ma);
+               }
         } else {
                 CDEBUG(D_INFO, "valid bits: %#llx\n", valid);
                 rc = -EINVAL;
         }
  
         if (rc == 0)
-               mdt_counter_incr(req, LPROC_MDT_SETXATTR);
+               mdt_counter_incr(req, LPROC_MDT_SETXATTR,
+                                ktime_us_delta(ktime_get(), kstart));
  
-        EXIT;
+       EXIT;
  out_unlock:
-        mdt_object_unlock_put(info, obj, lh, rc);
+       mdt_object_unlock_put(info, obj, lh, rc);
  out:
         mdt_exit_ucred(info);
         return rc;