Whamcloud - gitweb
LU-14139 ptlrpc: grow reply buffer properly for batch request
[fs/lustre-release.git] / lustre / ptlrpc / nodemap_storage.c
index 8586745..73a6f31 100644 (file)
@@ -22,7 +22,7 @@
 /*
  * Copyright (C) 2015, Trustees of Indiana University
  *
- * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2017, Intel Corporation.
  *
  * Author: Joshua Walgenbach <jjw@iu.edu>
  * Author: Kit Westneat <cwestnea@iu.edu>
@@ -50,8 +50,8 @@
 #include <linux/mutex.h>
 #include <linux/string.h>
 #include <linux/types.h>
-#include <lnet/types.h>
-#include <lustre/lustre_idl.h>
+#include <uapi/linux/lnet/lnet-types.h>
+#include <uapi/linux/lustre/lustre_idl.h>
 #include <dt_object.h>
 #include <lu_object.h>
 #include <lustre_net.h>
@@ -67,35 +67,52 @@ static DEFINE_MUTEX(ncf_list_lock);
 /* MGS index is different than others, others are listeners to MGS idx */
 static struct nm_config_file *nodemap_mgs_ncf;
 
-/* lu_nodemap flags */
-enum nm_flag_shifts {
-       NM_FL_ALLOW_ROOT_ACCESS = 0x1,
-       NM_FL_TRUST_CLIENT_IDS = 0x2,
-       NM_FL_DENY_UNKNOWN = 0x4,
-};
-
-static void nodemap_cluster_key_init(struct nodemap_key *nk, unsigned int nm_id)
+static void nodemap_cluster_key_init(struct nodemap_key *nk, unsigned int nm_id,
+                                    enum nodemap_cluster_rec_subid subid)
 {
        nk->nk_nodemap_id = cpu_to_le32(nm_idx_set_type(nm_id,
                                                        NODEMAP_CLUSTER_IDX));
-       nk->nk_unused = 0;
+       nk->nk_cluster_subid = subid;
 }
 
 static void nodemap_cluster_rec_init(union nodemap_rec *nr,
                                     const struct lu_nodemap *nodemap)
 {
-       CLASSERT(sizeof(nr->ncr.ncr_name) == sizeof(nodemap->nm_name));
+       BUILD_BUG_ON(sizeof(nr->ncr.ncr_name) != sizeof(nodemap->nm_name));
 
-       strncpy(nr->ncr.ncr_name, nodemap->nm_name, sizeof(nodemap->nm_name));
+       strncpy(nr->ncr.ncr_name, nodemap->nm_name, sizeof(nr->ncr.ncr_name));
        nr->ncr.ncr_squash_uid = cpu_to_le32(nodemap->nm_squash_uid);
        nr->ncr.ncr_squash_gid = cpu_to_le32(nodemap->nm_squash_gid);
-       nr->ncr.ncr_flags = cpu_to_le32(
+       nr->ncr.ncr_squash_projid = cpu_to_le32(nodemap->nm_squash_projid);
+       nr->ncr.ncr_flags =
                (nodemap->nmf_trust_client_ids ?
                        NM_FL_TRUST_CLIENT_IDS : 0) |
                (nodemap->nmf_allow_root_access ?
                        NM_FL_ALLOW_ROOT_ACCESS : 0) |
                (nodemap->nmf_deny_unknown ?
-                       NM_FL_DENY_UNKNOWN : 0));
+                       NM_FL_DENY_UNKNOWN : 0) |
+               (nodemap->nmf_map_mode & NODEMAP_MAP_UID ?
+                       NM_FL_MAP_UID : 0) |
+               (nodemap->nmf_map_mode & NODEMAP_MAP_GID ?
+                       NM_FL_MAP_GID : 0) |
+               (nodemap->nmf_map_mode & NODEMAP_MAP_PROJID ?
+                       NM_FL_MAP_PROJID : 0) |
+               (nodemap->nmf_enable_audit ?
+                       NM_FL_ENABLE_AUDIT : 0) |
+               (nodemap->nmf_forbid_encryption ?
+                       NM_FL_FORBID_ENCRYPT : 0);
+       nr->ncr.ncr_flags2 =
+               (nodemap->nmf_readonly_mount ?
+                       NM_FL2_READONLY_MOUNT : 0);
+}
+
+/**
+ * Fill a cluster-roles index record from the in-memory nodemap.
+ *
+ * The roles view of the record union is cleared completely, then the
+ * nodemap's nmf_rbac mask is stored in little-endian form.
+ */
+static void nodemap_cluster_roles_rec_init(union nodemap_rec *nr,
+                                          const struct lu_nodemap *nodemap)
+{
+       /* zero first so every byte other than ncrr_roles is 0 */
+       memset(&nr->ncrr, 0, sizeof(nr->ncrr));
+       nr->ncrr.ncrr_roles = cpu_to_le64(nodemap->nmf_rbac);
 }
 
 static void nodemap_idmap_key_init(struct nodemap_key *nk, unsigned int nm_id,
@@ -106,8 +123,12 @@ static void nodemap_idmap_key_init(struct nodemap_key *nk, unsigned int nm_id,
 
        if (id_type == NODEMAP_UID)
                idx_type = NODEMAP_UIDMAP_IDX;
-       else
+       else if (id_type == NODEMAP_GID)
                idx_type = NODEMAP_GIDMAP_IDX;
+       else if (id_type == NODEMAP_PROJID)
+               idx_type = NODEMAP_PROJIDMAP_IDX;
+       else
+               idx_type = NODEMAP_EMPTY_IDX;
 
        nk->nk_nodemap_id = cpu_to_le32(nm_idx_set_type(nm_id, idx_type));
        nk->nk_id_client = cpu_to_le32(id_client);
@@ -153,16 +174,94 @@ static void nodemap_inc_version(const struct lu_env *env,
        dt_version_set(env, nodemap_idx, ver + 1, th);
 }
 
+/* Mode argument of nodemap_cache_find_create(). */
+enum ncfc_find_create {
+       /* remove an already existing index and start from a fresh one */
+       NCFC_CREATE_NEW = 1,
+};
+
+/**
+ * Find or create the nodemap index object under the device root.
+ *
+ * The index named LUSTRE_NODEMAP_NAME is looked up in the root directory of
+ * \a dev. With \a create_new == NCFC_CREATE_NEW an existing index is unlinked
+ * first and a new one is created. Otherwise the existing index is reused, and
+ * only if it cannot be set up as an index is it removed and recreated once.
+ *
+ * On a read-only device (dev->dd_rdonly) nothing is created or replaced:
+ * -EROFS is returned when the index is missing or when a new one is requested.
+ *
+ * Returns the index object, or an ERR_PTR() value on failure. The reference
+ * on the root object is dropped before returning.
+ */
+static struct dt_object *nodemap_cache_find_create(const struct lu_env *env,
+                                                  struct dt_device *dev,
+                                                  struct local_oid_storage *los,
+                                                  enum ncfc_find_create create_new)
+{
+       struct lu_fid tfid;
+       struct dt_object *root_obj;
+       struct dt_object *nm_obj;
+       int rc = 0;
+
+       rc = dt_root_get(env, dev, &tfid);
+       if (rc < 0)
+               GOTO(out, nm_obj = ERR_PTR(rc));
+
+       root_obj = dt_locate(env, dev, &tfid);
+       if (unlikely(IS_ERR(root_obj)))
+               GOTO(out, nm_obj = root_obj);
+
+       /* rc keeps the lookup result: -ENOENT below means "no index yet" */
+       rc = dt_lookup_dir(env, root_obj, LUSTRE_NODEMAP_NAME, &tfid);
+       if (rc == -ENOENT) {
+               /* index is missing and a read-only device cannot create it */
+               if (dev->dd_rdonly)
+                       GOTO(out_root, nm_obj = ERR_PTR(-EROFS));
+       } else if (rc) {
+               GOTO(out_root, nm_obj = ERR_PTR(rc));
+       } else if (dev->dd_rdonly && create_new == NCFC_CREATE_NEW) {
+               /* index exists but replacing it would need a write */
+               GOTO(out_root, nm_obj = ERR_PTR(-EROFS));
+       }
+
+again:
+       /* if loading index fails the first time, create new index */
+       if (create_new == NCFC_CREATE_NEW && rc != -ENOENT) {
+               CDEBUG(D_INFO, "removing old index, creating new one\n");
+               rc = local_object_unlink(env, dev, root_obj,
+                                        LUSTRE_NODEMAP_NAME);
+               if (rc < 0) {
+                       /* XXX not sure the best way to get obd name. */
+                       CERROR("cannot destroy nodemap index: rc = %d\n",
+                              rc);
+                       GOTO(out_root, nm_obj = ERR_PTR(rc));
+               }
+       }
+
+       nm_obj = local_index_find_or_create(env, los, root_obj,
+                                               LUSTRE_NODEMAP_NAME,
+                                               S_IFREG | S_IRUGO | S_IWUSR,
+                                               &dt_nodemap_features);
+       if (IS_ERR(nm_obj))
+               GOTO(out_root, nm_obj);
+
+       /* index operations are not set up yet, try to attach them */
+       if (nm_obj->do_index_ops == NULL) {
+               rc = nm_obj->do_ops->do_index_try(env, nm_obj,
+                                                     &dt_nodemap_features);
+               /* even if loading from tgt fails, connecting to MGS will
+                * rewrite the config
+                */
+               if (rc < 0) {
+                       dt_object_put(env, nm_obj);
+
+                       /* a freshly created index failed too, give up */
+                       if (create_new == NCFC_CREATE_NEW)
+                               GOTO(out_root, nm_obj = ERR_PTR(rc));
+
+                       CERROR("cannot load nodemap index from disk, creating "
+                              "new index: rc = %d\n", rc);
+                       /* retry exactly once with a brand new index */
+                       create_new = NCFC_CREATE_NEW;
+                       goto again;
+               }
+       }
+
+out_root:
+       dt_object_put(env, root_obj);
+out:
+       return nm_obj;
+}
+
 static int nodemap_idx_insert(const struct lu_env *env,
                              struct dt_object *idx,
                              const struct nodemap_key *nk,
                              const union nodemap_rec *nr)
 {
-       struct thandle          *th;
-       struct dt_device        *dev = lu2dt_dev(idx->do_lu.lo_dev);
-       int                      rc;
+       struct thandle *th;
+       struct dt_device *dev = lu2dt_dev(idx->do_lu.lo_dev);
+       int rc;
 
-       CLASSERT(sizeof(union nodemap_rec) == 32);
+       BUILD_BUG_ON(sizeof(union nodemap_rec) != 32);
 
        th = dt_trans_create(env, dev);
 
@@ -186,7 +285,7 @@ static int nodemap_idx_insert(const struct lu_env *env,
        dt_write_lock(env, idx, 0);
 
        rc = dt_insert(env, idx, (const struct dt_rec *)nr,
-                      (const struct dt_key *)nk, th, 1);
+                      (const struct dt_key *)nk, th);
 
        nodemap_inc_version(env, idx, th);
        dt_write_unlock(env, idx);
@@ -234,7 +333,7 @@ static int nodemap_idx_update(const struct lu_env *env,
                GOTO(out_lock, rc);
 
        rc = dt_insert(env, idx, (const struct dt_rec *)nr,
-                      (const struct dt_key *)nk, th, 1);
+                      (const struct dt_key *)nk, th);
        if (rc != 0)
                GOTO(out_lock, rc);
 
@@ -291,8 +390,10 @@ enum nm_add_update {
        NM_UPDATE = 1,
 };
 
-static int nodemap_idx_nodemap_add_update(const struct lu_nodemap *nodemap,
-                                         enum nm_add_update update)
+static int nodemap_idx_cluster_add_update(const struct lu_nodemap *nodemap,
+                                         struct dt_object *idx,
+                                         enum nm_add_update update,
+                                         enum nodemap_cluster_rec_subid subid)
 {
        struct nodemap_key nk;
        union nodemap_rec nr;
@@ -301,38 +402,51 @@ static int nodemap_idx_nodemap_add_update(const struct lu_nodemap *nodemap,
 
        ENTRY;
 
-       if (nodemap_mgs_ncf == NULL) {
-               CERROR("cannot add nodemap config to non-existing MGS.\n");
-               return -EINVAL;
+       if (idx == NULL) {
+               if (nodemap_mgs_ncf == NULL) {
+                       CERROR("cannot add nodemap config to non-existing MGS.\n");
+                       return -EINVAL;
+               }
+               idx = nodemap_mgs_ncf->ncf_obj;
        }
 
        rc = lu_env_init(&env, LCT_LOCAL);
        if (rc)
                RETURN(rc);
 
-       nodemap_cluster_key_init(&nk, nodemap->nm_id);
-       nodemap_cluster_rec_init(&nr, nodemap);
+       nodemap_cluster_key_init(&nk, nodemap->nm_id, subid);
+       switch (subid) {
+       case NODEMAP_CLUSTER_REC:
+               nodemap_cluster_rec_init(&nr, nodemap);
+               break;
+       case NODEMAP_CLUSTER_ROLES:
+               nodemap_cluster_roles_rec_init(&nr, nodemap);
+               break;
+       default:
+               CWARN("%s: unknown subtype %u\n", nodemap->nm_name, subid);
+               GOTO(fini, rc = -EINVAL);
+       }
 
        if (update == NM_UPDATE)
-               rc = nodemap_idx_update(&env, nodemap_mgs_ncf->ncf_obj,
-                                       &nk, &nr);
+               rc = nodemap_idx_update(&env, idx, &nk, &nr);
        else
-               rc = nodemap_idx_insert(&env, nodemap_mgs_ncf->ncf_obj,
-                                       &nk, &nr);
+               rc = nodemap_idx_insert(&env, idx, &nk, &nr);
 
+fini:
        lu_env_fini(&env);
-
        RETURN(rc);
 }
 
 int nodemap_idx_nodemap_add(const struct lu_nodemap *nodemap)
 {
-       return nodemap_idx_nodemap_add_update(nodemap, NM_ADD);
+       return nodemap_idx_cluster_add_update(nodemap, NULL,
+                                             NM_ADD, NODEMAP_CLUSTER_REC);
 }
 
 int nodemap_idx_nodemap_update(const struct lu_nodemap *nodemap)
 {
-       return nodemap_idx_nodemap_add_update(nodemap, NM_UPDATE);
+       return nodemap_idx_cluster_add_update(nodemap, NULL,
+                                             NM_UPDATE, NODEMAP_CLUSTER_REC);
 }
 
 int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap)
@@ -358,6 +472,11 @@ int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap)
        if (rc != 0)
                RETURN(rc);
 
+       nodemap_cluster_key_init(&nk, nodemap->nm_id, NODEMAP_CLUSTER_ROLES);
+       rc2 = nodemap_idx_delete(&env, nodemap_mgs_ncf->ncf_obj, &nk, NULL);
+       if (rc2 < 0 && rc2 != -ENOENT)
+               rc = rc2;
+
        root = nodemap->nm_fs_to_client_uidmap;
        nm_rbtree_postorder_for_each_entry_safe(idmap, temp, &root,
                                                id_fs_to_client) {
@@ -380,6 +499,17 @@ int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap)
                        rc = rc2;
        }
 
+       root = nodemap->nm_client_to_fs_projidmap;
+       nm_rbtree_postorder_for_each_entry_safe(idmap, temp, &root,
+                                               id_client_to_fs) {
+               nodemap_idmap_key_init(&nk, nodemap->nm_id, NODEMAP_PROJID,
+                                      idmap->id_client);
+               rc2 = nodemap_idx_delete(&env, nodemap_mgs_ncf->ncf_obj,
+                                        &nk, NULL);
+               if (rc2 < 0)
+                       rc = rc2;
+       }
+
        list_for_each_entry_safe(range, range_temp, &nodemap->nm_ranges,
                                 rn_list) {
                nodemap_range_key_init(&nk, nodemap->nm_id, range->rn_id);
@@ -389,7 +519,7 @@ int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap)
                        rc = rc2;
        }
 
-       nodemap_cluster_key_init(&nk, nodemap->nm_id);
+       nodemap_cluster_key_init(&nk, nodemap->nm_id, NODEMAP_CLUSTER_REC);
        rc2 = nodemap_idx_delete(&env, nodemap_mgs_ncf->ncf_obj, &nk, NULL);
        if (rc2 < 0)
                rc = rc2;
@@ -399,6 +529,42 @@ int nodemap_idx_nodemap_del(const struct lu_nodemap *nodemap)
        RETURN(rc);
 }
 
+/**
+ * Insert the NODEMAP_CLUSTER_ROLES record of \a nodemap into the index.
+ *
+ * A NULL index is passed, so the helper uses the MGS index object.
+ */
+int nodemap_idx_cluster_roles_add(const struct lu_nodemap *nodemap)
+{
+       int rc;
+
+       rc = nodemap_idx_cluster_add_update(nodemap, NULL, NM_ADD,
+                                           NODEMAP_CLUSTER_ROLES);
+       return rc;
+}
+
+/**
+ * Update the NODEMAP_CLUSTER_ROLES record of \a nodemap in the index.
+ *
+ * A NULL index is passed, so the helper uses the MGS index object.
+ */
+int nodemap_idx_cluster_roles_update(const struct lu_nodemap *nodemap)
+{
+       int rc;
+
+       rc = nodemap_idx_cluster_add_update(nodemap, NULL, NM_UPDATE,
+                                           NODEMAP_CLUSTER_ROLES);
+       return rc;
+}
+
+/**
+ * Delete the NODEMAP_CLUSTER_ROLES record of \a nodemap from the MGS index.
+ *
+ * Returns -EINVAL when no MGS nodemap config file is registered, the error
+ * from lu_env_init() when the environment cannot be set up, and otherwise
+ * the value returned by nodemap_idx_delete().
+ */
+int nodemap_idx_cluster_roles_del(const struct lu_nodemap *nodemap)
+{
+       struct nodemap_key nk;
+       struct lu_env env;
+       int rc;
+
+       ENTRY;
+
+       if (nodemap_mgs_ncf == NULL) {
+               CERROR("cannot delete nodemap roles from non-existing MGS.\n");
+               /* ENTRY was already issued, so leave through RETURN */
+               RETURN(-EINVAL);
+       }
+
+       rc = lu_env_init(&env, LCT_LOCAL);
+       if (rc != 0)
+               RETURN(rc);
+
+       nodemap_cluster_key_init(&nk, nodemap->nm_id, NODEMAP_CLUSTER_ROLES);
+       rc = nodemap_idx_delete(&env, nodemap_mgs_ncf->ncf_obj, &nk, NULL);
+
+       lu_env_fini(&env);
+       RETURN(rc);
+}
+
 int nodemap_idx_range_add(const struct lu_nid_range *range,
                          const lnet_nid_t nid[2])
 {
@@ -541,6 +707,101 @@ int nodemap_idx_nodemap_activate(bool value)
        return nodemap_idx_global_add_update(value, NM_UPDATE);
 }
 
+/* Extract the index type stored in the little-endian nk_nodemap_id field. */
+static enum nodemap_idx_type nodemap_get_key_type(const struct nodemap_key *key)
+{
+       return nm_idx_get_type(le32_to_cpu(key->nk_nodemap_id));
+}
+
+/*
+ * Return the cluster sub-id of a key, or -1 when the key is not of type
+ * NODEMAP_CLUSTER_IDX.
+ */
+static int nodemap_get_key_subtype(const struct nodemap_key *key)
+{
+       if (nodemap_get_key_type(key) != NODEMAP_CLUSTER_IDX)
+               return -1;
+
+       return key->nk_cluster_subid;
+}
+
+/**
+ * Apply a NODEMAP_CLUSTER_REC record to the configuration being loaded.
+ *
+ * The nodemap named in the record is looked up in \a config and created when
+ * missing. Its squash IDs and flag bits are then set from the record, its
+ * roles are reset to NODEMAP_RBAC_ALL, and it is linked into the list headed
+ * by \a *recent_nodemap.
+ *
+ * Returns 0 on success, the error from nodemap_create() when creation fails,
+ * or -EINVAL when a nodemap of that name exists with a different ID.
+ */
+static int nodemap_cluster_rec_helper(struct nodemap_config *config,
+                                     u32 nodemap_id,
+                                     const union nodemap_rec *rec,
+                                     struct lu_nodemap **recent_nodemap)
+{
+       struct lu_nodemap *nodemap, *old_nm;
+       enum nm_flag_bits flags;
+       enum nm_flag2_bits flags2;
+
+       nodemap = cfs_hash_lookup(config->nmc_nodemap_hash, rec->ncr.ncr_name);
+       if (nodemap == NULL) {
+               /* last argument is 1 only for the default nodemap ID */
+               if (nodemap_id == LUSTRE_NODEMAP_DEFAULT_ID)
+                       nodemap = nodemap_create(rec->ncr.ncr_name, config, 1);
+               else
+                       nodemap = nodemap_create(rec->ncr.ncr_name, config, 0);
+               if (IS_ERR(nodemap))
+                       return PTR_ERR(nodemap);
+
+               /* we need to override the local ID with the saved ID */
+               nodemap->nm_id = nodemap_id;
+               if (nodemap_id > config->nmc_nodemap_highest_id)
+                       config->nmc_nodemap_highest_id = nodemap_id;
+
+       } else if (nodemap->nm_id != nodemap_id) {
+               /* same name but a different ID: reject the record */
+               nodemap_putref(nodemap);
+               return -EINVAL;
+       }
+
+       nodemap->nm_squash_uid = le32_to_cpu(rec->ncr.ncr_squash_uid);
+       nodemap->nm_squash_gid = le32_to_cpu(rec->ncr.ncr_squash_gid);
+       nodemap->nm_squash_projid = le32_to_cpu(rec->ncr.ncr_squash_projid);
+
+       /* ncr_flags is read as stored, without a byte-order conversion */
+       flags = rec->ncr.ncr_flags;
+       nodemap->nmf_allow_root_access = flags & NM_FL_ALLOW_ROOT_ACCESS;
+       nodemap->nmf_trust_client_ids = flags & NM_FL_TRUST_CLIENT_IDS;
+       nodemap->nmf_deny_unknown = flags & NM_FL_DENY_UNKNOWN;
+       nodemap->nmf_map_mode =
+               (flags & NM_FL_MAP_UID ? NODEMAP_MAP_UID : 0) |
+               (flags & NM_FL_MAP_GID ? NODEMAP_MAP_GID : 0) |
+               (flags & NM_FL_MAP_PROJID ? NODEMAP_MAP_PROJID : 0);
+       /* a record holding the legacy "both" mode is loaded as the current one */
+       if (nodemap->nmf_map_mode == NODEMAP_MAP_BOTH_LEGACY)
+               nodemap->nmf_map_mode = NODEMAP_MAP_BOTH;
+       nodemap->nmf_enable_audit = flags & NM_FL_ENABLE_AUDIT;
+       nodemap->nmf_forbid_encryption = flags & NM_FL_FORBID_ENCRYPT;
+       flags2 = rec->ncr.ncr_flags2;
+       nodemap->nmf_readonly_mount = flags2 & NM_FL2_READONLY_MOUNT;
+       /* by default, and in the absence of cluster_roles, grant all roles */
+       nodemap->nmf_rbac = NODEMAP_RBAC_ALL;
+
+       /* The fileset should be saved otherwise it will be empty
+        * every time in case of "NODEMAP_CLUSTER_IDX": copy it from the
+        * nodemap of the same name found through nodemap_lookup(), if any.
+        */
+       mutex_lock(&active_config_lock);
+       old_nm = nodemap_lookup(rec->ncr.ncr_name);
+       if (!IS_ERR(old_nm) && old_nm->nm_fileset[0] != '\0')
+               strlcpy(nodemap->nm_fileset, old_nm->nm_fileset,
+                       sizeof(nodemap->nm_fileset));
+       mutex_unlock(&active_config_lock);
+       if (!IS_ERR(old_nm))
+               nodemap_putref(old_nm);
+
+       /* first nodemap becomes the list head, later ones are linked to it */
+       if (*recent_nodemap == NULL) {
+               *recent_nodemap = nodemap;
+               INIT_LIST_HEAD(&nodemap->nm_list);
+       } else {
+               list_add(&nodemap->nm_list, &(*recent_nodemap)->nm_list);
+       }
+       /* NOTE(review): presumably pairs with the reference obtained from
+        * cfs_hash_lookup()/nodemap_create() above -- confirm
+        */
+       nodemap_putref(nodemap);
+
+       return 0;
+}
+
+/*
+ * Apply a NODEMAP_CLUSTER_ROLES record: store the little-endian role mask
+ * from the record into the nodemap. Always returns 0.
+ */
+static int nodemap_cluster_roles_helper(struct lu_nodemap *nodemap,
+                                       const union nodemap_rec *rec)
+{
+       const struct nodemap_cluster_roles_rec *ncrr = &rec->ncrr;
+
+       nodemap->nmf_rbac = le64_to_cpu(ncrr->ncrr_roles);
+       return 0;
+}
+
 /**
  * Process a key/rec pair and modify the new configuration.
  *
@@ -559,27 +820,31 @@ static int nodemap_process_keyrec(struct nodemap_config *config,
                                  const union nodemap_rec *rec,
                                  struct lu_nodemap **recent_nodemap)
 {
-       struct lu_nodemap       *nodemap = NULL;
-       enum nodemap_idx_type    type;
-       enum nodemap_id_type     id_type;
-       u8                       flags;
-       u32                      nodemap_id;
-       lnet_nid_t               nid[2];
-       u32                      map[2];
-       int                      rc;
+       struct lu_nodemap *nodemap = NULL;
+       enum nodemap_idx_type type;
+       enum nodemap_id_type id_type;
+       int subtype;
+       u32 nodemap_id;
+       lnet_nid_t nid[2];
+       u32 map[2];
+       int rc;
+
+       ENTRY;
 
-       CLASSERT(sizeof(union nodemap_rec) == 32);
+       BUILD_BUG_ON(sizeof(union nodemap_rec) != 32);
 
        nodemap_id = le32_to_cpu(key->nk_nodemap_id);
-       type = nm_idx_get_type(nodemap_id);
+       type = nodemap_get_key_type(key);
+       subtype = nodemap_get_key_subtype(key);
        nodemap_id = nm_idx_set_type(nodemap_id, 0);
 
-       CDEBUG(D_INFO, "found config entry, nm_id %d type %d\n",
-              nodemap_id, type);
+       CDEBUG(D_INFO, "found config entry, nm_id %d type %d subtype %d\n",
+              nodemap_id, type, subtype);
 
        /* find the correct nodemap in the load list */
        if (type == NODEMAP_RANGE_IDX || type == NODEMAP_UIDMAP_IDX ||
-           type == NODEMAP_GIDMAP_IDX) {
+           type == NODEMAP_GIDMAP_IDX || type == NODEMAP_PROJIDMAP_IDX ||
+           (type == NODEMAP_CLUSTER_IDX && subtype != NODEMAP_CLUSTER_REC)) {
                struct lu_nodemap *tmp = NULL;
 
                nodemap = *recent_nodemap;
@@ -611,51 +876,24 @@ static int nodemap_process_keyrec(struct nodemap_config *config,
                              nodemap_id);
                break;
        case NODEMAP_CLUSTER_IDX:
-               nodemap = cfs_hash_lookup(config->nmc_nodemap_hash,
-                                         rec->ncr.ncr_name);
-               if (nodemap == NULL) {
-                       if (nodemap_id == LUSTRE_NODEMAP_DEFAULT_ID) {
-                               nodemap = nodemap_create(rec->ncr.ncr_name,
-                                                        config, 1);
-                               config->nmc_default_nodemap = nodemap;
-                       } else {
-                               nodemap = nodemap_create(rec->ncr.ncr_name,
-                                                        config, 0);
-                       }
-                       if (IS_ERR(nodemap))
-                               GOTO(out, rc = PTR_ERR(nodemap));
-
-                       /* we need to override the local ID with the saved ID */
-                       nodemap->nm_id = nodemap_id;
-                       if (nodemap_id > config->nmc_nodemap_highest_id)
-                               config->nmc_nodemap_highest_id = nodemap_id;
-
-               } else if (nodemap->nm_id != nodemap_id) {
-                       nodemap_putref(nodemap);
-                       GOTO(out, rc = -EINVAL);
-               }
-
-               nodemap->nm_squash_uid =
-                               le32_to_cpu(rec->ncr.ncr_squash_uid);
-               nodemap->nm_squash_gid =
-                               le32_to_cpu(rec->ncr.ncr_squash_gid);
-
-               flags = le32_to_cpu(rec->ncr.ncr_flags);
-               nodemap->nmf_allow_root_access =
-                                       flags & NM_FL_ALLOW_ROOT_ACCESS;
-               nodemap->nmf_trust_client_ids =
-                                       flags & NM_FL_TRUST_CLIENT_IDS;
-               nodemap->nmf_deny_unknown =
-                                       flags & NM_FL_DENY_UNKNOWN;
-
-               if (*recent_nodemap == NULL) {
-                       *recent_nodemap = nodemap;
-                       INIT_LIST_HEAD(&nodemap->nm_list);
-               } else {
-                       list_add(&nodemap->nm_list,
-                                &(*recent_nodemap)->nm_list);
+               switch (nodemap_get_key_subtype(key)) {
+               case NODEMAP_CLUSTER_REC:
+                       rc = nodemap_cluster_rec_helper(config, nodemap_id, rec,
+                                                       recent_nodemap);
+                       if (rc != 0)
+                               GOTO(out, rc);
+                       break;
+               case NODEMAP_CLUSTER_ROLES:
+                       rc = nodemap_cluster_roles_helper(nodemap, rec);
+                       if (rc != 0)
+                               GOTO(out, rc);
+                       break;
+               default:
+                       CWARN("%s: ignoring keyrec of type %d with subtype %u\n",
+                             nodemap->nm_name, NODEMAP_CLUSTER_IDX,
+                             nodemap_get_key_subtype(key));
+                       break;
                }
-               nodemap_putref(nodemap);
                break;
        case NODEMAP_RANGE_IDX:
                nid[0] = le64_to_cpu(rec->nrr.nrr_start_nid);
@@ -668,43 +906,68 @@ static int nodemap_process_keyrec(struct nodemap_config *config,
                break;
        case NODEMAP_UIDMAP_IDX:
        case NODEMAP_GIDMAP_IDX:
+       case NODEMAP_PROJIDMAP_IDX:
                map[0] = le32_to_cpu(key->nk_id_client);
                map[1] = le32_to_cpu(rec->nir.nir_id_fs);
 
                if (type == NODEMAP_UIDMAP_IDX)
                        id_type = NODEMAP_UID;
-               else
+               else if (type == NODEMAP_GIDMAP_IDX)
                        id_type = NODEMAP_GID;
+               else if (type == NODEMAP_PROJIDMAP_IDX)
+                       id_type = NODEMAP_PROJID;
+               else
+                       GOTO(out, rc = -EINVAL);
 
                rc = nodemap_add_idmap_helper(nodemap, id_type, map);
                if (rc != 0)
                        GOTO(out, rc);
                break;
        case NODEMAP_GLOBAL_IDX:
-               config->nmc_nodemap_is_active = rec->ngr.ngr_is_active;
+               switch (key->nk_unused) {
+               case 0:
+                       config->nmc_nodemap_is_active = rec->ngr.ngr_is_active;
+                       break;
+               default:
+                       CWARN("%s: ignoring keyrec of type %d with subtype %u\n",
+                             recent_nodemap ?
+                              (*recent_nodemap)->nm_name : "nodemap",
+                             NODEMAP_GLOBAL_IDX, key->nk_unused);
+                       break;
+               }
                break;
        default:
-               CERROR("got keyrec pair for unknown type %d\n", type);
+               CWARN("%s: ignoring key %u:%u for unknown type %u\n",
+                     recent_nodemap ? (*recent_nodemap)->nm_name : "nodemap",
+                     key->nk_nodemap_id & 0x0FFFFFFF, key->nk_unused, type);
                break;
        }
 
        rc = type;
 
+       EXIT;
+
 out:
        return rc;
 }
 
+/* Passes made over the index by nodemap_load_entries(). */
+enum nm_config_passes {
+       /* first pass: only NODEMAP_CLUSTER_IDX keys of subtype NODEMAP_CLUSTER_REC */
+       NM_READ_CLUSTERS = 0,
+       /* second pass: every other key except those of type NODEMAP_EMPTY_IDX */
+       NM_READ_ATTRIBUTES = 1,
+};
+
 static int nodemap_load_entries(const struct lu_env *env,
                                struct dt_object *nodemap_idx)
 {
-       const struct dt_it_ops  *iops;
-       struct dt_it            *it;
-       struct lu_nodemap       *recent_nodemap = NULL;
-       struct nodemap_config   *new_config = NULL;
-       u64                      hash = 0;
-       bool                     activate_nodemap = false;
-       bool                     loaded_global_idx = false;
-       int                      rc = 0;
+       const struct dt_it_ops *iops;
+       struct dt_it *it;
+       struct lu_nodemap *recent_nodemap = NULL;
+       struct nodemap_config *new_config = NULL;
+       u64 hash = 0;
+       bool activate_nodemap = false;
+       bool loaded_global_idx = false;
+       enum nm_config_passes cur_pass = NM_READ_CLUSTERS;
+       int rc = 0;
 
        ENTRY;
 
@@ -716,52 +979,87 @@ static int nodemap_load_entries(const struct lu_env *env,
                GOTO(out, rc = PTR_ERR(it));
 
        rc = iops->load(env, it, hash);
+       if (rc < 0)
+               GOTO(out_iops_fini, rc);
+
+       /* rc == 0 means we need to advance to record */
        if (rc == 0) {
                rc = iops->next(env, it);
-               if (rc != 0)
-                       GOTO(out_iops, rc = 0);
+
+               if (rc < 0)
+                       GOTO(out_iops_put, rc);
+               /* rc > 0 is eof, will be checked in while below */
+       } else {
+               /* rc == 1, we found initial record and can process below */
+               rc = 0;
        }
 
        new_config = nodemap_config_alloc();
        if (IS_ERR(new_config)) {
                rc = PTR_ERR(new_config);
                new_config = NULL;
-               GOTO(out_lock, rc);
+               GOTO(out_iops_put, rc);
        }
 
-       do {
+       /* rc > 0 is eof, check initial iops->next here as well */
+       while (rc == 0) {
                struct nodemap_key *key;
                union nodemap_rec rec;
+               enum nodemap_idx_type key_type;
+               int sub_type;
 
                key = (struct nodemap_key *)iops->key(env, it);
-               rc = iops->rec(env, it, (struct dt_rec *)&rec, 0);
-               if (rc != -ESTALE) {
-                       if (rc != 0)
-                               GOTO(out_lock, rc);
-                       rc = nodemap_process_keyrec(new_config, key, &rec,
-                                                   &recent_nodemap);
-                       if (rc < 0)
-                               GOTO(out_lock, rc);
-                       if (rc == NODEMAP_GLOBAL_IDX)
-                               loaded_global_idx = true;
+               key_type = nodemap_get_key_type((struct nodemap_key *)key);
+               sub_type = nodemap_get_key_subtype((struct nodemap_key *)key);
+               if ((cur_pass == NM_READ_CLUSTERS &&
+                    key_type == NODEMAP_CLUSTER_IDX &&
+                    sub_type == NODEMAP_CLUSTER_REC) ||
+                   (cur_pass == NM_READ_ATTRIBUTES &&
+                    (key_type != NODEMAP_CLUSTER_IDX ||
+                     sub_type != NODEMAP_CLUSTER_REC) &&
+                    key_type != NODEMAP_EMPTY_IDX)) {
+                       rc = iops->rec(env, it, (struct dt_rec *)&rec, 0);
+                       if (rc != -ESTALE) {
+                               if (rc != 0)
+                                       GOTO(out_nodemap_config, rc);
+                               rc = nodemap_process_keyrec(new_config, key, &rec,
+                                                           &recent_nodemap);
+                               if (rc < 0)
+                                       GOTO(out_nodemap_config, rc);
+                               if (rc == NODEMAP_GLOBAL_IDX)
+                                       loaded_global_idx = true;
+                       }
                }
 
                do
                        rc = iops->next(env, it);
                while (rc == -ESTALE);
-       } while (rc == 0);
+
+               /* move to second pass */
+               if (rc > 0 && cur_pass == NM_READ_CLUSTERS) {
+                       cur_pass = NM_READ_ATTRIBUTES;
+                       rc = iops->load(env, it, 0);
+                       if (rc == 0)
+                               rc = iops->next(env, it);
+                       else if (rc > 0)
+                               rc = 0;
+                       else
+                               GOTO(out, rc);
+               }
+       }
 
        if (rc > 0)
                rc = 0;
 
-out_lock:
+out_nodemap_config:
        if (rc != 0)
                nodemap_config_dealloc(new_config);
        else
                /* creating new default needs to be done outside dt read lock */
                activate_nodemap = true;
-out_iops:
+out_iops_put:
        iops->put(env, it);
+out_iops_fini:
        iops->fini(env, it);
 out:
        dt_read_unlock(env, nodemap_idx);
@@ -775,14 +1073,15 @@ out:
 
        if (new_config->nmc_default_nodemap == NULL) {
                /* new MGS won't have a default nm on disk, so create it here */
-               new_config->nmc_default_nodemap =
+               struct lu_nodemap *nodemap =
                        nodemap_create(DEFAULT_NODEMAP, new_config, 1);
-               if (IS_ERR(new_config->nmc_default_nodemap)) {
-                       rc = PTR_ERR(new_config->nmc_default_nodemap);
+               if (IS_ERR(nodemap)) {
+                       rc = PTR_ERR(nodemap);
                } else {
-                       rc = nodemap_idx_nodemap_add_update(
+                       rc = nodemap_idx_cluster_add_update(
                                        new_config->nmc_default_nodemap,
-                                       NM_ADD);
+                                       nodemap_idx,
+                                       NM_ADD, NODEMAP_CLUSTER_REC);
                        nodemap_putref(new_config->nmc_default_nodemap);
                }
        }
@@ -805,6 +1104,166 @@ out:
        RETURN(rc);
 }
 
+/**
+ * Step through active config and write to disk.
+ *
+ * A new index object is obtained from nodemap_cache_find_create() with
+ * NCFC_CREATE_NEW and filled, under active_config_lock, with the records of
+ * every nodemap in the active config: the cluster record, the roles record
+ * (only when nmf_rbac differs from NODEMAP_RBAC_ALL), one record per NID
+ * range, and one record per UID, GID and PROJID idmap. The global record is
+ * written last.
+ *
+ * Insert failures do not abort the walk. The last negative return code is
+ * kept in rc and reported once every record has been attempted.
+ *
+ * \param env  execution environment passed to the index helpers
+ * \param dev  device passed to nodemap_cache_find_create()
+ * \param los  local OID storage passed to nodemap_cache_find_create()
+ *
+ * \retval     the new index object on success
+ * \retval     ERR_PTR(-errno) if the object could not be created or any
+ *             insert failed
+ */
+struct dt_object *nodemap_save_config_cache(const struct lu_env *env,
+                                           struct dt_device *dev,
+                                           struct local_oid_storage *los)
+{
+       struct dt_object *o;
+       struct lu_nodemap *nodemap;
+       struct lu_nodemap *nm_tmp;
+       struct lu_nid_range *range;
+       struct lu_nid_range *range_temp;
+       struct lu_idmap *idmap;
+       struct lu_idmap *id_tmp;
+       struct rb_root root;
+       struct nodemap_key nk;
+       union nodemap_rec nr;
+       LIST_HEAD(nodemap_list_head);
+       int rc = 0, rc2;
+
+       ENTRY;
+
+       /* create a new index file to fill with active config */
+       o = nodemap_cache_find_create(env, dev, los, NCFC_CREATE_NEW);
+       if (IS_ERR(o))
+               RETURN(o);
+
+       mutex_lock(&active_config_lock);
+
+       /* convert hash to list so we don't spin */
+       cfs_hash_for_each_safe(active_config->nmc_nodemap_hash,
+                              nm_hash_list_cb, &nodemap_list_head);
+
+       list_for_each_entry_safe(nodemap, nm_tmp, &nodemap_list_head, nm_list) {
+               nodemap_cluster_key_init(&nk, nodemap->nm_id,
+                                        NODEMAP_CLUSTER_REC);
+               nodemap_cluster_rec_init(&nr, nodemap);
+
+               rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+               if (rc2 < 0) {
+                       /* cluster record failed: remember the error and skip
+                        * the remaining records of this nodemap
+                        */
+                       rc = rc2;
+                       continue;
+               }
+
+               /* only insert NODEMAP_CLUSTER_ROLES idx in saved config cache
+                * if nmf_rbac is not default value NODEMAP_RBAC_ALL
+                */
+               if (nodemap->nmf_rbac != NODEMAP_RBAC_ALL) {
+                       nodemap_cluster_key_init(&nk, nodemap->nm_id,
+                                                NODEMAP_CLUSTER_ROLES);
+                       nodemap_cluster_roles_rec_init(&nr, nodemap);
+                       rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+                       if (rc2 < 0)
+                               rc = rc2;
+               }
+
+               /* NID ranges are walked under the range tree read lock */
+               down_read(&active_config->nmc_range_tree_lock);
+               list_for_each_entry_safe(range, range_temp, &nodemap->nm_ranges,
+                                        rn_list) {
+                       lnet_nid_t nid[2] = {
+                               range->rn_start,
+                               range->rn_end
+                       };
+                       nodemap_range_key_init(&nk, nodemap->nm_id,
+                                              range->rn_id);
+                       nodemap_range_rec_init(&nr, nid);
+                       rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+                       if (rc2 < 0)
+                               rc = rc2;
+               }
+               up_read(&active_config->nmc_range_tree_lock);
+
+               /* we don't need to take nm_idmap_lock because active config
+                * lock prevents changes from happening to nodemaps
+                */
+               /* each idmap tree is iterated through a local copy of its
+                * root, keyed by client id with the fs id as the record
+                */
+               root = nodemap->nm_client_to_fs_uidmap;
+               nm_rbtree_postorder_for_each_entry_safe(idmap, id_tmp, &root,
+                                                       id_client_to_fs) {
+                       nodemap_idmap_key_init(&nk, nodemap->nm_id, NODEMAP_UID,
+                                              idmap->id_client);
+                       nodemap_idmap_rec_init(&nr, idmap->id_fs);
+                       rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+                       if (rc2 < 0)
+                               rc = rc2;
+               }
+
+               root = nodemap->nm_client_to_fs_gidmap;
+               nm_rbtree_postorder_for_each_entry_safe(idmap, id_tmp, &root,
+                                                       id_client_to_fs) {
+                       nodemap_idmap_key_init(&nk, nodemap->nm_id, NODEMAP_GID,
+                                              idmap->id_client);
+                       nodemap_idmap_rec_init(&nr, idmap->id_fs);
+                       rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+                       if (rc2 < 0)
+                               rc = rc2;
+               }
+
+               root = nodemap->nm_client_to_fs_projidmap;
+               nm_rbtree_postorder_for_each_entry_safe(idmap, id_tmp, &root,
+                                                       id_client_to_fs) {
+                       nodemap_idmap_key_init(&nk, nodemap->nm_id,
+                                              NODEMAP_PROJID,
+                                              idmap->id_client);
+                       nodemap_idmap_rec_init(&nr, idmap->id_fs);
+                       rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+                       if (rc2 < 0)
+                               rc = rc2;
+               }
+       }
+       /* the global record stores whether nodemaps are active */
+       nodemap_global_key_init(&nk);
+       nodemap_global_rec_init(&nr, active_config->nmc_nodemap_is_active);
+       rc2 = nodemap_idx_insert(env, o, &nk, &nr);
+       if (rc2 < 0)
+               rc = rc2;
+
+       mutex_unlock(&active_config_lock);
+
+       /* any failed insert: put the new object and hand back the error */
+       if (rc < 0) {
+               dt_object_put(env, o);
+               o = ERR_PTR(rc);
+       }
+
+       RETURN(o);
+}
+
+/**
+ * Rewrite the nodemap cache of every config file on ncf_list_head from the
+ * active in-memory config.
+ *
+ * For each entry the current index object is put and replaced by the object
+ * returned from nodemap_save_config_cache(). If saving fails a warning with
+ * the real error code is logged and ncf_obj stays NULL for that entry; such
+ * entries are skipped on later calls because the device can no longer be
+ * derived from them.
+ */
+static void nodemap_save_all_caches(void)
+{
+       struct nm_config_file   *ncf;
+       struct lu_env            env;
+       int                      rc = 0;
+
+       /* recreating nodemap cache requires fld_thread_key be in env */
+       rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD);
+       if (rc != 0) {
+               CWARN("cannot init env for nodemap config: rc = %d\n", rc);
+               return;
+       }
+
+       mutex_lock(&ncf_list_lock);
+       list_for_each_entry(ncf, &ncf_list_head, ncf_list) {
+               struct dt_device *dev;
+               struct obd_device *obd;
+               struct dt_object *o;
+
+               /* an earlier failed save leaves ncf_obj NULL; dev and obd
+                * are only reachable through it, so skip instead of
+                * dereferencing a NULL object
+                */
+               if (ncf->ncf_obj == NULL)
+                       continue;
+
+               dev = lu2dt_dev(ncf->ncf_obj->do_lu.lo_dev);
+               obd = ncf->ncf_obj->do_lu.lo_dev->ld_obd;
+
+               /* put current config file so save conf can rewrite it */
+               dt_object_put_nocache(&env, ncf->ncf_obj);
+               ncf->ncf_obj = NULL;
+
+               o = nodemap_save_config_cache(&env, dev, ncf->ncf_los);
+               if (IS_ERR(o)) {
+                       /* report the actual failure, not the stale rc == 0 */
+                       rc = PTR_ERR(o);
+                       CWARN("%s: error writing to nodemap config: rc = %d\n",
+                             obd->obd_name, rc);
+               } else {
+                       ncf->ncf_obj = o;
+               }
+       }
+       mutex_unlock(&ncf_list_lock);
+
+       lu_env_fini(&env);
+}
+
 /* tracks if config still needs to be loaded, either from disk or network */
 static bool nodemap_config_loaded;
 static DEFINE_MUTEX(nodemap_config_loaded_lock);
@@ -820,6 +1279,7 @@ void nodemap_config_set_active_mgc(struct nodemap_config *config)
        mutex_lock(&nodemap_config_loaded_lock);
        nodemap_config_set_active(config);
        nodemap_config_loaded = true;
+       nodemap_save_all_caches();
        mutex_unlock(&nodemap_config_loaded_lock);
 }
 EXPORT_SYMBOL(nodemap_config_set_active_mgc);
@@ -836,83 +1296,142 @@ EXPORT_SYMBOL(nodemap_config_set_active_mgc);
  * \retval     -ENOMEM         memory allocation failure
  * \retval     -ENOENT         error loading nodemap config
  * \retval     -EINVAL         error loading nodemap config
+ * \retval     -EEXIST         nodemap config already registered for MGS
  */
-struct nm_config_file *nm_config_file_register(const struct lu_env *env,
-                                              struct dt_object *obj,
-                                              struct local_oid_storage *los,
-                                              enum nm_config_file_type ncf_type)
+struct nm_config_file *nm_config_file_register_mgs(const struct lu_env *env,
+                                                  struct dt_object *obj,
+                                                  struct local_oid_storage *los)
 {
        struct nm_config_file *ncf;
        int rc = 0;
        ENTRY;
 
+       /* only one MGS config file may be registered at a time */
+       if (nodemap_mgs_ncf != NULL)
+               GOTO(out, ncf = ERR_PTR(-EEXIST));
+
        OBD_ALLOC_PTR(ncf);
        if (ncf == NULL)
-               RETURN(ERR_PTR(-ENOMEM));
+               GOTO(out, ncf = ERR_PTR(-ENOMEM));
+
+       /* if loading from cache, prevent activation of MGS config until cache
+        * loading is done, so disk config is overwritten by MGS config.
+        */
+       mutex_lock(&nodemap_config_loaded_lock);
+       rc = nodemap_load_entries(env, obj);
+       if (!rc)
+               nodemap_config_loaded = true;
+       mutex_unlock(&nodemap_config_loaded_lock);
+
+       /* loading failed: nothing was published yet, just free and bail */
+       if (rc) {
+               OBD_FREE_PTR(ncf);
+               GOTO(out, ncf = ERR_PTR(rc));
+       }
+
+       /* ncf holds its own reference on obj; it is put again in
+        * nm_config_file_deregister_mgs()
+        */
+       lu_object_get(&obj->do_lu);
 
        ncf->ncf_obj = obj;
        ncf->ncf_los = los;
 
-       if (ncf_type == NCFT_MGS) {
-               nodemap_mgs_ncf = ncf;
-       } else {
-               mutex_lock(&ncf_list_lock);
-               list_add(&ncf->ncf_list, &ncf_list_head);
-               mutex_unlock(&ncf_list_lock);
-       }
+       nodemap_mgs_ncf = ncf;
 
-       /* prevent activation of config loaded from MGS until disk is loaded
-        * so disk config is overwritten by MGS config.
-        */
+out:
+       /* RETURN pairs with ENTRY above so the exit is traced on all paths */
+       RETURN(ncf);
+}
+EXPORT_SYMBOL(nm_config_file_register_mgs);
+
+/**
+ * Register a target's nodemap config file and add it to ncf_list_head.
+ *
+ * If no config has been loaded yet, the target's cache object is looked up
+ * with nodemap_cache_find_create() and loaded with nodemap_load_entries().
+ * Otherwise the in-memory config is written to a new cache object with
+ * nodemap_save_config_cache().
+ *
+ * \param env  execution environment
+ * \param dev  device holding the cache object
+ * \param los  local OID storage used to find or create the cache object
+ *
+ * \retval     the registered config file on success
+ * \retval     ERR_PTR(-ENOMEM) if the config file could not be allocated
+ * \retval     ERR_PTR(-errno) if finding, loading or saving the cache failed
+ */
+struct nm_config_file *nm_config_file_register_tgt(const struct lu_env *env,
+                                                  struct dt_device *dev,
+                                                  struct local_oid_storage *los)
+{
+       struct nm_config_file *ncf;
+       struct dt_object *config_obj = NULL;
+       int rc = 0;
+
+       ENTRY;
+
+       OBD_ALLOC_PTR(ncf);
+       if (ncf == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       /* don't load from cache if config already loaded */
        mutex_lock(&nodemap_config_loaded_lock);
-       if (ncf_type == NCFT_MGS || !nodemap_config_loaded)
-               rc = nodemap_load_entries(env, obj);
-       nodemap_config_loaded = true;
+       if (!nodemap_config_loaded) {
+               config_obj = nodemap_cache_find_create(env, dev, los, 0);
+               if (IS_ERR(config_obj))
+                       rc = PTR_ERR(config_obj);
+               else
+                       rc = nodemap_load_entries(env, config_obj);
+
+               if (!rc)
+                       nodemap_config_loaded = true;
+       }
        mutex_unlock(&nodemap_config_loaded_lock);
+       if (rc)
+               GOTO(out_ncf, rc);
 
-       if (rc < 0) {
-               if (ncf_type == NCFT_MGS) {
-                       nodemap_mgs_ncf = NULL;
-               } else {
-                       mutex_lock(&ncf_list_lock);
-                       list_del(&ncf->ncf_list);
-                       mutex_unlock(&ncf_list_lock);
-               }
+       /* sync on disk caches w/ loaded config in memory, ncf_obj may change */
+       if (!config_obj) {
+               config_obj = nodemap_save_config_cache(env, dev, los);
+               if (IS_ERR(config_obj))
+                       GOTO(out_ncf, rc = PTR_ERR(config_obj));
+       }
+
+       ncf->ncf_obj = config_obj;
+       ncf->ncf_los = los;
 
+       mutex_lock(&ncf_list_lock);
+       list_add(&ncf->ncf_list, &ncf_list_head);
+       mutex_unlock(&ncf_list_lock);
+
+out_ncf:
+       if (rc) {
+               /* a cache object that was found but failed to load is not
+                * referenced by anything once ncf is freed, so put it here
+                * rather than leak it
+                */
+               if (config_obj != NULL && !IS_ERR(config_obj))
+                       dt_object_put(env, config_obj);
                OBD_FREE_PTR(ncf);
                RETURN(ERR_PTR(rc));
        }
 
        RETURN(ncf);
 }
-EXPORT_SYMBOL(nm_config_file_register);
+EXPORT_SYMBOL(nm_config_file_register_tgt);
 
 /**
  * Deregister a nm_config_file. Should be called by targets during cleanup.
  *
+ * \param env  environment passed to dt_object_put()
  * \param ncf  config file to deregister
  */
-void nm_config_file_deregister(const struct lu_env *env,
-                              struct nm_config_file *ncf,
-                              enum nm_config_file_type ncf_type)
+void nm_config_file_deregister_mgs(const struct lu_env *env,
+                                  struct nm_config_file *ncf)
 {
        ENTRY;
+       /* ncf must be the config file currently stored in nodemap_mgs_ncf */
+       LASSERT(nodemap_mgs_ncf == ncf);
 
+       /* clear the file-scope MGS pointer before ncf is released */
+       nodemap_mgs_ncf = NULL;
        if (ncf->ncf_obj)
-               lu_object_put(env, &ncf->ncf_obj->do_lu);
+               dt_object_put(env, ncf->ncf_obj);
 
-       if (ncf_type == NCFT_TGT) {
-               mutex_lock(&ncf_list_lock);
-               list_del(&ncf->ncf_list);
-               mutex_unlock(&ncf_list_lock);
-       } else {
-               nodemap_mgs_ncf = NULL;
-       }
        OBD_FREE_PTR(ncf);
 
        EXIT;
 }
-EXPORT_SYMBOL(nm_config_file_deregister);
+EXPORT_SYMBOL(nm_config_file_deregister_mgs);
+
+/**
+ * Deregister a target nm_config_file: unlink it from ncf_list_head under
+ * ncf_list_lock, put its index object if it has one, and free it.
+ *
+ * \param env  environment passed to dt_object_put()
+ * \param ncf  config file to deregister; NULL is accepted and ignored
+ */
+void nm_config_file_deregister_tgt(const struct lu_env *env,
+                                  struct nm_config_file *ncf)
+{
+       ENTRY;
+
+       if (ncf == NULL) {
+               /* keep ENTRY/EXIT tracing balanced on the early return */
+               EXIT;
+               return;
+       }
+
+       mutex_lock(&ncf_list_lock);
+       list_del(&ncf->ncf_list);
+       mutex_unlock(&ncf_list_lock);
+
+       /* ncf_obj may be NULL if an earlier cache rewrite failed */
+       if (ncf->ncf_obj)
+               dt_object_put(env, ncf->ncf_obj);
+
+       OBD_FREE_PTR(ncf);
+
+       EXIT;
+}
+EXPORT_SYMBOL(nm_config_file_deregister_tgt);
 
 int nodemap_process_idx_pages(struct nodemap_config *config, union lu_page *lip,
                              struct lu_nodemap **recent_nodemap)
@@ -952,6 +1471,114 @@ int nodemap_process_idx_pages(struct nodemap_config *config, union lu_page *lip,
 }
 EXPORT_SYMBOL(nodemap_process_idx_pages);
 
+/**
+ * Fill one index page with nodemap key/record pairs.
+ *
+ * Handed to dt_index_walk() by nodemap_index_read(). Which entries are
+ * copied depends on the pass stored in ii->ii_attrs:
+ *  - NM_READ_CLUSTERS copies only NODEMAP_CLUSTER_IDX keys whose subtype is
+ *    NODEMAP_CLUSTER_REC;
+ *  - NM_READ_ATTRIBUTES copies every other key except NODEMAP_EMPTY_IDX.
+ * When the iterator is exhausted during the first pass, it is reloaded at
+ * hash 0 and ii->ii_attrs is switched to NM_READ_ATTRIBUTES.
+ *
+ * \param env   execution environment
+ * \param obj   index object (not referenced in this function)
+ * \param lp    page to fill
+ * \param bytes space available in \a lp, header included
+ * \param iops  iterator operations
+ * \param it    iterator to read entries from
+ * \param attr  passed through to iops->rec()
+ * \param arg   struct idx_info carrying key/record sizes and pass state
+ *
+ * \retval 0        stopped because the page is full (or fault injection)
+ * \retval >0       iterator exhausted; ii_hash_end is set to II_END_OFF
+ * \retval -EINVAL  \a bytes cannot hold the header, or not even one entry
+ * \retval <0       other error returned by the iterator operations
+ */
+static int nodemap_page_build(const struct lu_env *env, struct dt_object *obj,
+                             union lu_page *lp, size_t bytes,
+                             const struct dt_it_ops *iops,
+                             struct dt_it *it, __u32 attr, void *arg)
+{
+       struct idx_info *ii = (struct idx_info *)arg;
+       struct lu_idxpage *lip = &lp->lp_idx;
+       char *entry;
+       size_t size = ii->ii_keysize + ii->ii_recsize;
+       int rc;
+       ENTRY;
+
+       if (bytes < LIP_HDR_SIZE)
+               RETURN(-EINVAL);
+
+       /* initialize the header of the new container */
+       memset(lip, 0, LIP_HDR_SIZE);
+       lip->lip_magic = LIP_MAGIC;
+       bytes -= LIP_HDR_SIZE;
+
+       entry = lip->lip_entries;
+       do {
+               char *tmp_entry = entry;
+               struct dt_key *key;
+               __u64 hash;
+               enum nodemap_idx_type key_type;
+               int sub_type;
+
+               /* fetch 64-bit hash value */
+               hash = iops->store(env, it);
+               ii->ii_hash_end = hash;
+
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+                       if (lip->lip_nr != 0)
+                               GOTO(out, rc = 0);
+               }
+
+               /* no room for another entry: an empty page is an error,
+                * otherwise the page is simply complete
+                */
+               if (bytes < size) {
+                       if (lip->lip_nr == 0)
+                               GOTO(out, rc = -EINVAL);
+                       GOTO(out, rc = 0);
+               }
+
+               key = iops->key(env, it);
+               key_type = nodemap_get_key_type((struct nodemap_key *)key);
+               sub_type = nodemap_get_key_subtype((struct nodemap_key *)key);
+
+               /* on the first pass, get only the cluster types. On second
+                * pass, get all the rest */
+               if ((ii->ii_attrs == NM_READ_CLUSTERS &&
+                    key_type == NODEMAP_CLUSTER_IDX &&
+                    sub_type == NODEMAP_CLUSTER_REC) ||
+                   (ii->ii_attrs == NM_READ_ATTRIBUTES &&
+                    (key_type != NODEMAP_CLUSTER_IDX ||
+                     sub_type != NODEMAP_CLUSTER_REC) &&
+                    key_type != NODEMAP_EMPTY_IDX)) {
+                       memcpy(tmp_entry, key, ii->ii_keysize);
+                       tmp_entry += ii->ii_keysize;
+
+                       /* and finally the record */
+                       rc = iops->rec(env, it, (struct dt_rec *)tmp_entry,
+                                      attr);
+                       /* -ESTALE: the entry is not committed to the page;
+                        * entry and bytes stay as they were
+                        */
+                       if (rc != -ESTALE) {
+                               if (rc != 0)
+                                       GOTO(out, rc);
+
+                               /* hash/key/record successfully copied! */
+                               lip->lip_nr++;
+                               if (unlikely(lip->lip_nr == 1 &&
+                                   ii->ii_count == 0))
+                                       ii->ii_hash_start = hash;
+
+                               entry = tmp_entry + ii->ii_recsize;
+                               bytes -= size;
+                       }
+               }
+
+               /* move on to the next record */
+               do {
+                       rc = iops->next(env, it);
+               } while (rc == -ESTALE);
+
+               /* move to second pass */
+               if (rc > 0 && ii->ii_attrs == NM_READ_CLUSTERS) {
+                       ii->ii_attrs = NM_READ_ATTRIBUTES;
+                       rc = iops->load(env, it, 0);
+                       if (rc == 0)
+                               rc = iops->next(env, it);
+                       else if (rc > 0)
+                               rc = 0;
+                       else
+                               GOTO(out, rc);
+               }
+
+       } while (rc == 0);
+
+       GOTO(out, rc);
+out:
+       if (rc >= 0 && lip->lip_nr > 0)
+               /* one more container */
+               ii->ii_count++;
+       if (rc > 0)
+               /* no more entries */
+               ii->ii_hash_end = II_END_OFF;
+       /* RETURN pairs with ENTRY so the exit code shows up in trace logs */
+       RETURN(rc);
+}
+
+
 int nodemap_index_read(struct lu_env *env,
                       struct nm_config_file *ncf,
                       struct idx_info *ii,
@@ -967,13 +1594,13 @@ int nodemap_index_read(struct lu_env *env,
        dt_read_lock(env, nodemap_idx, 0);
        version = dt_version_get(env, nodemap_idx);
        if (rdpg->rp_hash != 0 && ii->ii_version != version) {
-               CDEBUG(D_INFO, "nodemap config changed while sending, "
-                              "old "LPU64", new "LPU64"\n",
+               CDEBUG(D_INFO, "nodemap config changed inflight, old %llu, new %llu\n",
                       ii->ii_version,
                       version);
                ii->ii_hash_end = 0;
        } else {
-               rc = dt_index_walk(env, nodemap_idx, rdpg, NULL, ii);
+               rc = dt_index_walk(env, nodemap_idx, rdpg, nodemap_page_build,
+                                  ii);
                CDEBUG(D_INFO, "walked index, hashend %llx\n", ii->ii_hash_end);
        }
 
@@ -999,12 +1626,12 @@ EXPORT_SYMBOL(nodemap_index_read);
 int nodemap_get_config_req(struct obd_device *mgs_obd,
                           struct ptlrpc_request *req)
 {
+       const struct ptlrpc_bulk_frag_ops *frag_ops = &ptlrpc_bulk_kiov_pin_ops;
        struct mgs_config_body *body;
        struct mgs_config_res *res;
        struct lu_rdpg rdpg;
        struct idx_info nodemap_ii;
        struct ptlrpc_bulk_desc *desc;
-       struct l_wait_info lwi;
        struct tg_export_data *rqexp_ted = &req->rq_export->exp_target_data;
        int i;
        int page_count;
@@ -1015,12 +1642,12 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
        if (!body)
                RETURN(-EINVAL);
 
-       if (body->mcb_type != CONFIG_T_NODEMAP)
+       if (body->mcb_type != MGS_CFG_T_NODEMAP)
                RETURN(-EINVAL);
 
        rdpg.rp_count = (body->mcb_units << body->mcb_bits);
-       rdpg.rp_npages = (rdpg.rp_count + PAGE_CACHE_SIZE - 1) >>
-               PAGE_CACHE_SHIFT;
+       rdpg.rp_npages = (rdpg.rp_count + PAGE_SIZE - 1) >>
+               PAGE_SHIFT;
        if (rdpg.rp_npages > PTLRPC_MAX_BRW_PAGES)
                RETURN(-EINVAL);
 
@@ -1028,11 +1655,11 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
               body->mcb_name, rdpg.rp_count);
 
        /* allocate pages to store the containers */
-       OBD_ALLOC(rdpg.rp_pages, sizeof(*rdpg.rp_pages) * rdpg.rp_npages);
+       OBD_ALLOC_PTR_ARRAY(rdpg.rp_pages, rdpg.rp_npages);
        if (rdpg.rp_pages == NULL)
                RETURN(-ENOMEM);
        for (i = 0; i < rdpg.rp_npages; i++) {
-               rdpg.rp_pages[i] = alloc_page(GFP_IOFS);
+               rdpg.rp_pages[i] = alloc_page(GFP_NOFS);
                if (rdpg.rp_pages[i] == NULL)
                        GOTO(out, rc = -ENOMEM);
        }
@@ -1041,9 +1668,10 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
        nodemap_ii.ii_magic = IDX_INFO_MAGIC;
        nodemap_ii.ii_flags = II_FL_NOHASH;
        nodemap_ii.ii_version = rqexp_ted->ted_nodemap_version;
+       nodemap_ii.ii_attrs = body->mcb_nm_cur_pass;
 
        bytes = nodemap_index_read(req->rq_svc_thread->t_env,
-                                  mgs_obd->u.obt.obt_nodemap_config_file,
+                                  obd2obt(mgs_obd)->obt_nodemap_config_file,
                                   &nodemap_ii, &rdpg);
        if (bytes < 0)
                GOTO(out, rc = bytes);
@@ -1054,25 +1682,23 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
        if (res == NULL)
                GOTO(out, rc = -EINVAL);
        res->mcr_offset = nodemap_ii.ii_hash_end;
-       res->mcr_size = bytes;
+       res->mcr_nm_cur_pass = nodemap_ii.ii_attrs;
 
-       page_count = (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       page_count = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
        LASSERT(page_count <= rdpg.rp_count);
        desc = ptlrpc_prep_bulk_exp(req, page_count, 1,
-                                   PTLRPC_BULK_PUT_SOURCE |
-                                       PTLRPC_BULK_BUF_KIOV,
-                                   MGS_BULK_PORTAL,
-                                   &ptlrpc_bulk_kiov_pin_ops);
+                                   PTLRPC_BULK_PUT_SOURCE,
+                                   MGS_BULK_PORTAL, frag_ops);
        if (desc == NULL)
                GOTO(out, rc = -ENOMEM);
 
        for (i = 0; i < page_count && bytes > 0; i++) {
-               ptlrpc_prep_bulk_page_pin(desc, rdpg.rp_pages[i], 0,
-                                         min_t(int, bytes, PAGE_CACHE_SIZE));
-               bytes -= PAGE_CACHE_SIZE;
+               frag_ops->add_kiov_frag(desc, rdpg.rp_pages[i], 0,
+                                       min_t(int, bytes, PAGE_SIZE));
+               bytes -= PAGE_SIZE;
        }
 
-       rc = target_bulk_io(req->rq_export, desc, &lwi);
+       rc = target_bulk_io(req->rq_export, desc);
        ptlrpc_free_bulk(desc);
 
 out:
@@ -1080,8 +1706,7 @@ out:
                for (i = 0; i < rdpg.rp_npages; i++)
                        if (rdpg.rp_pages[i] != NULL)
                                __free_page(rdpg.rp_pages[i]);
-               OBD_FREE(rdpg.rp_pages,
-                        rdpg.rp_npages * sizeof(rdpg.rp_pages[0]));
+               OBD_FREE_PTR_ARRAY(rdpg.rp_pages, rdpg.rp_npages);
        }
        return rc;
 }