Whamcloud - gitweb
LU-5092 nodemap: handle config changes while mid-flight 41/16941/13
author Kit Westneat <kit.westneat@gmail.com>
Thu, 13 Aug 2015 15:03:21 +0000 (11:03 -0400)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 14 Jun 2016 03:50:12 +0000 (03:50 +0000)
This adds the ability to detect when the nodemap configuration has
changed between messages for very large configurations. If the
configuration is over 1MB (about 25k entries), it needs to be sent
in multiple messages. If the configuration changes between these
messages, then the client needs to abort the previous get-config
operation and restart.

Test-Parameters: envdefinitions=SLOW=yes testlist=sanity-sec
Signed-off-by: Kit Westneat <kit.westneat@gmail.com>
Change-Id: I2bf6e4bade947fea8331c2922d9c33ab49100577
Reviewed-on: http://review.whamcloud.com/16941
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/lustre_export.h
lustre/mgc/mgc_request.c
lustre/ptlrpc/nodemap_storage.c
lustre/tests/sanity-sec.sh
lustre/utils/lustre_cfg.c

index 31a1e81..8e94a66 100644 (file)
@@ -84,6 +84,9 @@ struct tg_export_data {
        struct lu_nodemap       *ted_nodemap;
        struct list_head        ted_nodemap_member;
 
+       /** last version of nodemap config sent to client */
+       __u64                   ted_nodemap_version;
+
        /* Every reply data fields below are
         * protected by ted_lcd_lock */
        /** List of reply data */
index 6415307..7b4db0c 100644 (file)
@@ -1577,19 +1577,8 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd,
        struct config_llog_instance *cfg = &cld->cld_cfg;
        struct mgs_config_body *body;
        struct mgs_config_res *res;
-
-       /* When a nodemap config is received, we build a new nodemap config,
-        * with new nodemap structs. We keep track of the most recently added
-        * nodemap since the config is read ordered by nodemap_id, and so it
-        * is likely that the next record will be related. Because access to
-        * the nodemaps is single threaded until the nodemap_config is active,
-        * we don't need to reference count with recent_nodemap, though
-        * recent_nodemap should be set to NULL when the nodemap_config
-        * is either destroyed or set active.
-        */
        struct nodemap_config *new_config = NULL;
        struct lu_nodemap *recent_nodemap = NULL;
-
        struct ptlrpc_bulk_desc *desc;
        struct page **pages;
        __u64 config_read_offset = 0;
@@ -1622,8 +1611,9 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd,
                         GOTO(out, rc = -ENOMEM);
         }
 
+again:
 #ifdef HAVE_SERVER_SUPPORT
-       if (cld_is_nodemap(cld)) {
+       if (cld_is_nodemap(cld) && config_read_offset == 0) {
                new_config = nodemap_config_alloc();
                if (IS_ERR(new_config)) {
                        rc = PTR_ERR(new_config);
@@ -1632,7 +1622,6 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd,
                }
        }
 #endif
-again:
        LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld));
        LASSERT(mutex_is_locked(&cld->cld_lock));
        req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
@@ -1705,6 +1694,20 @@ again:
                GOTO(out, rc = -EINVAL);
 
        if (ealen == 0) { /* no logs transferred */
+#ifdef HAVE_SERVER_SUPPORT
+               /* config changed since first read RPC */
+               if (cld_is_nodemap(cld) && config_read_offset == 0) {
+                       recent_nodemap = NULL;
+                       nodemap_config_dealloc(new_config);
+                       new_config = NULL;
+
+                       CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n");
+
+                       /* setting eof to false, we request config again */
+                       eof = false;
+                       GOTO(out, rc = 0);
+               }
+#endif
                if (!eof)
                        rc = -EINVAL;
                GOTO(out, rc);
@@ -1718,6 +1721,15 @@ again:
                mne_swab = !mne_swab;
 #endif
 
+       /* When a nodemap config is received, we build a new nodemap config,
+        * with new nodemap structs. We keep track of the most recently added
+        * nodemap since the config is read ordered by nodemap_id, and so it
+        * is likely that the next record will be related. Because access to
+        * the nodemaps is single threaded until the nodemap_config is active,
+        * we don't need to reference count with recent_nodemap, though
+        * recent_nodemap should be set to NULL when the nodemap_config
+        * is either destroyed or set active.
+        */
        for (i = 0; i < nrpages && ealen > 0; i++) {
                int rc2;
                union lu_page   *ptr;
@@ -1746,15 +1758,17 @@ again:
        }
 
 out:
-       if (req)
+       if (req) {
                ptlrpc_req_finished(req);
+               req = NULL;
+       }
 
        if (rc == 0 && !eof)
                goto again;
 
 #ifdef HAVE_SERVER_SUPPORT
        if (new_config != NULL) {
-               recent_nodemap = NULL;
+               /* recent_nodemap cannot be used after set_active/dealloc */
                if (rc == 0)
                        nodemap_config_set_active(new_config);
                else
index b61e97d..ef81a47 100644 (file)
@@ -839,14 +839,27 @@ int nodemap_index_read(struct lu_env *env,
                       const struct lu_rdpg *rdpg)
 {
        struct dt_object        *nodemap_idx = ncf->ncf_obj;
+       __u64                    version;
        int                      rc = 0;
 
        ii->ii_keysize = dt_nodemap_features.dif_keysize_max;
        ii->ii_recsize = dt_nodemap_features.dif_recsize_max;
 
        dt_read_lock(env, nodemap_idx, 0);
-       rc = dt_index_walk(env, nodemap_idx, rdpg, NULL, ii);
-       CDEBUG(D_INFO, "walked index, hashend %llx\n", ii->ii_hash_end);
+       version = dt_version_get(env, nodemap_idx);
+       if (rdpg->rp_hash != 0 && ii->ii_version != version) {
+               CDEBUG(D_INFO, "nodemap config changed while sending, "
+                              "old "LPU64", new "LPU64"\n",
+                      ii->ii_version,
+                      version);
+               ii->ii_hash_end = 0;
+       } else {
+               rc = dt_index_walk(env, nodemap_idx, rdpg, NULL, ii);
+               CDEBUG(D_INFO, "walked index, hashend %llx\n", ii->ii_hash_end);
+       }
+
+       if (rc >= 0)
+               ii->ii_version = version;
 
        dt_read_unlock(env, nodemap_idx);
        return rc;
@@ -873,6 +886,7 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
        struct idx_info nodemap_ii;
        struct ptlrpc_bulk_desc *desc;
        struct l_wait_info lwi;
+       struct tg_export_data *rqexp_ted = &req->rq_export->exp_target_data;
        int i;
        int page_count;
        int bytes = 0;
@@ -907,6 +921,7 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
        rdpg.rp_hash = body->mcb_offset;
        nodemap_ii.ii_magic = IDX_INFO_MAGIC;
        nodemap_ii.ii_flags = II_FL_NOHASH;
+       nodemap_ii.ii_version = rqexp_ted->ted_nodemap_version;
 
        bytes = nodemap_index_read(req->rq_svc_thread->t_env,
                                   mgs_obd->u.obt.obt_nodemap_config_file,
@@ -914,6 +929,8 @@ int nodemap_get_config_req(struct obd_device *mgs_obd,
        if (bytes < 0)
                GOTO(out, rc = bytes);
 
+       rqexp_ted->ted_nodemap_version = nodemap_ii.ii_version;
+
        res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
        if (res == NULL)
                GOTO(out, rc = -EINVAL);
index c954d7e..39a167e 100755 (executable)
@@ -990,7 +990,7 @@ wait_nm_sync() {
        if ! $is_sync; then
                echo MGS
                echo $out1
-               echo OTHER
+               echo OTHER - IP: $node_ip
                echo $out2
                error "mgs and $nodemap_name ${key} mismatch, $i attempts"
        fi
@@ -1608,19 +1608,13 @@ run_test 25 "test save and reload nodemap config"
 test_26() {
        nodemap_version_check || return 0
 
-       local large_i=13000
+       local large_i=32000
 
-       for ((i = 0; i < large_i; i++)); do
-               ((i % 1000 == 0)) && echo $i
-               do_facet mgs $LCTL nodemap_add c$i ||
-                       error "cannot add nodemap $i to config"
-       done
+       do_facet mgs "seq -f 'c%g' $large_i | xargs -n1 $LCTL nodemap_add"
+       wait_nm_sync c$large_i admin_nodemap
 
-       for ((i = 0; i < large_i; i++)); do
-               ((i % 1000 == 0)) && echo $i
-               do_facet mgs $LCTL nodemap_del c$i ||
-                       error "cannot delete nodemap $i from config"
-       done
+       do_facet mgs "seq -f 'c%g' $large_i | xargs -n1 $LCTL nodemap_del"
+       wait_nm_sync c$large_i admin_nodemap
 }
 run_test 26 "test transferring very large nodemap"
 
index 92edd33..7e859c1 100644 (file)
@@ -1363,7 +1363,7 @@ int jt_nodemap_info(int argc, char **argv)
                snprintf(pattern, sizeof(pattern), "nodemap/%s/*", argv[1]);
                rc = param_display(&popt, pattern, NULL, LIST_PARAM);
                if (rc == -ESRCH)
-                       fprintf(stderr, "error: nodemap_info: cannot find"
+                       fprintf(stderr, "error: nodemap_info: cannot find "
                                        "nodemap %s\n", argv[1]);
        }
        return rc;