From d3ca7a90b4a22908e212ef008fa78932541ef017 Mon Sep 17 00:00:00 2001 From: Kit Westneat Date: Thu, 13 Aug 2015 11:03:21 -0400 Subject: [PATCH] LU-5092 nodemap: handle config changes while mid-flight This adds the ability to detect when the nodemap configuration has changed between messages for very large configurations. If the configuration is over 1MB (about 25k entries), it needs to be sent in multiple messages. If the configuration changes in-between these messages, then the client needs to abort the previous get config operation and restart. Test-Parameters: envdefinitions=SLOW=yes testlist=sanity-sec Signed-off-by: Kit Westneat Change-Id: I2bf6e4bade947fea8331c2922d9c33ab49100577 Reviewed-on: http://review.whamcloud.com/16941 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: John L. Hammond Reviewed-by: Oleg Drokin --- lustre/include/lustre_export.h | 3 +++ lustre/mgc/mgc_request.c | 44 +++++++++++++++++++++++++++-------------- lustre/ptlrpc/nodemap_storage.c | 21 ++++++++++++++++++-- lustre/tests/sanity-sec.sh | 18 ++++++----------- lustre/utils/lustre_cfg.c | 2 +- 5 files changed, 58 insertions(+), 30 deletions(-) diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 31a1e81..8e94a66 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -84,6 +84,9 @@ struct tg_export_data { struct lu_nodemap *ted_nodemap; struct list_head ted_nodemap_member; + /** last version of nodemap config sent to client */ + __u64 ted_nodemap_version; + /* Every reply data fields below are * protected by ted_lcd_lock */ /** List of reply data */ diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 6415307a..7b4db0c 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -1577,19 +1577,8 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, struct config_llog_instance *cfg = &cld->cld_cfg; struct mgs_config_body *body; struct mgs_config_res *res; - - /* When a nodemap config is received, we build a new nodemap config, - * with new nodemap structs. We keep track of the most recently added - * nodemap since the config is read ordered by nodemap_id, and so it - * is likely that the next record will be related. Because access to - * the nodemaps is single threaded until the nodemap_config is active, - * we don't need to reference count with recent_nodemap, though - * recent_nodemap should be set to NULL when the nodemap_config - * is either destroyed or set active. - */ struct nodemap_config *new_config = NULL; struct lu_nodemap *recent_nodemap = NULL; - struct ptlrpc_bulk_desc *desc; struct page **pages; __u64 config_read_offset = 0; @@ -1622,8 +1611,9 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, GOTO(out, rc = -ENOMEM); } +again: #ifdef HAVE_SERVER_SUPPORT - if (cld_is_nodemap(cld)) { + if (cld_is_nodemap(cld) && config_read_offset == 0) { new_config = nodemap_config_alloc(); if (IS_ERR(new_config)) { rc = PTR_ERR(new_config); @@ -1632,7 +1622,6 @@ static int mgc_process_recover_nodemap_log(struct obd_device *obd, } } #endif -again: LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld)); LASSERT(mutex_is_locked(&cld->cld_lock)); req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), @@ -1705,6 +1694,20 @@ again: GOTO(out, rc = -EINVAL); if (ealen == 0) { /* no logs transferred */ +#ifdef HAVE_SERVER_SUPPORT + /* config changed since first read RPC */ + if (cld_is_nodemap(cld) && config_read_offset == 0) { + recent_nodemap = NULL; + nodemap_config_dealloc(new_config); + new_config = NULL; + + CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n"); + + /* setting eof to false, we request config again */ + eof = false; + GOTO(out, rc = 0); + } +#endif if (!eof) rc = -EINVAL; GOTO(out, rc); @@ -1718,6 +1721,15 @@ again: mne_swab = !mne_swab; #endif + /* When a nodemap config is received, we build a new nodemap config, + * with new nodemap structs. We keep track of the most recently added + * nodemap since the config is read ordered by nodemap_id, and so it + * is likely that the next record will be related. Because access to + * the nodemaps is single threaded until the nodemap_config is active, + * we don't need to reference count with recent_nodemap, though + * recent_nodemap should be set to NULL when the nodemap_config + * is either destroyed or set active. + */ for (i = 0; i < nrpages && ealen > 0; i++) { int rc2; union lu_page *ptr; @@ -1746,15 +1758,17 @@ again: } out: - if (req) + if (req) { ptlrpc_req_finished(req); + req = NULL; + } if (rc == 0 && !eof) goto again; #ifdef HAVE_SERVER_SUPPORT if (new_config != NULL) { - recent_nodemap = NULL; + /* recent_nodemap cannot be used after set_active/dealloc */ if (rc == 0) nodemap_config_set_active(new_config); else diff --git a/lustre/ptlrpc/nodemap_storage.c b/lustre/ptlrpc/nodemap_storage.c index b61e97d..ef81a47 100644 --- a/lustre/ptlrpc/nodemap_storage.c +++ b/lustre/ptlrpc/nodemap_storage.c @@ -839,14 +839,27 @@ int nodemap_index_read(struct lu_env *env, const struct lu_rdpg *rdpg) { struct dt_object *nodemap_idx = ncf->ncf_obj; + __u64 version; int rc = 0; ii->ii_keysize = dt_nodemap_features.dif_keysize_max; ii->ii_recsize = dt_nodemap_features.dif_recsize_max; dt_read_lock(env, nodemap_idx, 0); - rc = dt_index_walk(env, nodemap_idx, rdpg, NULL, ii); - CDEBUG(D_INFO, "walked index, hashend %llx\n", ii->ii_hash_end); + version = dt_version_get(env, nodemap_idx); + if (rdpg->rp_hash != 0 && ii->ii_version != version) { + CDEBUG(D_INFO, "nodemap config changed while sending, " + "old "LPU64", new "LPU64"\n", + ii->ii_version, + version); + ii->ii_hash_end = 0; + } else { + rc = dt_index_walk(env, nodemap_idx, rdpg, NULL, ii); + CDEBUG(D_INFO, "walked index, hashend %llx\n", ii->ii_hash_end); + } + + if (rc >= 0) + ii->ii_version = version; dt_read_unlock(env, nodemap_idx); return rc; @@ -873,6 +886,7 @@ int nodemap_get_config_req(struct obd_device *mgs_obd, struct idx_info nodemap_ii; struct ptlrpc_bulk_desc *desc; struct l_wait_info lwi; + struct tg_export_data *rqexp_ted = &req->rq_export->exp_target_data; int i; int page_count; int bytes = 0; @@ -907,6 +921,7 @@ int nodemap_get_config_req(struct obd_device *mgs_obd, rdpg.rp_hash = body->mcb_offset; nodemap_ii.ii_magic = IDX_INFO_MAGIC; nodemap_ii.ii_flags = II_FL_NOHASH; + nodemap_ii.ii_version = rqexp_ted->ted_nodemap_version; bytes = nodemap_index_read(req->rq_svc_thread->t_env, mgs_obd->u.obt.obt_nodemap_config_file, @@ -914,6 +929,8 @@ int nodemap_get_config_req(struct obd_device *mgs_obd, if (bytes < 0) GOTO(out, rc = bytes); + rqexp_ted->ted_nodemap_version = nodemap_ii.ii_version; + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); if (res == NULL) GOTO(out, rc = -EINVAL); diff --git a/lustre/tests/sanity-sec.sh b/lustre/tests/sanity-sec.sh index c954d7e..39a167e 100755 --- a/lustre/tests/sanity-sec.sh +++ b/lustre/tests/sanity-sec.sh @@ -990,7 +990,7 @@ wait_nm_sync() { if ! $is_sync; then echo MGS echo $out1 - echo OTHER + echo OTHER - IP: $node_ip echo $out2 error "mgs and $nodemap_name ${key} mismatch, $i attempts" fi @@ -1608,19 +1608,13 @@ run_test 25 "test save and reload nodemap config" test_26() { nodemap_version_check || return 0 - local large_i=13000 + local large_i=32000 - for ((i = 0; i < large_i; i++)); do - ((i % 1000 == 0)) && echo $i - do_facet mgs $LCTL nodemap_add c$i || - error "cannot add nodemap $i to config" - done + do_facet mgs "seq -f 'c%g' $large_i | xargs -n1 $LCTL nodemap_add" + wait_nm_sync c$large_i admin_nodemap - for ((i = 0; i < large_i; i++)); do - ((i % 1000 == 0)) && echo $i - do_facet mgs $LCTL nodemap_del c$i || - error "cannot delete nodemap $i from config" - done + do_facet mgs "seq -f 'c%g' $large_i | xargs -n1 $LCTL nodemap_del" + wait_nm_sync c$large_i admin_nodemap } run_test 26 "test transferring very large nodemap" diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 92edd33..7e859c1 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -1363,7 +1363,7 @@ int jt_nodemap_info(int argc, char **argv) snprintf(pattern, sizeof(pattern), "nodemap/%s/*", argv[1]); rc = param_display(&popt, pattern, NULL, LIST_PARAM); if (rc == -ESRCH) - fprintf(stderr, "error: nodemap_info: cannot find" + fprintf(stderr, "error: nodemap_info: cannot find " "nodemap %s\n", argv[1]); } return rc; -- 1.8.3.1