* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2011, 2014, Intel Corporation.
+ * Copyright (c) 2011, 2015, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#define D_MGC D_CONFIG /*|D_WARNING*/
#include <linux/module.h>
-#include <obd_class.h>
-#include <lustre_dlm.h>
+#include <linux/kthread.h>
+
+#include <dt_object.h>
#include <lprocfs_status.h>
-#include <lustre_log.h>
+#include <lustre_dlm.h>
#include <lustre_disk.h>
-#include <dt_object.h>
+#include <lustre_log.h>
+#include <lustre_nodemap.h>
+#include <lustre_swab.h>
+#include <obd_class.h>
#include "mgc_internal.h"
break;
case CONFIG_T_RECOVER:
case CONFIG_T_PARAMS:
- resname = type;
- break;
+ case CONFIG_T_NODEMAP:
+ resname = type;
+ break;
default:
LBUG();
}
list_del(&cld->cld_list_chain);
spin_unlock(&config_list_lock);
- CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+ CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
- if (cld->cld_recover)
- config_log_put(cld->cld_recover);
- if (cld->cld_sptlrpc)
- config_log_put(cld->cld_sptlrpc);
+ if (cld->cld_recover)
+ config_log_put(cld->cld_recover);
+ if (cld->cld_sptlrpc)
+ config_log_put(cld->cld_sptlrpc);
if (cld->cld_params)
config_log_put(cld->cld_params);
- if (cld_is_sptlrpc(cld))
- sptlrpc_conf_log_stop(cld->cld_logname);
+ if (cld->cld_nodemap)
+ config_log_put(cld->cld_nodemap);
+ if (cld_is_sptlrpc(cld))
+ sptlrpc_conf_log_stop(cld->cld_logname);
- class_export_put(cld->cld_mgcexp);
- OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
- }
+ class_export_put(cld->cld_mgcexp);
+ OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
+ }
- EXIT;
+ EXIT;
}
/* Find a config log by name */
static
struct config_llog_data *do_config_log_add(struct obd_device *obd,
- char *logname,
- int type,
- struct config_llog_instance *cfg,
- struct super_block *sb)
+ char *logname,
+ int type,
+ struct config_llog_instance *cfg,
+ struct super_block *sb)
{
- struct config_llog_data *cld;
- int rc;
- ENTRY;
+ struct config_llog_data *cld;
+ int rc;
- CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
- cfg ? cfg->cfg_instance : 0);
+ ENTRY;
- OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
- if (!cld)
- RETURN(ERR_PTR(-ENOMEM));
+ CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
+ cfg ? cfg->cfg_instance : NULL);
+
+ OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
+ if (!cld)
+ RETURN(ERR_PTR(-ENOMEM));
strcpy(cld->cld_logname, logname);
if (cfg)
cld->cld_type = type;
atomic_set(&cld->cld_refcount, 1);
- /* Keep the mgc around until we are done */
- cld->cld_mgcexp = class_export_get(obd->obd_self_export);
+ /* Keep the mgc around until we are done */
+ cld->cld_mgcexp = class_export_get(obd->obd_self_export);
- if (cld_is_sptlrpc(cld)) {
- sptlrpc_conf_log_start(logname);
- cld->cld_cfg.cfg_obdname = obd->obd_name;
- }
+ if (cld_is_sptlrpc(cld)) {
+ sptlrpc_conf_log_start(logname);
+ cld->cld_cfg.cfg_obdname = obd->obd_name;
+ }
- rc = mgc_logname2resid(logname, &cld->cld_resid, type);
+ rc = mgc_logname2resid(logname, &cld->cld_resid, type);
spin_lock(&config_list_lock);
list_add(&cld->cld_list_chain, &config_llog_list);
spin_unlock(&config_list_lock);
- if (rc) {
- config_log_put(cld);
- RETURN(ERR_PTR(rc));
- }
+ if (rc) {
+ config_log_put(cld);
+ RETURN(ERR_PTR(rc));
+ }
- if (cld_is_sptlrpc(cld)) {
- rc = mgc_process_log(obd, cld);
+ if (cld_is_sptlrpc(cld) || cld_is_nodemap(cld)) {
+ rc = mgc_process_log(obd, cld);
if (rc && rc != -ENOENT)
- CERROR("failed processing sptlrpc log: %d\n", rc);
- }
+ CERROR("%s: failed processing log, type %d: rc = %d\n",
+ obd->obd_name, type, rc);
+ }
- RETURN(cld);
+ RETURN(cld);
}
static struct config_llog_data *config_recover_log_add(struct obd_device *obd,
* Each instance may be at a different point in the log.
*/
static int config_log_add(struct obd_device *obd, char *logname,
- struct config_llog_instance *cfg,
- struct super_block *sb)
+ struct config_llog_instance *cfg,
+ struct super_block *sb)
{
struct lustre_sb_info *lsi = s2lsi(sb);
struct config_llog_data *cld;
struct config_llog_data *sptlrpc_cld;
struct config_llog_data *params_cld;
+ struct config_llog_data *nodemap_cld;
char seclogname[32];
char *ptr;
int rc;
ENTRY;
- CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
+ CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
- /*
- * for each regular log, the depended sptlrpc log name is
- * <fsname>-sptlrpc. multiple regular logs may share one sptlrpc log.
- */
- ptr = strrchr(logname, '-');
- if (ptr == NULL || ptr - logname > 8) {
- CERROR("logname %s is too long\n", logname);
- RETURN(-EINVAL);
- }
+ /*
+ * for each regular log, the depended sptlrpc log name is
+ * <fsname>-sptlrpc. multiple regular logs may share one sptlrpc log.
+ */
+ ptr = strrchr(logname, '-');
+ if (ptr == NULL || ptr - logname > 8) {
+ CERROR("logname %s is too long\n", logname);
+ RETURN(-EINVAL);
+ }
- memcpy(seclogname, logname, ptr - logname);
- strcpy(seclogname + (ptr - logname), "-sptlrpc");
+ memcpy(seclogname, logname, ptr - logname);
+ strcpy(seclogname + (ptr - logname), "-sptlrpc");
+
+ sptlrpc_cld = config_log_find(seclogname, NULL);
+ if (sptlrpc_cld == NULL) {
+ sptlrpc_cld = do_config_log_add(obd, seclogname,
+ CONFIG_T_SPTLRPC, NULL, NULL);
+ if (IS_ERR(sptlrpc_cld)) {
+ CERROR("can't create sptlrpc log: %s\n", seclogname);
+ GOTO(out, rc = PTR_ERR(sptlrpc_cld));
+ }
+ }
+
+ nodemap_cld = config_log_find(LUSTRE_NODEMAP_NAME, NULL);
+ if (!nodemap_cld && IS_SERVER(lsi) && !IS_MGS(lsi)) {
+ nodemap_cld = do_config_log_add(obd, LUSTRE_NODEMAP_NAME,
+ CONFIG_T_NODEMAP, NULL, NULL);
+ if (IS_ERR(nodemap_cld)) {
+ rc = PTR_ERR(nodemap_cld);
+ CERROR("%s: cannot create nodemap log: rc = %d\n",
+ obd->obd_name, rc);
+ GOTO(out_sptlrpc, rc);
+ }
+ }
- sptlrpc_cld = config_log_find(seclogname, NULL);
- if (sptlrpc_cld == NULL) {
- sptlrpc_cld = do_config_log_add(obd, seclogname,
- CONFIG_T_SPTLRPC, NULL, NULL);
- if (IS_ERR(sptlrpc_cld)) {
- CERROR("can't create sptlrpc log: %s\n", seclogname);
- GOTO(out_err, rc = PTR_ERR(sptlrpc_cld));
- }
- }
params_cld = config_params_log_add(obd, cfg, sb);
if (IS_ERR(params_cld)) {
rc = PTR_ERR(params_cld);
CERROR("%s: can't create params log: rc = %d\n",
obd->obd_name, rc);
- GOTO(out_err1, rc);
+ GOTO(out_nodemap, rc);
}
cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb);
if (IS_ERR(cld)) {
CERROR("can't create log: %s\n", logname);
- GOTO(out_err2, rc = PTR_ERR(cld));
+ GOTO(out_params, rc = PTR_ERR(cld));
}
- cld->cld_sptlrpc = sptlrpc_cld;
- cld->cld_params = params_cld;
-
- LASSERT(lsi->lsi_lmd);
- if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
- struct config_llog_data *recover_cld;
+ LASSERT(lsi->lsi_lmd);
+ if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
+ struct config_llog_data *recover_cld;
ptr = strrchr(seclogname, '-');
if (ptr != NULL) {
*ptr = 0;
config_log_put(cld);
RETURN(-EINVAL);
}
- recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
+ recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
if (IS_ERR(recover_cld))
- GOTO(out_err3, rc = PTR_ERR(recover_cld));
+ GOTO(out_cld, rc = PTR_ERR(recover_cld));
cld->cld_recover = recover_cld;
}
+ cld->cld_sptlrpc = sptlrpc_cld;
+ cld->cld_params = params_cld;
+ cld->cld_nodemap = nodemap_cld;
+
RETURN(0);
-out_err3:
+out_cld:
config_log_put(cld);
-out_err2:
+out_params:
config_log_put(params_cld);
-out_err1:
+out_nodemap:
+ config_log_put(nodemap_cld);
+
+out_sptlrpc:
config_log_put(sptlrpc_cld);
-out_err:
- RETURN(rc);
+out:
+ return rc;
}
DEFINE_MUTEX(llog_process_lock);
*/
static int config_log_end(char *logname, struct config_llog_instance *cfg)
{
- struct config_llog_data *cld;
- struct config_llog_data *cld_sptlrpc = NULL;
+ struct config_llog_data *cld;
+ struct config_llog_data *cld_sptlrpc = NULL;
struct config_llog_data *cld_params = NULL;
- struct config_llog_data *cld_recover = NULL;
- int rc = 0;
- ENTRY;
+ struct config_llog_data *cld_recover = NULL;
+ struct config_llog_data *cld_nodemap = NULL;
+ int rc = 0;
- cld = config_log_find(logname, cfg);
- if (cld == NULL)
- RETURN(-ENOENT);
+ ENTRY;
+
+ cld = config_log_find(logname, cfg);
+ if (cld == NULL)
+ RETURN(-ENOENT);
mutex_lock(&cld->cld_lock);
- /*
- * if cld_stopping is set, it means we didn't start the log thus
- * not owning the start ref. this can happen after previous umount:
- * the cld still hanging there waiting for lock cancel, and we
- * remount again but failed in the middle and call log_end without
- * calling start_log.
- */
- if (unlikely(cld->cld_stopping)) {
+ /*
+ * if cld_stopping is set, it means we didn't start the log thus
+ * not owning the start ref. this can happen after previous umount:
+ * the cld is still hanging there waiting for lock cancel, and we
+ * remount again but fail in the middle and call log_end without
+ * calling start_log.
+ */
+ if (unlikely(cld->cld_stopping)) {
mutex_unlock(&cld->cld_lock);
- /* drop the ref from the find */
- config_log_put(cld);
- RETURN(rc);
- }
+ /* drop the ref from the find */
+ config_log_put(cld);
+ RETURN(rc);
+ }
- cld->cld_stopping = 1;
+ cld->cld_stopping = 1;
- cld_recover = cld->cld_recover;
- cld->cld_recover = NULL;
+ cld_recover = cld->cld_recover;
+ cld->cld_recover = NULL;
mutex_unlock(&cld->cld_lock);
if (cld_recover) {
cld->cld_sptlrpc = NULL;
cld_params = cld->cld_params;
cld->cld_params = NULL;
+ cld_nodemap = cld->cld_nodemap;
+ cld->cld_nodemap = NULL;
spin_unlock(&config_list_lock);
- if (cld_sptlrpc)
- config_log_put(cld_sptlrpc);
+ if (cld_sptlrpc)
+ config_log_put(cld_sptlrpc);
if (cld_params) {
mutex_lock(&cld_params->cld_lock);
config_log_put(cld_params);
}
- /* drop the ref from the find */
- config_log_put(cld);
- /* drop the start ref */
- config_log_put(cld);
+ /* don't set cld_stopping on nm lock as other targets may be active */
+ if (cld_nodemap)
+ config_log_put(cld_nodemap);
- CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
- rc);
- RETURN(rc);
+ /* drop the ref from the find */
+ config_log_put(cld);
+ /* drop the start ref */
+ config_log_put(cld);
+
+ CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+ rc);
+ RETURN(rc);
}
-#ifdef LPROCFS
+#ifdef CONFIG_PROC_FS
int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
{
struct obd_device *obd = data;
if (cld->cld_recover == NULL)
continue;
seq_printf(m, " - { client: %s, nidtbl_version: %u }\n",
- cld->cld_logname,
- cld->cld_recover->cld_cfg.cfg_last_idx);
+ cld->cld_logname,
+ cld->cld_recover->cld_cfg.cfg_last_idx);
}
spin_unlock(&config_list_lock);
LASSERT(atomic_read(&cld->cld_refcount) > 0);
- /* Do not run mgc_process_log on a disconnected export or an
+ /*
+ * Do not run mgc_process_log on a disconnected export or an
* export which is being disconnected. Take the client
- * semaphore to make the check non-racy. */
- down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+ * semaphore to make the check non-racy.
+ */
+ down_read_nested(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem,
+ OBD_CLI_SEM_MGC);
if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) {
CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
RETURN(0);
}
-static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb)
+static int mgc_fs_setup(const struct lu_env *env, struct obd_device *obd,
+ struct super_block *sb)
{
struct lustre_sb_info *lsi = s2lsi(sb);
struct client_obd *cli = &obd->u.cli;
struct lu_fid rfid, fid;
struct dt_object *root, *dto;
- struct lu_env *env;
int rc = 0;
ENTRY;
LASSERT(lsi);
LASSERT(lsi->lsi_dt_dev);
- OBD_ALLOC_PTR(env);
- if (env == NULL)
- RETURN(-ENOMEM);
-
/* The mgc fs exclusion mutex. Only one fs can be setup at a time. */
mutex_lock(&cli->cl_mgc_mutex);
/* Setup the configs dir */
- rc = lu_env_init(env, LCT_MG_THREAD);
- if (rc)
- GOTO(out_err, rc);
-
fid.f_seq = FID_SEQ_LOCAL_NAME;
fid.f_oid = 1;
fid.f_ver = 0;
rc = local_oid_storage_init(env, lsi->lsi_dt_dev, &fid,
&cli->cl_mgc_los);
if (rc)
- GOTO(out_env, rc);
+ RETURN(rc);
rc = dt_root_get(env, lsi->lsi_dt_dev, &rfid);
if (rc)
- GOTO(out_env, rc);
+ GOTO(out_los, rc);
root = dt_locate_at(env, lsi->lsi_dt_dev, &rfid,
&cli->cl_mgc_los->los_dev->dd_lu_dev, NULL);
cli->cl_mgc_los = NULL;
mutex_unlock(&cli->cl_mgc_mutex);
}
-out_env:
- lu_env_fini(env);
-out_err:
- OBD_FREE_PTR(env);
return rc;
}
-static int mgc_fs_cleanup(struct obd_device *obd)
+static int mgc_fs_cleanup(const struct lu_env *env, struct obd_device *obd)
{
- struct lu_env env;
struct client_obd *cli = &obd->u.cli;
- int rc;
-
ENTRY;
LASSERT(cli->cl_mgc_los != NULL);
- rc = lu_env_init(&env, LCT_MG_THREAD);
- if (rc)
- GOTO(unlock, rc);
+ mgc_local_llog_fini(env, obd);
- mgc_local_llog_fini(&env, obd);
-
- lu_object_put_nocache(&env, &cli->cl_mgc_configs_dir->do_lu);
+ lu_object_put_nocache(env, &cli->cl_mgc_configs_dir->do_lu);
cli->cl_mgc_configs_dir = NULL;
- local_oid_storage_fini(&env, cli->cl_mgc_los);
+ local_oid_storage_fini(env, cli->cl_mgc_los);
cli->cl_mgc_los = NULL;
- lu_env_fini(&env);
-unlock:
class_decref(obd, "mgc_fs", obd);
mutex_unlock(&cli->cl_mgc_mutex);
static atomic_t mgc_count = ATOMIC_INIT(0);
-static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+static int mgc_precleanup(struct obd_device *obd)
{
- int rc = 0;
- int temp;
+ int rc = 0;
+ int temp;
ENTRY;
- switch (stage) {
- case OBD_CLEANUP_EARLY:
- break;
- case OBD_CLEANUP_EXPORTS:
- if (atomic_dec_and_test(&mgc_count)) {
- LASSERT(rq_state & RQ_RUNNING);
- /* stop requeue thread */
- temp = RQ_STOP;
- } else {
- /* wakeup requeue thread to clean our cld */
- temp = RQ_NOW | RQ_PRECLEANUP;
- }
- spin_lock(&config_list_lock);
- rq_state |= temp;
- spin_unlock(&config_list_lock);
- wake_up(&rq_waitq);
- if (temp & RQ_STOP)
- wait_for_completion(&rq_exit);
- obd_cleanup_client_import(obd);
- rc = mgc_llog_fini(NULL, obd);
- if (rc != 0)
- CERROR("failed to cleanup llogging subsystems\n");
- break;
+ if (atomic_dec_and_test(&mgc_count)) {
+ LASSERT(rq_state & RQ_RUNNING);
+ /* stop requeue thread */
+ temp = RQ_STOP;
+ } else {
+ /* wakeup requeue thread to clean our cld */
+ temp = RQ_NOW | RQ_PRECLEANUP;
}
+
+ spin_lock(&config_list_lock);
+ rq_state |= temp;
+ spin_unlock(&config_list_lock);
+ wake_up(&rq_waitq);
+
+ if (temp & RQ_STOP)
+ wait_for_completion(&rq_exit);
+ obd_cleanup_client_import(obd);
+
+ rc = mgc_llog_fini(NULL, obd);
+ if (rc != 0)
+ CERROR("failed to cleanup llogging subsystems\n");
+
RETURN(rc);
}
GOTO(err_cleanup, rc);
}
-#ifdef LPROCFS
+#ifdef CONFIG_PROC_FS
obd->obd_vars = lprocfs_mgc_obd_vars;
lprocfs_obd_setup(obd);
#endif
}
/* Make sure not to re-enqueue when the mgc is stopping
(we get called from client_disconnect_export) */
- if (!lock->l_conn_export ||
- !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+ if (lock->l_conn_export == NULL ||
+ lock->l_conn_export->exp_obd->u.cli.cl_conn_count == 0) {
CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n",
cld->cld_logname);
config_log_put(cld);
}
/* Take a config lock so we can get cancel notifications */
-static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
- __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+static int mgc_enqueue(struct obd_export *exp, enum ldlm_type type,
+ union ldlm_policy_data *policy, enum ldlm_mode mode,
__u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb,
void *data, __u32 lvb_len, void *lvb_swabber,
struct lustre_handle *lockh)
RETURN(rc);
}
-static int mgc_cancel(struct obd_export *exp, ldlm_mode_t mode,
+static int mgc_cancel(struct obd_export *exp, enum ldlm_mode mode,
struct lustre_handle *lockh)
{
- ENTRY;
+ ENTRY;
- ldlm_lock_decref(lockh, mode);
+ ldlm_lock_decref(lockh, mode);
- RETURN(0);
+ RETURN(0);
}
static void mgc_notify_active(struct obd_device *unused)
if (vallen != sizeof(struct super_block))
RETURN(-EINVAL);
- rc = mgc_fs_setup(exp->exp_obd, sb);
+ rc = mgc_fs_setup(env, exp->exp_obd, sb);
RETURN(rc);
}
if (KEY_IS(KEY_CLEAR_FS)) {
if (vallen != 0)
RETURN(-EINVAL);
- rc = mgc_fs_cleanup(exp->exp_obd);
+ rc = mgc_fs_cleanup(env, exp->exp_obd);
RETURN(rc);
}
if (KEY_IS(KEY_SET_INFO)) {
}
static int mgc_get_info(const struct lu_env *env, struct obd_export *exp,
- __u32 keylen, void *key,
- __u32 *vallen, void *val,
- struct lov_stripe_md *unused)
+ __u32 keylen, void *key, __u32 *vallen, void *val)
{
int rc = -EINVAL;
/**
* This function is called if this client was notified for target restarting
- * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery logs.
+ * by the MGS. A CONFIG_READ RPC is going to be sent to fetch recovery or
+ * nodemap logs.
*/
-static int mgc_process_recover_log(struct obd_device *obd,
- struct config_llog_data *cld)
+static int mgc_process_recover_nodemap_log(struct obd_device *obd,
+ struct config_llog_data *cld)
{
- struct ptlrpc_request *req = NULL;
- struct config_llog_instance *cfg = &cld->cld_cfg;
- struct mgs_config_body *body;
- struct mgs_config_res *res;
- struct ptlrpc_bulk_desc *desc;
+ struct ptlrpc_request *req = NULL;
+ struct config_llog_instance *cfg = &cld->cld_cfg;
+ struct mgs_config_body *body;
+ struct mgs_config_res *res;
+ struct nodemap_config *new_config = NULL;
+ struct lu_nodemap *recent_nodemap = NULL;
+ struct ptlrpc_bulk_desc *desc;
struct page **pages;
- int nrpages;
- bool eof = true;
+ __u64 config_read_offset = 0;
+ int nrpages;
+ bool eof = true;
bool mne_swab = false;
- int i;
- int ealen;
- int rc;
- ENTRY;
+ int i;
+ int ealen;
+ int rc;
+
+ ENTRY;
/* allocate buffer for bulk transfer.
* if this is the first time for this mgs to read logs,
* small and CONFIG_READ_NRPAGES will be used.
*/
nrpages = CONFIG_READ_NRPAGES;
- if (cfg->cfg_last_idx == 0) /* the first time */
+ if (cfg->cfg_last_idx == 0 || cld_is_nodemap(cld))
nrpages = CONFIG_READ_NRPAGES_INIT;
OBD_ALLOC(pages, sizeof(*pages) * nrpages);
GOTO(out, rc = -ENOMEM);
for (i = 0; i < nrpages; i++) {
- pages[i] = alloc_page(GFP_IOFS);
+ pages[i] = alloc_page(GFP_KERNEL);
if (pages[i] == NULL)
GOTO(out, rc = -ENOMEM);
}
again:
- LASSERT(cld_is_recover(cld));
+#ifdef HAVE_SERVER_SUPPORT
+ if (cld_is_nodemap(cld) && config_read_offset == 0) {
+ new_config = nodemap_config_alloc();
+ if (IS_ERR(new_config)) {
+ rc = PTR_ERR(new_config);
+ new_config = NULL;
+ GOTO(out, rc);
+ }
+ }
+#endif
+ LASSERT(cld_is_recover(cld) || cld_is_nodemap(cld));
LASSERT(mutex_is_locked(&cld->cld_lock));
- req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
- &RQF_MGS_CONFIG_READ);
- if (req == NULL)
- GOTO(out, rc = -ENOMEM);
+ req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
+ &RQF_MGS_CONFIG_READ);
+ if (req == NULL)
+ GOTO(out, rc = -ENOMEM);
- rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
- if (rc)
- GOTO(out, rc);
+ rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
+ if (rc)
+ GOTO(out, rc);
- /* pack request */
- body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
- LASSERT(body != NULL);
- LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
+ /* pack request */
+ body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
+ LASSERT(body != NULL);
+ LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name))
>= sizeof(body->mcb_name))
GOTO(out, rc = -E2BIG);
- body->mcb_offset = cfg->cfg_last_idx + 1;
- body->mcb_type = cld->cld_type;
+ if (cld_is_nodemap(cld))
+ body->mcb_offset = config_read_offset;
+ else
+ body->mcb_offset = cfg->cfg_last_idx + 1;
+ body->mcb_type = cld->cld_type;
body->mcb_bits = PAGE_CACHE_SHIFT;
- body->mcb_units = nrpages;
+ body->mcb_units = nrpages;
/* allocate bulk transfer descriptor */
- desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK,
- MGS_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_imp(req, nrpages, 1,
+ PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV,
+ MGS_BULK_PORTAL,
+ &ptlrpc_bulk_kiov_pin_ops);
if (desc == NULL)
GOTO(out, rc = -ENOMEM);
for (i = 0; i < nrpages; i++)
- ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+ desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0,
+ PAGE_CACHE_SIZE);
- ptlrpc_request_set_replen(req);
- rc = ptlrpc_queue_wait(req);
- if (rc)
- GOTO(out, rc);
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ GOTO(out, rc);
- res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
- if (res->mcr_size < res->mcr_offset)
- GOTO(out, rc = -EINVAL);
+ res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
+ if (!res)
+ GOTO(out, rc = -EPROTO);
- /* always update the index even though it might have errors with
- * handling the recover logs */
- cfg->cfg_last_idx = res->mcr_offset;
- eof = res->mcr_offset == res->mcr_size;
+ if (cld_is_nodemap(cld)) {
+ config_read_offset = res->mcr_offset;
+ eof = config_read_offset == II_END_OFF;
+ } else {
+ if (res->mcr_size < res->mcr_offset)
+ GOTO(out, rc = -EINVAL);
+
+ /* always update the index even though it might have errors with
+ * handling the recover logs
+ */
+ cfg->cfg_last_idx = res->mcr_offset;
+ eof = res->mcr_offset == res->mcr_size;
- CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n",
- res->mcr_offset, eof == false);
+ CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n",
+ res->mcr_offset, eof == false);
+ }
- ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
- if (ealen < 0)
- GOTO(out, rc = ealen);
+ ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
+ if (ealen < 0)
+ GOTO(out, rc = ealen);
if (ealen > nrpages << PAGE_CACHE_SHIFT)
GOTO(out, rc = -EINVAL);
if (ealen == 0) { /* no logs transferred */
+#ifdef HAVE_SERVER_SUPPORT
+ /* config changed since first read RPC */
+ if (cld_is_nodemap(cld) && config_read_offset == 0) {
+ recent_nodemap = NULL;
+ nodemap_config_dealloc(new_config);
+ new_config = NULL;
+
+ CDEBUG(D_INFO, "nodemap config changed in transit, retrying\n");
+
+ /* setting eof to false, we request config again */
+ eof = false;
+ GOTO(out, rc = 0);
+ }
+#endif
if (!eof)
rc = -EINVAL;
GOTO(out, rc);
mne_swab = !mne_swab;
#endif
+ /* When a nodemap config is received, we build a new nodemap config,
+ * with new nodemap structs. We keep track of the most recently added
+ * nodemap since the config is read ordered by nodemap_id, and so it
+ * is likely that the next record will be related. Because access to
+ * the nodemaps is single threaded until the nodemap_config is active,
+ * we don't need to reference count with recent_nodemap, though
+ * recent_nodemap should be set to NULL when the nodemap_config
+ * is either destroyed or set active.
+ */
for (i = 0; i < nrpages && ealen > 0; i++) {
int rc2;
- void *ptr;
+ union lu_page *ptr;
ptr = kmap(pages[i]);
- rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
- min_t(int, ealen, PAGE_CACHE_SIZE),
- mne_swab);
+ if (cld_is_nodemap(cld))
+ rc2 = nodemap_process_idx_pages(new_config, ptr,
+ &recent_nodemap);
+ else
+ rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset,
+ ptr,
+ min_t(int, ealen,
+ PAGE_CACHE_SIZE),
+ mne_swab);
kunmap(pages[i]);
if (rc2 < 0) {
- CWARN("Process recover log %s error %d\n",
- cld->cld_logname, rc2);
+ CWARN("%s: error processing %s log %s: rc = %d\n",
+ obd->obd_name,
+ cld_is_nodemap(cld) ? "nodemap" : "recovery",
+ cld->cld_logname,
+ rc2);
break;
- }
+ }
ealen -= PAGE_CACHE_SIZE;
- }
+ }
out:
- if (req)
- ptlrpc_req_finished(req);
+ if (req) {
+ ptlrpc_req_finished(req);
+ req = NULL;
+ }
- if (rc == 0 && !eof)
- goto again;
+ if (rc == 0 && !eof)
+ goto again;
+
+#ifdef HAVE_SERVER_SUPPORT
+ if (new_config != NULL) {
+ /* recent_nodemap cannot be used after set_active/dealloc */
+ if (rc == 0)
+ nodemap_config_set_active_mgc(new_config);
+ else
+ nodemap_config_dealloc(new_config);
+ }
+#endif
if (pages) {
for (i = 0; i < nrpages; i++) {
* - if failed then move bakup to logname again
*/
- OBD_ALLOC(temp_log, strlen(logname) + 1);
+ OBD_ALLOC(temp_log, strlen(logname) + 2);
if (!temp_log)
RETURN(-ENOMEM);
sprintf(temp_log, "%sT", logname);
obd->obd_name, logname, rc);
}
llog_erase(env, lctxt, NULL, temp_log);
- OBD_FREE(temp_log, strlen(logname) + 1);
+ OBD_FREE(temp_log, strlen(logname) + 2);
return rc;
}
return rc;
}
-/** Get a config log from the MGS and process it.
- * This func is called for both clients and servers.
- * Copy the log locally before parsing it if appropriate (non-MGS server)
+static bool mgc_import_in_recovery(struct obd_import *imp)
+{
+ bool in_recovery = true;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_FULL ||
+ imp->imp_state == LUSTRE_IMP_CLOSED)
+ in_recovery = false;
+ spin_unlock(&imp->imp_lock);
+
+ return in_recovery;
+}
+
+/**
+ * Get a configuration log from the MGS and process it.
+ *
+ * This function is called for both clients and servers to process the
+ * configuration log from the MGS. The MGC enqueues a DLM lock on the
+ * log from the MGS, and if the lock gets revoked the MGC will be notified
+ * by the lock cancellation callback that the config log has changed,
+ * and will enqueue another MGS lock on it, and then continue processing
+ * the new additions to the end of the log.
+ *
+ * Since the MGC import is not replayable, if the import is being evicted
+ * (rcl == -ESHUTDOWN, \see ptlrpc_import_delay_req()), retry to process
+ * the log until recovery is finished or the import is closed.
+ *
+ * Make a local copy of the log before parsing it if appropriate (non-MGS
+ * server) so that the server can start even when the MGS is down.
+ *
+ * There shouldn't be multiple processes running process_log at once --
+ * sounds like badness. It actually might be fine, as long as they're not
+ * trying to update from the same log simultaneously, in which case we
+ * should use a per-log semaphore instead of cld_lock.
+ *
+ * \param[in] mgc MGC device by which to fetch the configuration log
+ * \param[in] cld log processing state (stored in lock callback data)
+ *
+ * \retval 0 on success
+ * \retval negative errno on failure
*/
int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
{
struct lustre_handle lockh = { 0 };
__u64 flags = LDLM_FL_NO_LRU;
int rc = 0, rcl;
+ bool retry = false;
ENTRY;
- LASSERT(cld);
+ LASSERT(cld != NULL);
/* I don't want multiple processes running process_log at once --
sounds like badness. It actually might be fine, as long as
we're not trying to update from the same log
simultaneously (in which case we should use a per-log sem.) */
+restart:
mutex_lock(&cld->cld_lock);
if (cld->cld_stopping) {
mutex_unlock(&cld->cld_lock);
OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
- CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
- cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
-
- /* Get the cfg lock on the llog */
- rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL,
- LCK_CR, &flags, NULL, NULL, NULL,
- cld, 0, NULL, &lockh);
- if (rcl == 0) {
- /* Get the cld, it will be released in mgc_blocking_ast. */
- config_log_get(cld);
- rc = ldlm_lock_set_data(&lockh, (void *)cld);
- LASSERT(rc == 0);
- } else {
- CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+ CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
+ cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+ /* Get the cfg lock on the llog */
+ rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, LDLM_PLAIN, NULL,
+ LCK_CR, &flags, NULL, NULL, NULL,
+ cld, 0, NULL, &lockh);
+ if (rcl == 0) {
+ /* Get the cld, it will be released in mgc_blocking_ast. */
+ config_log_get(cld);
+ rc = ldlm_lock_set_data(&lockh, (void *)cld);
+ LASSERT(rc == 0);
+ } else {
+ CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+
+ if (rcl == -ESHUTDOWN &&
+ atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) {
+ struct obd_import *imp;
+ struct l_wait_info lwi;
+ int secs = cfs_time_seconds(obd_timeout);
+
+ mutex_unlock(&cld->cld_lock);
+ imp = class_exp2cliimp(mgc->u.cli.cl_mgc_mgsexp);
+
+ /* Let's force the pinger, and wait the import to be
+ * connected, note: since mgc import is non-replayable,
+ * and even the import state is disconnected, it does
+ * not mean the "recovery" is stopped, so we will keep
+ * waiting until timeout or the import state is
+ * FULL or closed */
+ ptlrpc_pinger_force(imp);
+
+ lwi = LWI_TIMEOUT(secs, NULL, NULL);
+ l_wait_event(imp->imp_recovery_waitq,
+ !mgc_import_in_recovery(imp), &lwi);
+
+ if (imp->imp_state == LUSTRE_IMP_FULL) {
+ retry = true;
+ goto restart;
+ } else {
+ mutex_lock(&cld->cld_lock);
+ cld->cld_lostlock = 1;
+ }
+ } else {
+ /* mark cld_lostlock so that it will requeue
+ * after MGC becomes available. */
+ cld->cld_lostlock = 1;
+ }
+ /* Get extra reference, it will be put in requeue thread */
+ config_log_get(cld);
+ }
- /* mark cld_lostlock so that it will requeue
- * after MGC becomes available. */
- cld->cld_lostlock = 1;
- /* Get extra reference, it will be put in requeue thread */
- config_log_get(cld);
- }
+ if (cld_is_recover(cld) || cld_is_nodemap(cld)) {
+ if (!rcl)
+ rc = mgc_process_recover_nodemap_log(mgc, cld);
+ else if (cld_is_nodemap(cld))
+ rc = rcl;
- if (cld_is_recover(cld)) {
- rc = 0; /* this is not a fatal error for recover log */
- if (rcl == 0)
- rc = mgc_process_recover_log(mgc, cld);
- } else {
- rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
- }
+ if (cld_is_recover(cld) && rc) {
+ if (!rcl) {
+ CERROR("%s: recover log %s failed, not fatal: rc = %d\n",
+ mgc->obd_name, cld->cld_logname, rc);
+ cld->cld_lostlock = 1;
+ }
+ rc = 0; /* this is not a fatal error for recover log */
+ }
+ } else {
+ rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
+ }
- CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
- mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
+ CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
+ mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
mutex_unlock(&cld->cld_lock);
LUSTRE_MGC_NAME, NULL);
}
-static void /*__exit*/ mgc_exit(void)
+static void __exit mgc_exit(void)
{
class_unregister_type(LUSTRE_MGC_NAME);
}
-MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
MODULE_LICENSE("GPL");
module_init(mgc_init);