X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flod%2Flod_dev.c;h=767b8502b48ef87ec7b7cf6945d5c39c3392d6fc;hb=c9f3efded4ef5aea926e4c67cfb0859d8592e58a;hp=fd51f339526c54c721288d9ccae7935c44f5d6c7;hpb=b65efe20dc07f925e06d58658cd24247ae36037c;p=fs%2Flustre-release.git diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index fd51f33..767b850 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -6,13 +6,13 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 only, * as published by the Free Software Foundation. - + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License version 2 for more details. A copy is * included in the COPYING file that accompanied this code. - + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -23,8 +23,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved * Use is subject to license terms. * - * Copyright (c) 2012, 2013, Intel Corporation. - * + * Copyright (c) 2012, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -90,14 +89,19 @@ #define DEBUG_SUBSYSTEM S_MDS +#include #include #include #include -#include +#include #include +#include #include "lod_internal.h" +static const char lod_update_log_name[] = "update_log"; +static const char lod_update_log_dir_name[] = "update_log_dir"; + /* * Lookup target by FID. 
* @@ -119,15 +123,25 @@ int lod_fld_lookup(const struct lu_env *env, struct lod_device *lod, { struct lu_seq_range range = { 0 }; struct lu_server_fld *server_fld; - int rc = 0; + int rc; ENTRY; - LASSERTF(fid_is_sane(fid), "Invalid FID "DFID"\n", PFID(fid)); + if (!fid_is_sane(fid)) { + CERROR("%s: invalid FID "DFID"\n", lod2obd(lod)->obd_name, + PFID(fid)); + RETURN(-EIO); + } if (fid_is_idif(fid)) { *tgt = fid_idif_ost_idx(fid); *type = LU_SEQ_RANGE_OST; - RETURN(rc); + RETURN(0); + } + + if (fid_is_update_log(fid) || fid_is_update_log_dir(fid)) { + *tgt = fid_oid(fid); + *type = LU_SEQ_RANGE_MDT; + RETURN(0); } if (!lod->lod_initialized || (!fid_seq_in_fldb(fid_seq(fid)))) { @@ -135,30 +149,32 @@ int lod_fld_lookup(const struct lu_env *env, struct lod_device *lod, *tgt = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id; *type = LU_SEQ_RANGE_MDT; - RETURN(rc); + RETURN(0); } server_fld = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_server_fld; + if (server_fld == NULL) + RETURN(-EIO); + fld_range_set_type(&range, *type); rc = fld_server_lookup(env, server_fld, fid_seq(fid), &range); - if (rc) + if (rc != 0) RETURN(rc); *tgt = range.lsr_index; *type = range.lsr_flags; - CDEBUG(D_INFO, "LOD: got tgt %x for sequence: " - LPX64"\n", *tgt, fid_seq(fid)); + CDEBUG(D_INFO, "%s: got tgt %x for sequence: %#llx\n", + lod2obd(lod)->obd_name, *tgt, fid_seq(fid)); - RETURN(rc); + RETURN(0); } -extern struct lu_object_operations lod_lu_obj_ops; -extern struct dt_object_operations lod_obj_ops; - /* Slab for OSD object allocation */ struct kmem_cache *lod_object_kmem; +/* Slab for dt_txn_callback */ +struct kmem_cache *lod_txn_callback_kmem; static struct lu_kmem_descr lod_caches[] = { { .ckd_cache = &lod_object_kmem, @@ -166,6 +182,11 @@ static struct lu_kmem_descr lod_caches[] = { .ckd_size = sizeof(struct lod_object) }, { + .ckd_cache = &lod_txn_callback_kmem, + .ckd_name = "lod_txn_callback", + .ckd_size = sizeof(struct dt_txn_callback) + }, + { .ckd_cache = NULL } }; @@ 
-180,9 +201,9 @@ static struct lu_device *lod_device_fini(const struct lu_env *env, * * see include/lu_object.h for the details. */ -struct lu_object *lod_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev) +static struct lu_object *lod_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) { struct lod_object *lod_obj; struct lu_object *lu_obj; @@ -192,6 +213,7 @@ struct lu_object *lod_object_alloc(const struct lu_env *env, if (lod_obj == NULL) RETURN(ERR_PTR(-ENOMEM)); + mutex_init(&lod_obj->ldo_layout_mutex); lu_obj = lod2lu_obj(lod_obj); dt_object_init(&lod_obj->ldo_obj, NULL, dev); lod_obj->ldo_obj.do_ops = &lod_obj_ops; @@ -201,7 +223,7 @@ struct lu_object *lod_object_alloc(const struct lu_env *env, } /** - * Cleanup table of target's descriptors. + * Process the config log for all sub device. * * The function goes through all the targets in the given table * and apply given configuration command on to the targets. @@ -215,7 +237,7 @@ struct lu_object *lod_object_alloc(const struct lu_env *env, * \retval 0 on success * \retval negative negated errno on error **/ -static int lod_cleanup_desc_tgts(const struct lu_env *env, +static int lod_sub_process_config(const struct lu_env *env, struct lod_device *lod, struct lod_tgt_descs *ltd, struct lustre_cfg *lcfg) @@ -248,22 +270,276 @@ static int lod_cleanup_desc_tgts(const struct lu_env *env, return rc; } +struct lod_recovery_data { + struct lod_device *lrd_lod; + struct lod_tgt_desc *lrd_ltd; + struct ptlrpc_thread *lrd_thread; + __u32 lrd_idx; +}; + + +/** + * process update recovery record + * + * Add the update recovery recode to the update recovery list in + * lod_recovery_data. Then the recovery thread (target_recovery_thread) + * will redo these updates. 
+ *
+ * \param[in] env	execution environment
+ * \param[in] llh	log handle of update record
+ * \param[in] rec	update record to be replayed
+ * \param[in] data	update recovery data which holds the necessary
+ *			arguments for recovery (see struct lod_recovery_data)
+ *
+ * \retval		0 if the record is processed successfully.
+ * \retval		-EINVAL if the record length does not match
+ *			llog_update_record_size() for this record.
+ * \retval		-ESHUTDOWN if the target is stopping or its recovery
+ *			has been aborted.
+ * \retval		negative errno if the record processing fails.
+ */
+static int lod_process_recovery_updates(const struct lu_env *env,
+					struct llog_handle *llh,
+					struct llog_rec_hdr *rec,
+					void *data)
+{
+	struct lod_recovery_data *lrd = data;
+	struct llog_cookie *cookie = &lod_env_info(env)->lti_cookie;
+	struct lu_target *lut;
+	__u32 index = 0;
+	int rc;
+	ENTRY;
+
+	/* lrd_ltd == NULL means the record comes from the master (local)
+	 * device, so the index is parsed from the LOD device name. */
+	if (lrd->lrd_ltd == NULL) {
+		rc = lodname2mdt_index(lod2obd(lrd->lrd_lod)->obd_name, &index);
+		if (rc != 0)
+			RETURN(rc);
+	} else {
+		index = lrd->lrd_ltd->ltd_index;
+	}
+
+	if (rec->lrh_len !=
+		llog_update_record_size((struct llog_update_record *)rec)) {
+		/* log the same error code that is returned to the caller */
+		rc = -EINVAL;
+		CERROR("%s broken update record! index %u "DFID".%u :"
+		       " rc = %d\n", lod2obd(lrd->lrd_lod)->obd_name, index,
+		       PFID(&llh->lgh_id.lgl_oi.oi_fid), rec->lrh_index, rc);
+		RETURN(rc);
+	}
+
+	/* the cookie identifies this record (log id + record index) for the
+	 * replay list entry */
+	cookie->lgc_lgl = llh->lgh_id;
+	cookie->lgc_index = rec->lrh_index;
+	cookie->lgc_subsys = LLOG_UPDATELOG_ORIG_CTXT;
+
+	CDEBUG(D_HA, "%s: process recovery updates "DFID".%u\n",
+	       lod2obd(lrd->lrd_lod)->obd_name,
+	       PFID(&llh->lgh_id.lgl_oi.oi_fid), rec->lrh_index);
+	lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
+
+	if (lut->lut_obd->obd_stopping ||
+	    lut->lut_obd->obd_abort_recovery)
+		RETURN(-ESHUTDOWN);
+
+	rc = insert_update_records_to_replay_list(lut->lut_tdtd,
+					(struct llog_update_record *)rec,
+					cookie, index);
+	RETURN(rc);
+}
+
+/**
+ * recovery thread for update log
+ *
+ * Start recovery thread and prepare the sub llog, then it will retrieve
+ * the update records from the corresponding MDT and do recovery.
+ *
+ * On success the thread marks its sub device as having delivered the update
+ * log; once the master device and every MDT target have done so it sets
+ * tdtd_replay_ready and wakes obd_next_transno_waitq. If the log cannot be
+ * retrieved and the failure is not retried, obd_abort_recovery is set on the
+ * top device (unless it is already aborting or stopping).
+ *
+ * The thread owns \a arg and frees it before exiting.
+ *
+ * \param[in] arg	pointer to the recovery data
+ *
+ * \retval		0 if recovery succeeds
+ * \retval		negative errno if recovery failed.
+ */
+static int lod_sub_recovery_thread(void *arg)
+{
+	struct lod_recovery_data *lrd = arg;
+	struct lod_device *lod = lrd->lrd_lod;
+	struct dt_device *dt;
+	struct ptlrpc_thread *thread = lrd->lrd_thread;
+	struct llog_ctxt *ctxt = NULL;
+	struct lu_env env;
+	struct lu_target *lut;
+	struct lod_tgt_descs *ltd = &lod->lod_mdt_descs;
+	struct lod_tgt_desc *tgt = NULL;
+	time64_t start;
+	int retries = 0;
+	int i;
+	int rc;
+	ENTRY;
+
+	thread->t_flags = SVC_RUNNING;
+	wake_up(&thread->t_ctl_waitq);
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MD_THREAD);
+	if (rc != 0) {
+		OBD_FREE_PTR(lrd);
+		CERROR("%s: can't initialize env: rc = %d\n",
+		       lod2obd(lod)->obd_name, rc);
+		/* The thread exits here, so it must not stay SVC_RUNNING:
+		 * the stop paths wait for SVC_STOPPED on any thread that
+		 * still looks running and would block forever otherwise. */
+		thread->t_flags = SVC_STOPPED;
+		wake_up(&thread->t_ctl_waitq);
+		RETURN(rc);
+	}
+
+	lut = lod2lu_dev(lod)->ld_site->ls_tgt;
+	atomic_inc(&lut->lut_tdtd->tdtd_recovery_threads_count);
+	/* lrd_ltd == NULL selects the master (local) device */
+	if (lrd->lrd_ltd == NULL)
+		dt = lod->lod_child;
+	else
+		dt = lrd->lrd_ltd->ltd_tgt;
+
+	start = ktime_get_real_seconds();
+
+again:
+	rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx);
+	if (!rc && !lod->lod_child->dd_rdonly) {
+		/* Process the recovery record */
+		ctxt = llog_get_context(dt->dd_lu_dev.ld_obd,
+					LLOG_UPDATELOG_ORIG_CTXT);
+		LASSERT(ctxt != NULL);
+		LASSERT(ctxt->loc_handle != NULL);
+
+		rc = llog_cat_process(&env, ctxt->loc_handle,
+				      lod_process_recovery_updates, lrd, 0, 0);
+	}
+
+	if (rc < 0) {
+		struct lu_device *top_device;
+
+		top_device = lod->lod_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
+		/* Because the remote target might failover at the same time,
+		 * let's retry here */
+		if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
+		     dt != lod->lod_child &&
+		    !top_device->ld_obd->obd_abort_recovery &&
+		    !top_device->ld_obd->obd_stopping) {
+			if (ctxt != NULL) {
+				if (ctxt->loc_handle != NULL)
+					llog_cat_close(&env,
+						       ctxt->loc_handle);
+				llog_ctxt_put(ctxt);
+				/* Drop the stale pointer: if the next
+				 * lod_sub_prep_llog() fails, ctxt is not
+				 * re-acquired and must not be closed or
+				 * put a second time. */
+				ctxt = NULL;
+			}
+			retries++;
+			CDEBUG(D_HA, "%s get update log failed %d, retry\n",
+			       dt->dd_lu_dev.ld_obd->obd_name, rc);
+			goto again;
+		}
+
+		CERROR("%s get update log failed: rc = %d\n",
+		       dt->dd_lu_dev.ld_obd->obd_name, rc);
+		/* NOTE(review): ctxt may be NULL here (prep failed before a
+		 * context was taken); this relies on llog_ctxt_put()
+		 * accepting NULL, as the pre-existing code already did. */
+		llog_ctxt_put(ctxt);
+
+		spin_lock(&top_device->ld_obd->obd_dev_lock);
+		if (!top_device->ld_obd->obd_abort_recovery &&
+		    !top_device->ld_obd->obd_stopping)
+			top_device->ld_obd->obd_abort_recovery = 1;
+		spin_unlock(&top_device->ld_obd->obd_dev_lock);
+
+		GOTO(out, rc);
+	}
+	llog_ctxt_put(ctxt);
+
+	CDEBUG(D_HA, "%s retrieved update log, duration %lld, retries %d\n",
+	       dt->dd_lu_dev.ld_obd->obd_name, ktime_get_real_seconds() - start,
+	       retries);
+
+	spin_lock(&lod->lod_lock);
+	if (lrd->lrd_ltd == NULL)
+		lod->lod_child_got_update_log = 1;
+	else
+		lrd->lrd_ltd->ltd_got_update_log = 1;
+
+	/* replay is ready only when the master and all MDT targets have
+	 * delivered their update logs */
+	if (!lod->lod_child_got_update_log) {
+		spin_unlock(&lod->lod_lock);
+		GOTO(out, rc = 0);
+	}
+
+	cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) {
+		tgt = LTD_TGT(ltd, i);
+		if (!tgt->ltd_got_update_log) {
+			spin_unlock(&lod->lod_lock);
+			GOTO(out, rc = 0);
+		}
+	}
+	lut->lut_tdtd->tdtd_replay_ready = 1;
+	spin_unlock(&lod->lod_lock);
+
+	CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
+	       lut->lut_obd->obd_name);
+	wake_up(&lut->lut_obd->obd_next_transno_waitq);
+	EXIT;
+
+out:
+	OBD_FREE_PTR(lrd);
+	thread->t_flags = SVC_STOPPED;
+	atomic_dec(&lut->lut_tdtd->tdtd_recovery_threads_count);
+	wake_up(&lut->lut_tdtd->tdtd_recovery_threads_waitq);
+	wake_up(&thread->t_ctl_waitq);
+	lu_env_fini(&env);
+	return rc;
+}
+
+/**
+ * finish sub llog context
+ *
+ * Stop update recovery thread for the sub device, then cleanup the
+ * corresponding llog ctxt.
+ * + * \param[in] env execution environment + * \param[in] lod lod device to do update recovery + * \param[in] thread recovery thread on this sub device + */ +void lod_sub_fini_llog(const struct lu_env *env, + struct dt_device *dt, struct ptlrpc_thread *thread) +{ + struct obd_device *obd; + struct llog_ctxt *ctxt; + ENTRY; + + obd = dt->dd_lu_dev.ld_obd; + CDEBUG(D_INFO, "%s: finish sub llog\n", obd->obd_name); + /* Stop recovery thread first */ + if (thread != NULL && thread->t_flags & SVC_RUNNING) { + thread->t_flags = SVC_STOPPING; + wake_up(&thread->t_ctl_waitq); + wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED); + } + + ctxt = llog_get_context(obd, LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) + RETURN_EXIT; + + if (ctxt->loc_handle != NULL) + llog_cat_close(env, ctxt->loc_handle); + + llog_cleanup(env, ctxt); + + RETURN_EXIT; +} + /** * Extract MDT target index from a device name. * * a helper function to extract index from the given device name * like "fsname-MDTxxxx-mdtlov" * - * \param[in] lodname device name - * \param[out] index extracted index + * \param[in] lodname device name + * \param[out] mdt_index extracted index * * \retval 0 on success * \retval -EINVAL if the name is invalid */ -static int lodname2mdt_index(char *lodname, long *index) +int lodname2mdt_index(char *lodname, __u32 *mdt_index) { + unsigned long index; char *ptr, *tmp; + /* 1.8 configs don't have "-MDT0000" at the end */ + ptr = strstr(lodname, "-MDT"); + if (ptr == NULL) { + *mdt_index = 0; + return 0; + } + ptr = strrchr(lodname, '-'); if (ptr == NULL) { CERROR("invalid MDT index in '%s'\n", lodname); @@ -285,15 +561,315 @@ static int lodname2mdt_index(char *lodname, long *index) return -EINVAL; } - *index = simple_strtol(ptr - 4, &tmp, 16); - if (*tmp != '-' || *index > INT_MAX || *index < 0) { + index = simple_strtol(ptr - 4, &tmp, 16); + if (*tmp != '-' || index > INT_MAX) { CERROR("invalid MDT index in '%s'\n", lodname); return -EINVAL; } + *mdt_index = 
index; return 0; } /** + * Init sub llog context + * + * Setup update llog ctxt for update recovery threads, then start the + * recovery thread (lod_sub_recovery_thread) to read update llog from + * the correspondent MDT to do update recovery. + * + * \param[in] env execution environment + * \param[in] lod lod device to do update recovery + * \param[in] dt sub dt device for which the recovery thread is + * + * \retval 0 if initialization succeeds. + * \retval negative errno if initialization fails. + */ +int lod_sub_init_llog(const struct lu_env *env, struct lod_device *lod, + struct dt_device *dt) +{ + struct obd_device *obd; + struct lod_recovery_data *lrd = NULL; + struct ptlrpc_thread *thread; + struct task_struct *task; + struct l_wait_info lwi = { 0 }; + struct lod_tgt_desc *sub_ltd = NULL; + __u32 index; + __u32 master_index; + int rc; + ENTRY; + + rc = lodname2mdt_index(lod2obd(lod)->obd_name, &master_index); + if (rc != 0) + RETURN(rc); + + OBD_ALLOC_PTR(lrd); + if (lrd == NULL) + RETURN(-ENOMEM); + + if (lod->lod_child == dt) { + thread = &lod->lod_child_recovery_thread; + index = master_index; + } else { + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lod_tgt_desc *tgt = NULL; + unsigned int i; + + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + tgt = LTD_TGT(ltd, i); + if (tgt->ltd_tgt == dt) { + index = tgt->ltd_index; + sub_ltd = tgt; + break; + } + } + LASSERT(sub_ltd != NULL); + OBD_ALLOC_PTR(sub_ltd->ltd_recovery_thread); + if (sub_ltd->ltd_recovery_thread == NULL) + GOTO(free_lrd, rc = -ENOMEM); + + thread = sub_ltd->ltd_recovery_thread; + } + + CDEBUG(D_INFO, "%s init sub log %s\n", lod2obd(lod)->obd_name, + dt->dd_lu_dev.ld_obd->obd_name); + lrd->lrd_lod = lod; + lrd->lrd_ltd = sub_ltd; + lrd->lrd_thread = thread; + lrd->lrd_idx = index; + init_waitqueue_head(&thread->t_ctl_waitq); + + obd = dt->dd_lu_dev.ld_obd; + obd->obd_lvfs_ctxt.dt = dt; + rc = llog_setup(env, obd, &obd->obd_olg, LLOG_UPDATELOG_ORIG_CTXT, + NULL, 
&llog_common_cat_ops); + if (rc < 0) { + CERROR("%s: cannot setup updatelog llog: rc = %d\n", + obd->obd_name, rc); + GOTO(free_thread, rc); + } + + /* Start the recovery thread */ + task = kthread_run(lod_sub_recovery_thread, lrd, "lod%04x_rec%04x", + master_index, index); + if (IS_ERR(task)) { + rc = PTR_ERR(task); + CERROR("%s: cannot start recovery thread: rc = %d\n", + obd->obd_name, rc); + GOTO(out_llog, rc); + } + + l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING || + thread->t_flags & SVC_STOPPED, &lwi); + + RETURN(0); +out_llog: + lod_sub_fini_llog(env, dt, thread); +free_thread: + if (lod->lod_child != dt) { + OBD_FREE_PTR(sub_ltd->ltd_recovery_thread); + sub_ltd->ltd_recovery_thread = NULL; + } +free_lrd: + OBD_FREE_PTR(lrd); + RETURN(rc); +} + +/** + * Stop sub recovery thread + * + * Stop sub recovery thread on all subs. + * + * \param[in] env execution environment + * \param[in] lod lod device to do update recovery + */ +static void lod_sub_stop_recovery_threads(const struct lu_env *env, + struct lod_device *lod) +{ + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + struct ptlrpc_thread *thread; + unsigned int i; + + /* Stop the update log commit cancel threads and finish master + * llog ctxt */ + thread = &lod->lod_child_recovery_thread; + /* Stop recovery thread first */ + if (thread != NULL && thread->t_flags & SVC_RUNNING) { + thread->t_flags = SVC_STOPPING; + wake_up(&thread->t_ctl_waitq); + wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED); + } + + lod_getref(ltd); + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + struct lod_tgt_desc *tgt; + + tgt = LTD_TGT(ltd, i); + thread = tgt->ltd_recovery_thread; + if (thread != NULL && thread->t_flags & SVC_RUNNING) { + thread->t_flags = SVC_STOPPING; + wake_up(&thread->t_ctl_waitq); + wait_event(thread->t_ctl_waitq, + thread->t_flags & SVC_STOPPED); + OBD_FREE_PTR(tgt->ltd_recovery_thread); + tgt->ltd_recovery_thread = NULL; + } + } + + lod_putref(lod, ltd); +} + +/** + * 
finish all sub llog + * + * cleanup all of sub llog ctxt on the LOD. + * + * \param[in] env execution environment + * \param[in] lod lod device to do update recovery + */ +static void lod_sub_fini_all_llogs(const struct lu_env *env, + struct lod_device *lod) +{ + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + unsigned int i; + + /* Stop the update log commit cancel threads and finish master + * llog ctxt */ + lod_sub_fini_llog(env, lod->lod_child, + &lod->lod_child_recovery_thread); + lod_getref(ltd); + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + struct lod_tgt_desc *tgt; + + tgt = LTD_TGT(ltd, i); + lod_sub_fini_llog(env, tgt->ltd_tgt, + tgt->ltd_recovery_thread); + } + + lod_putref(lod, ltd); +} + +static char *lod_show_update_logs_retrievers(void *data, int *size, int *count) +{ + struct lod_device *lod = (struct lod_device *)data; + struct lu_target *lut = lod2lu_dev(lod)->ld_site->ls_tgt; + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lod_tgt_desc *tgt = NULL; + char *buf; + int len = 0; + int rc; + int i; + + *count = atomic_read(&lut->lut_tdtd->tdtd_recovery_threads_count); + if (*count == 0) { + *size = 0; + return NULL; + } + + *size = 5 * *count + 1; + OBD_ALLOC(buf, *size); + if (buf == NULL) + return NULL; + + *count = 0; + memset(buf, 0, *size); + + if (!lod->lod_child_got_update_log) { + rc = lodname2mdt_index(lod2obd(lod)->obd_name, &i); + LASSERTF(rc == 0, "Fail to parse target index: rc = %d\n", rc); + + rc = snprintf(buf + len, *size - len, " %04x", i); + LASSERT(rc > 0); + + len += rc; + (*count)++; + } + + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + tgt = LTD_TGT(ltd, i); + if (!tgt->ltd_got_update_log) { + rc = snprintf(buf + len, *size - len, " %04x", i); + if (unlikely(rc <= 0)) + break; + + len += rc; + (*count)++; + } + } + + return buf; +} + +/** + * Prepare distribute txn + * + * Prepare distribute txn structure for LOD + * + * \param[in] env execution environment + * \param[in] lod_device LOD device + * + * \retval 0 if 
preparation succeeds. + * \retval negative errno if preparation fails. + */ +static int lod_prepare_distribute_txn(const struct lu_env *env, + struct lod_device *lod) +{ + struct target_distribute_txn_data *tdtd; + struct lu_target *lut; + int rc; + ENTRY; + + /* Init update recovery data */ + OBD_ALLOC_PTR(tdtd); + if (tdtd == NULL) + RETURN(-ENOMEM); + + lut = lod2lu_dev(lod)->ld_site->ls_tgt; + tdtd->tdtd_dt = &lod->lod_dt_dev; + rc = distribute_txn_init(env, lut, tdtd, + lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id); + + if (rc < 0) { + CERROR("%s: cannot init distribute txn: rc = %d\n", + lod2obd(lod)->obd_name, rc); + OBD_FREE_PTR(tdtd); + RETURN(rc); + } + + tdtd->tdtd_show_update_logs_retrievers = + lod_show_update_logs_retrievers; + tdtd->tdtd_show_retrievers_cbdata = lod; + + lut->lut_tdtd = tdtd; + + RETURN(0); +} + +/** + * Finish distribute txn + * + * Release the resource holding by distribute txn, i.e. stop distribute + * txn thread. + * + * \param[in] env execution environment + * \param[in] lod lod device + */ +static void lod_fini_distribute_txn(const struct lu_env *env, + struct lod_device *lod) +{ + struct lu_target *lut; + + lut = lod2lu_dev(lod)->ld_site->ls_tgt; + target_recovery_fini(lut->lut_obd); + if (lut->lut_tdtd == NULL) + return; + + distribute_txn_fini(env, lut->lut_tdtd); + + OBD_FREE_PTR(lut->lut_tdtd); + lut->lut_tdtd = NULL; +} + +/** * Implementation of lu_device_operations::ldo_process_config() for LOD * * The method is called by the configuration subsystem during setup, @@ -356,20 +932,13 @@ static int lod_process_config(const struct lu_env *env, GOTO(out, rc = -EINVAL); if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) { - char *mdt; - mdt = strstr(lustre_cfg_string(lcfg, 0), "-MDT"); - /* 1.8 configs don't have "-MDT0000" at the end */ - if (mdt == NULL) { - mdt_index = 0; - } else { - long long_index; - rc = lodname2mdt_index( - lustre_cfg_string(lcfg, 0), - &long_index); - if (rc != 0) - GOTO(out, rc); - mdt_index = 
long_index; - } + __u32 mdt_index; + + rc = lodname2mdt_index(lustre_cfg_string(lcfg, 0), + &mdt_index); + if (rc != 0) + GOTO(out, rc); + rc = lod_add_device(env, lod, arg1, index, gen, mdt_index, LUSTRE_OSC_NAME, 1); } else if (lcfg->lcfg_command == LCFG_ADD_MDC) { @@ -391,25 +960,116 @@ static int lod_process_config(const struct lu_env *env, } case LCFG_PARAM: { - struct obd_device *obd = lod2obd(lod); + struct obd_device *obd; + ssize_t count; + char *param; + + /* Check if it is activate/deactivate mdc + * lustre-MDTXXXX-osp-MDTXXXX.active=1 */ + param = lustre_cfg_buf(lcfg, 1); + if (strstr(param, "osp") != NULL && + strstr(param, ".active=") != NULL) { + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lod_tgt_desc *sub_tgt = NULL; + char *ptr; + char *tmp; + int i; + + ptr = strstr(param, "."); + *ptr = '\0'; + obd = class_name2obd(param); + if (obd == NULL) { + CERROR("%s: can not find %s: rc = %d\n", + lod2obd(lod)->obd_name, param, -EINVAL); + *ptr = '.'; + GOTO(out, rc); + } - rc = class_process_proc_param(PARAM_LOV, obd->obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + struct lod_tgt_desc *tgt; + + tgt = LTD_TGT(ltd, i); + if (tgt->ltd_tgt->dd_lu_dev.ld_obd == obd) { + sub_tgt = tgt; + break; + } + } + + if (sub_tgt == NULL) { + CERROR("%s: can not find %s: rc = %d\n", + lod2obd(lod)->obd_name, param, -EINVAL); + *ptr = '.'; + GOTO(out, rc); + } + + *ptr = '.'; + tmp = strstr(param, "="); + tmp++; + if (*tmp == '1') { + struct llog_ctxt *ctxt; + + obd = sub_tgt->ltd_tgt->dd_lu_dev.ld_obd; + ctxt = llog_get_context(obd, + LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) { + rc = llog_setup(env, obd, &obd->obd_olg, + LLOG_UPDATELOG_ORIG_CTXT, + NULL, &llog_common_cat_ops); + if (rc < 0) + GOTO(out, rc); + } else { + llog_ctxt_put(ctxt); + } + rc = lod_sub_prep_llog(env, lod, + sub_tgt->ltd_tgt, + sub_tgt->ltd_index); + if (rc == 0) + sub_tgt->ltd_active = 1; + } else { + lod_sub_fini_llog(env, 
sub_tgt->ltd_tgt, + NULL); + sub_tgt->ltd_active = 0; + } + GOTO(out, rc); + } + + + if (strstr(param, PARAM_LOD) != NULL) + count = class_modify_config(lcfg, PARAM_LOD, + &lod->lod_dt_dev.dd_kobj); + else + count = class_modify_config(lcfg, PARAM_LOV, + &lod->lod_dt_dev.dd_kobj); + rc = count > 0 ? 0 : count; GOTO(out, rc); } - case LCFG_CLEANUP: case LCFG_PRE_CLEANUP: { - lu_dev_del_linkage(dev->ld_site, dev); - lod_cleanup_desc_tgts(env, lod, &lod->lod_mdt_descs, lcfg); - lod_cleanup_desc_tgts(env, lod, &lod->lod_ost_descs, lcfg); - if (lcfg->lcfg_command == LCFG_PRE_CLEANUP) - break; + lod_sub_process_config(env, lod, &lod->lod_mdt_descs, lcfg); + lod_sub_process_config(env, lod, &lod->lod_ost_descs, lcfg); + next = &lod->lod_child->dd_lu_dev; + rc = next->ld_ops->ldo_process_config(env, next, lcfg); + if (rc != 0) + CDEBUG(D_HA, "%s: can't process %u: %d\n", + lod2obd(lod)->obd_name, lcfg->lcfg_command, rc); + + lod_sub_stop_recovery_threads(env, lod); + lod_fini_distribute_txn(env, lod); + lod_sub_fini_all_llogs(env, lod); + break; + } + case LCFG_CLEANUP: { + if (lod->lod_md_root != NULL) { + dt_object_put(env, &lod->lod_md_root->ldo_obj); + lod->lod_md_root = NULL; + } + /* * do cleanup on underlying storage only when * all OSPs are cleaned up, as they use that OSD as well */ + lu_dev_del_linkage(dev->ld_site, dev); + lod_sub_process_config(env, lod, &lod->lod_mdt_descs, lcfg); + lod_sub_process_config(env, lod, &lod->lod_ost_descs, lcfg); next = &lod->lod_child->dd_lu_dev; rc = next->ld_ops->ldo_process_config(env, next, lcfg); if (rc) @@ -472,6 +1132,46 @@ static int lod_recovery_complete(const struct lu_env *env, } /** + * Init update logs on all sub device + * + * LOD initialize update logs on all of sub devices. Because the initialization + * process might need FLD lookup, see llog_osd_open()->dt_locate()->...-> + * lod_object_init(), this API has to be called after LOD is initialized. 
+ * \param[in] env execution environment + * \param[in] lod lod device + * + * \retval 0 if update log is initialized successfully. + * \retval negative errno if initialization fails. + */ +static int lod_sub_init_llogs(const struct lu_env *env, struct lod_device *lod) +{ + struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; + int rc; + unsigned int i; + ENTRY; + + /* llog must be setup after LOD is initialized, because llog + * initialization include FLD lookup */ + LASSERT(lod->lod_initialized); + + /* Init the llog in its own stack */ + rc = lod_sub_init_llog(env, lod, lod->lod_child); + if (rc < 0) + RETURN(rc); + + cfs_foreach_bit(ltd->ltd_tgt_bitmap, i) { + struct lod_tgt_desc *tgt; + + tgt = LTD_TGT(ltd, i); + rc = lod_sub_init_llog(env, lod, tgt->ltd_tgt); + if (rc != 0) + break; + } + + RETURN(rc); +} + +/** * Implementation of lu_device_operations::ldo_prepare() for LOD * * see include/lu_object.h for the details. @@ -479,9 +1179,13 @@ static int lod_recovery_complete(const struct lu_env *env, static int lod_prepare(const struct lu_env *env, struct lu_device *pdev, struct lu_device *cdev) { - struct lod_device *lod = lu2lod_dev(cdev); - struct lu_device *next = &lod->lod_child->dd_lu_dev; - int rc; + struct lod_device *lod = lu2lod_dev(cdev); + struct lu_device *next = &lod->lod_child->dd_lu_dev; + struct lu_fid *fid = &lod_env_info(env)->lti_fid; + int rc; + struct dt_object *root; + struct dt_object *dto; + __u32 index; ENTRY; rc = next->ld_ops->ldo_prepare(env, pdev, next); @@ -493,6 +1197,49 @@ static int lod_prepare(const struct lu_env *env, struct lu_device *pdev, lod->lod_initialized = 1; + rc = dt_root_get(env, lod->lod_child, fid); + if (rc < 0) + RETURN(rc); + + root = dt_locate(env, lod->lod_child, fid); + if (IS_ERR(root)) + RETURN(PTR_ERR(root)); + + /* Create update log object */ + index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id; + lu_update_log_fid(fid, index); + + dto = local_file_find_or_create_with_fid(env, lod->lod_child, + fid, 
root, + lod_update_log_name, + S_IFREG | S_IRUGO | S_IWUSR); + if (IS_ERR(dto)) + GOTO(out_put, rc = PTR_ERR(dto)); + + dt_object_put(env, dto); + + /* Create update log dir */ + lu_update_log_dir_fid(fid, index); + dto = local_file_find_or_create_with_fid(env, lod->lod_child, + fid, root, + lod_update_log_dir_name, + S_IFDIR | S_IRUGO | S_IWUSR); + if (IS_ERR(dto)) + GOTO(out_put, rc = PTR_ERR(dto)); + + dt_object_put(env, dto); + + rc = lod_prepare_distribute_txn(env, lod); + if (rc != 0) + GOTO(out_put, rc); + + rc = lod_sub_init_llogs(env, lod); + if (rc != 0) + GOTO(out_put, rc); + +out_put: + dt_object_put(env, root); + RETURN(rc); } @@ -514,6 +1261,30 @@ static int lod_root_get(const struct lu_env *env, return dt_root_get(env, dt2lod_dev(dev)->lod_child, f); } +static void lod_statfs_sum(struct obd_statfs *sfs, + struct obd_statfs *ost_sfs, int *bs) +{ + while (ost_sfs->os_bsize < *bs) { + *bs >>= 1; + sfs->os_bsize >>= 1; + sfs->os_bavail <<= 1; + sfs->os_blocks <<= 1; + sfs->os_bfree <<= 1; + sfs->os_granted <<= 1; + } + while (ost_sfs->os_bsize > *bs) { + ost_sfs->os_bsize >>= 1; + ost_sfs->os_bavail <<= 1; + ost_sfs->os_blocks <<= 1; + ost_sfs->os_bfree <<= 1; + ost_sfs->os_granted <<= 1; + } + sfs->os_bavail += ost_sfs->os_bavail; + sfs->os_blocks += ost_sfs->os_blocks; + sfs->os_bfree += ost_sfs->os_bfree; + sfs->os_granted += ost_sfs->os_granted; +} + /** * Implementation of dt_device_operations::dt_statfs() for LOD * @@ -522,7 +1293,73 @@ static int lod_root_get(const struct lu_env *env, static int lod_statfs(const struct lu_env *env, struct dt_device *dev, struct obd_statfs *sfs) { - return dt_statfs(env, dt2lod_dev(dev)->lod_child, sfs); + struct lod_device *lod = dt2lod_dev(dev); + struct lod_ost_desc *ost; + struct lod_mdt_desc *mdt; + struct obd_statfs ost_sfs; + int i, rc, bs; + bool mdtonly; + + rc = dt_statfs(env, dt2lod_dev(dev)->lod_child, sfs); + if (rc) + GOTO(out, rc); + + bs = sfs->os_bsize; + + sfs->os_bavail = 0; + sfs->os_blocks = 0; 
+ sfs->os_bfree = 0; + sfs->os_granted = 0; + + lod_getref(&lod->lod_mdt_descs); + lod_foreach_mdt(lod, i) { + mdt = MDT_TGT(lod, i); + LASSERT(mdt && mdt->ltd_mdt); + rc = dt_statfs(env, mdt->ltd_mdt, &ost_sfs); + /* ignore errors */ + if (rc) + continue; + sfs->os_files += ost_sfs.os_files; + sfs->os_ffree += ost_sfs.os_ffree; + lod_statfs_sum(sfs, &ost_sfs, &bs); + } + lod_putref(lod, &lod->lod_mdt_descs); + + /* at some point we can check whether DoM is enabled and + * decide how to account MDT space. for simplicity let's + * just fallback to pre-DoM policy if any OST is alive */ + mdtonly = true; + + lod_getref(&lod->lod_ost_descs); + lod_foreach_ost(lod, i) { + ost = OST_TGT(lod, i); + LASSERT(ost && ost->ltd_ost); + rc = dt_statfs(env, ost->ltd_ost, &ost_sfs); + /* ignore errors */ + if (rc || ost_sfs.os_bsize == 0) + continue; + if (mdtonly) { + /* if only MDTs and DoM report MDT space, + * otherwise only OST space */ + sfs->os_bavail = 0; + sfs->os_blocks = 0; + sfs->os_bfree = 0; + sfs->os_granted = 0; + mdtonly = false; + } + ost_sfs.os_bavail += ost_sfs.os_granted; + lod_statfs_sum(sfs, &ost_sfs, &bs); + LASSERTF(bs == ost_sfs.os_bsize, "%d != %d\n", + (int)sfs->os_bsize, (int)ost_sfs.os_bsize); + } + lod_putref(lod, &lod->lod_ost_descs); + sfs->os_state |= OS_STATE_SUM; + + /* a single successful statfs should be enough */ + rc = 0; + +out: + RETURN(rc); } /** @@ -533,14 +1370,16 @@ static int lod_statfs(const struct lu_env *env, * see include/dt_object.h for the details. */ static struct thandle *lod_trans_create(const struct lu_env *env, - struct dt_device *dev) + struct dt_device *dt) { struct thandle *th; - th = dt_trans_create(env, dt2lod_dev(dev)->lod_child); + th = top_trans_create(env, dt2lod_dev(dt)->lod_child); if (IS_ERR(th)) return th; + th->th_dev = dt; + return th; } @@ -552,25 +1391,54 @@ static struct thandle *lod_trans_create(const struct lu_env *env, * * see include/dt_object.h for the details. 
*/ -static int lod_trans_start(const struct lu_env *env, struct dt_device *dev, +static int lod_trans_start(const struct lu_env *env, struct dt_device *dt, struct thandle *th) { - struct lod_device *lod = dt2lod_dev((struct dt_device *) dev); + return top_trans_start(env, dt2lod_dev(dt)->lod_child, th); +} + +static int lod_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + struct top_thandle *top_th = container_of(th, struct top_thandle, + tt_super); + return dt_trans_cb_add(top_th->tt_master_sub_thandle, dcb); +} + +/** + * add noop update to the update records + * + * Add noop updates to the update records, which is only used in + * test right now. + * + * \param[in] env execution environment + * \param[in] dt dt device of lod + * \param[in] th thandle + * \param[in] count the count of update records to be added. + * + * \retval 0 if adding succeeds. + * \retval negative errno if adding fails. + */ +static int lod_add_noop_records(const struct lu_env *env, + struct dt_device *dt, struct thandle *th, + int count) +{ + struct top_thandle *top_th; + struct lu_fid *fid = &lod_env_info(env)->lti_fid; + int i; int rc = 0; - if (unlikely(th->th_update != NULL)) { - struct thandle_update *tu = th->th_update; - struct dt_update_request *update; + top_th = container_of(th, struct top_thandle, tt_super); + if (top_th->tt_multiple_thandle == NULL) + return 0; - list_for_each_entry(update, &tu->tu_remote_update_list, - dur_list) { - LASSERT(update->dur_dt != NULL); - rc = dt_trans_start(env, update->dur_dt, th); - if (rc != 0) - return rc; - } + fid_zero(fid); + for (i = 0; i < count; i++) { + rc = update_record_pack(noop, th, fid); + if (rc < 0) + return rc; } - return dt_trans_start(env, lod->lod_child, th); + return rc; } /** @@ -584,27 +1452,14 @@ static int lod_trans_start(const struct lu_env *env, struct dt_device *dev, static int lod_trans_stop(const struct lu_env *env, struct dt_device *dt, struct thandle *th) { - struct thandle_update *tu = 
th->th_update; - struct dt_update_request *update; - struct dt_update_request *tmp; - int rc2 = 0; - int rc; - ENTRY; - - rc = dt_trans_stop(env, th->th_dev, th); - if (likely(tu == NULL)) - RETURN(rc); + if (OBD_FAIL_CHECK(OBD_FAIL_SPLIT_UPDATE_REC)) { + int rc; - list_for_each_entry_safe(update, tmp, - &tu->tu_remote_update_list, - dur_list) { - /* update will be freed inside dt_trans_stop */ - rc2 = dt_trans_stop(env, update->dur_dt, th); - if (unlikely(rc2 != 0 && rc == 0)) - rc = rc2; + rc = lod_add_noop_records(env, dt, th, 5000); + if (rc < 0) + RETURN(rc); } - - RETURN(rc); + return top_trans_stop(env, dt2lod_dev(dt)->lod_child, th); } /** @@ -633,6 +1488,7 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) { struct lod_device *lod = dt2lod_dev(dev); struct lod_ost_desc *ost; + struct lod_mdt_desc *mdt; unsigned int i; int rc = 0; ENTRY; @@ -641,14 +1497,41 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) lod_foreach_ost(lod, i) { ost = OST_TGT(lod, i); LASSERT(ost && ost->ltd_ost); + if (!ost->ltd_active) + continue; rc = dt_sync(env, ost->ltd_ost); if (rc) { - CERROR("%s: can't sync %u: %d\n", - lod2obd(lod)->obd_name, i, rc); - break; + if (rc != -ENOTCONN) { + CERROR("%s: can't sync ost %u: %d\n", + lod2obd(lod)->obd_name, i, rc); + break; + } + rc = 0; } } lod_putref(lod, &lod->lod_ost_descs); + + if (rc) + RETURN(rc); + + lod_getref(&lod->lod_mdt_descs); + lod_foreach_mdt(lod, i) { + mdt = MDT_TGT(lod, i); + LASSERT(mdt && mdt->ltd_mdt); + if (!mdt->ltd_active) + continue; + rc = dt_sync(env, mdt->ltd_mdt); + if (rc) { + if (rc != -ENOTCONN) { + CERROR("%s: can't sync mdt %u: %d\n", + lod2obd(lod)->obd_name, i, rc); + break; + } + rc = 0; + } + } + lod_putref(lod, &lod->lod_mdt_descs); + if (rc == 0) rc = dt_sync(env, lod->lod_child); @@ -679,17 +1562,6 @@ static int lod_commit_async(const struct lu_env *env, struct dt_device *dev) return dt_commit_async(env, dt2lod_dev(dev)->lod_child); } -/** - * Not used 
- */ -static int lod_init_capa_ctxt(const struct lu_env *env, struct dt_device *dev, - int mode, unsigned long timeout, - __u32 alg, struct lustre_capa_key *keys) -{ - struct dt_device *next = dt2lod_dev(dev)->lod_child; - return dt_init_capa_ctxt(env, next, mode, timeout, alg, keys); -} - static const struct dt_device_operations lod_dt_ops = { .dt_root_get = lod_root_get, .dt_statfs = lod_statfs, @@ -700,7 +1572,7 @@ static const struct dt_device_operations lod_dt_ops = { .dt_sync = lod_sync, .dt_ro = lod_ro, .dt_commit_async = lod_commit_async, - .dt_init_capa_ctxt = lod_init_capa_ctxt, + .dt_trans_cb_add = lod_trans_cb_add, }; /** @@ -742,7 +1614,7 @@ static int lod_connect_to_osd(const struct lu_env *env, struct lod_device *lod, * we use "-MDT" to differentiate 2.x from 1.8 */ if ((p = lustre_cfg_string(cfg, 0)) && strstr(p, "-mdtlov")) { - len = strlen(p) + 1; + len = strlen(p) + 6; OBD_ALLOC(nextdev, len); if (nextdev == NULL) GOTO(out, rc = -ENOMEM); @@ -872,6 +1744,7 @@ static int lod_init0(const struct lu_env *env, struct lod_device *lod, dt_conf_get(env, &lod->lod_dt_dev, &ddp); lod->lod_osd_max_easize = ddp.ddp_max_ea_size; + lod->lod_dom_max_stripesize = (1ULL << 20); /* 1Mb as default value */ /* setup obd to be used with old lov code */ rc = lod_pools_init(lod, cfg); @@ -882,7 +1755,7 @@ static int lod_init0(const struct lu_env *env, struct lod_device *lod, if (rc) GOTO(out_pools, rc); - spin_lock_init(&lod->lod_desc_lock); + spin_lock_init(&lod->lod_lock); spin_lock_init(&lod->lod_connects_lock); lod_tgt_desc_init(&lod->lod_mdt_descs); lod_tgt_desc_init(&lod->lod_ost_descs); @@ -910,7 +1783,12 @@ static struct lu_device *lod_device_free(const struct lu_env *env, struct lu_device *next = &lod->lod_child->dd_lu_dev; ENTRY; - LASSERT(atomic_read(&lu->ld_ref) == 0); + if (atomic_read(&lu->ld_ref) > 0 && + !cfs_hash_is_empty(lu->ld_site->ls_obj_hash)) { + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); + lu_site_print(env, lu->ld_site, &msgdata, 
lu_cdebug_printer); + } + LASSERTF(atomic_read(&lu->ld_ref) == 0, "lu is %p\n", lu); dt_device_fini(&lod->lod_dt_dev); OBD_FREE_PTR(lod); RETURN(next); @@ -948,6 +1826,15 @@ static struct lu_device *lod_device_alloc(const struct lu_env *env, return lu_dev; } +static void lod_avoid_guide_fini(struct lod_avoid_guide *lag) +{ + if (lag->lag_oss_avoid_array) + OBD_FREE(lag->lag_oss_avoid_array, + sizeof(__u32) * lag->lag_oaa_size); + if (lag->lag_ost_avoid_bitmap) + CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap); +} + /** * Implementation of lu_device_type_operations::ldto_device_fini() for LOD * @@ -1070,6 +1957,9 @@ static void lod_key_fini(const struct lu_context *ctx, struct lu_context_key *key, void *data) { struct lod_thread_info *info = data; + struct lod_layout_component *lds = + info->lti_def_striping.lds_def_comp_entries; + /* allocated in lod_get_lov_ea * XXX: this is overload, a tread may have such store but used only * once. Probably better would be pool of such stores per LOD. @@ -1080,6 +1970,16 @@ static void lod_key_fini(const struct lu_context *ctx, info->lti_ea_store_size = 0; } lu_buf_free(&info->lti_linkea_buf); + + if (lds != NULL) + lod_free_def_comp_entries(&info->lti_def_striping); + + if (info->lti_comp_size > 0) + OBD_FREE(info->lti_comp_idx, + info->lti_comp_size * sizeof(__u32)); + + lod_avoid_guide_fini(&info->lti_avoid); + OBD_FREE_PTR(info); } @@ -1113,6 +2013,8 @@ static struct lu_device_type lod_device_type = { * * Currently, there is only one supported key: KEY_OSP_CONNECTED , to provide * the caller binary status whether LOD has seen connection to any OST target. + * It will also check if the MDT update log context being initialized (if + * needed). 
* * \param[in] env LU environment provided by the caller * \param[in] exp export of the caller @@ -1120,7 +2022,6 @@ static struct lu_device_type lod_device_type = { * \param[in] key the key * \param[in] vallen not used * \param[in] val not used - * \param[in] lsm not used * * \retval 0 if a connection was seen * \retval -EAGAIN if LOD isn't running yet or no @@ -1128,15 +2029,14 @@ static struct lu_device_type lod_device_type = { * \retval -EINVAL if not supported key is requested **/ static int lod_obd_get_info(const struct lu_env *env, struct obd_export *exp, - __u32 keylen, void *key, __u32 *vallen, void *val, - struct lov_stripe_md *lsm) + __u32 keylen, void *key, __u32 *vallen, void *val) { int rc = -EINVAL; if (KEY_IS(KEY_OSP_CONNECTED)) { struct obd_device *obd = exp->exp_obd; struct lod_device *d; - struct lod_ost_desc *ost; + struct lod_tgt_desc *tgt; unsigned int i; int rc = 1; @@ -1146,36 +2046,131 @@ static int lod_obd_get_info(const struct lu_env *env, struct obd_export *exp, d = lu2lod_dev(obd->obd_lu_dev); lod_getref(&d->lod_ost_descs); lod_foreach_ost(d, i) { - ost = OST_TGT(d, i); - LASSERT(ost && ost->ltd_ost); - - rc = obd_get_info(env, ost->ltd_exp, keylen, key, - vallen, val, lsm); + tgt = OST_TGT(d, i); + LASSERT(tgt && tgt->ltd_tgt); + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val); /* one healthy device is enough */ if (rc == 0) break; } lod_putref(d, &d->lod_ost_descs); + + lod_getref(&d->lod_mdt_descs); + lod_foreach_mdt(d, i) { + struct llog_ctxt *ctxt; + + tgt = MDT_TGT(d, i); + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_tgt != NULL); + if (!tgt->ltd_active) + continue; + + ctxt = llog_get_context(tgt->ltd_tgt->dd_lu_dev.ld_obd, + LLOG_UPDATELOG_ORIG_CTXT); + if (ctxt == NULL) { + CDEBUG(D_INFO, "%s: %s is not ready.\n", + obd->obd_name, + tgt->ltd_tgt->dd_lu_dev.ld_obd->obd_name); + rc = -EAGAIN; + break; + } + if (ctxt->loc_handle == NULL) { + CDEBUG(D_INFO, "%s: %s is not ready.\n", + obd->obd_name, + 
tgt->ltd_tgt->dd_lu_dev.ld_obd->obd_name); + rc = -EAGAIN; + llog_ctxt_put(ctxt); + break; + } + llog_ctxt_put(ctxt); + } + lod_putref(d, &d->lod_mdt_descs); + RETURN(rc); } RETURN(rc); } +static int lod_obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lod_device *d; + struct lod_tgt_desc *tgt; + int no_set = 0; + int i, rc = 0, rc2; + ENTRY; + + if (set == NULL) { + no_set = 1; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + d = lu2lod_dev(obd->obd_lu_dev); + lod_getref(&d->lod_ost_descs); + lod_foreach_ost(d, i) { + tgt = OST_TGT(d, i); + LASSERT(tgt && tgt->ltd_tgt); + if (!tgt->ltd_active) + continue; + + rc2 = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); + if (rc2 != 0 && rc == 0) + rc = rc2; + } + lod_putref(d, &d->lod_ost_descs); + + lod_getref(&d->lod_mdt_descs); + lod_foreach_mdt(d, i) { + tgt = MDT_TGT(d, i); + LASSERT(tgt && tgt->ltd_tgt); + if (!tgt->ltd_active) + continue; + rc2 = obd_set_info_async(env, tgt->ltd_exp, keylen, key, + vallen, val, set); + if (rc2 != 0 && rc == 0) + rc = rc2; + } + lod_putref(d, &d->lod_mdt_descs); + + + if (no_set) { + rc2 = ptlrpc_set_wait(env, set); + if (rc2 == 0 && rc == 0) + rc = rc2; + ptlrpc_set_destroy(set); + } + RETURN(rc); +} + static struct obd_ops lod_obd_device_ops = { .o_owner = THIS_MODULE, .o_connect = lod_obd_connect, .o_disconnect = lod_obd_disconnect, .o_get_info = lod_obd_get_info, + .o_set_info_async = lod_obd_set_info_async, .o_pool_new = lod_pool_new, .o_pool_rem = lod_pool_remove, .o_pool_add = lod_pool_add, .o_pool_del = lod_pool_del, }; -static int __init lod_mod_init(void) +static struct obd_type sym; + +static int __init lod_init(void) { + struct dentry *symlink; struct obd_type *type; + struct kobject *kobj; + struct qstr dname; int rc; rc = lu_kmem_init(lod_caches); @@ -1189,32 
+2184,67 @@ static int __init lod_mod_init(void) return rc; } - /* create "lov" entry in procfs for compatibility purposes */ + /* create "lov" entry for compatibility purposes */ + dname.name = "lov"; + dname.len = strlen(dname.name); + dname.hash = ll_full_name_hash(debugfs_lustre_root, dname.name, + dname.len); + symlink = d_lookup(debugfs_lustre_root, &dname); + if (!symlink) { + symlink = debugfs_create_dir(dname.name, debugfs_lustre_root); + if (IS_ERR_OR_NULL(symlink)) { + rc = symlink ? PTR_ERR(symlink) : -ENOMEM; + GOTO(no_lov, rc); + } + sym.typ_debugfs_entry = symlink; + } else { + dput(symlink); + } + + kobj = kset_find_obj(lustre_kset, dname.name); + if (kobj) { + kobject_put(kobj); + goto try_proc; + } + + kobj = class_setup_tunables(dname.name); + if (IS_ERR(kobj)) { + rc = PTR_ERR(kobj); + if (sym.typ_debugfs_entry) + ldebugfs_remove(&sym.typ_debugfs_entry); + GOTO(no_lov, rc); + } + sym.typ_kobj = kobj; + +try_proc: type = class_search_type(LUSTRE_LOV_NAME); if (type != NULL && type->typ_procroot != NULL) - return rc; + GOTO(no_lov, rc); type = class_search_type(LUSTRE_LOD_NAME); - type->typ_procsym = lprocfs_seq_register("lov", proc_lustre_root, - NULL, NULL); + type->typ_procsym = lprocfs_register("lov", proc_lustre_root, + NULL, NULL); if (IS_ERR(type->typ_procsym)) { CERROR("lod: can't create compat entry \"lov\": %d\n", (int)PTR_ERR(type->typ_procsym)); type->typ_procsym = NULL; } +no_lov: return rc; } -static void __exit lod_mod_exit(void) +static void __exit lod_exit(void) { + ldebugfs_remove(&sym.typ_debugfs_entry); + kobject_put(sym.typ_kobj); class_unregister_type(LUSTRE_LOD_NAME); lu_kmem_fini(lod_caches); } -MODULE_AUTHOR("Whamcloud, Inc. "); +MODULE_AUTHOR("OpenSFS, Inc. "); MODULE_DESCRIPTION("Lustre Logical Object Device ("LUSTRE_LOD_NAME")"); +MODULE_VERSION(LUSTRE_VERSION_STRING); MODULE_LICENSE("GPL"); -module_init(lod_mod_init); -module_exit(lod_mod_exit); - +module_init(lod_init); +module_exit(lod_exit);