X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fmdd%2Fmdd_trans.c;h=c2581ada6e68c1ea5af780eded4b26074a2999be;hp=001cac394acf45b62ada65f533aeea16d4d89336;hb=bec1334954a73ed668fad409e8c728f9dfd6bb99;hpb=dc9b1f76d1d0ea7326dfd77b0361b0baf1e67aed diff --git a/lustre/mdd/mdd_trans.c b/lustre/mdd/mdd_trans.c index 001cac3..c2581ad 100644 --- a/lustre/mdd/mdd_trans.c +++ b/lustre/mdd/mdd_trans.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -17,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -29,12 +23,10 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011 Whamcloud, Inc. - * + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lustre/mdd/mdd_trans.c * @@ -43,274 +35,197 @@ * Author: Wang Di */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_MDS -#include -#ifdef HAVE_EXT4_LDISKFS -#include -#else -#include -#endif -#include +#include + #include -#include -#include #include - -#ifdef HAVE_EXT4_LDISKFS -#include -#else -#include -#endif #include -#include +#include #include "mdd_internal.h" -static int dto_txn_credits[DTO_NR]; - -int mdd_txn_start_cb(const struct lu_env *env, struct txn_param *param, - void *cookie) +struct thandle *mdd_trans_create(const struct lu_env *env, + struct mdd_device *mdd) { - struct mdd_device *mdd = cookie; - struct obd_device *obd = mdd2obd_dev(mdd); - /* Each transaction updates lov objids, the credits should be added for - * this */ - int blk, shift = mdd->mdd_dt_conf.ddp_block_shift; - blk = ((obd->u.mds.mds_lov_desc.ld_tgt_count * sizeof(obd_id) + - (1 << shift) - 1) >> shift) + 1; - - /* add lov objids credits */ - param->tp_credits += blk * dto_txn_credits[DTO_WRITE_BLOCK] + - dto_txn_credits[DTO_WRITE_BASE]; - - return 0; -} + struct thandle *th; + struct lu_ucred *uc = lu_ucred_check(env); -int mdd_txn_stop_cb(const struct lu_env *env, struct thandle *txn, - void *cookie) -{ - struct mdd_device *mdd = cookie; - struct obd_device *obd = mdd2obd_dev(mdd); + /* If blocked by the write barrier, then return "-EINPROGRESS" + * to the caller. Usually, such error will be forwarded to the + * client, and the expected behaviour is to re-try such modify + * RPC some time later until the barrier is thawed or expired. 
*/ + if (unlikely(!barrier_entry(mdd->mdd_bottom))) + return ERR_PTR(-EINPROGRESS); - LASSERT(obd); - return mds_lov_write_objids(obd); -} + th = mdd_child_ops(mdd)->dt_trans_create(env, mdd->mdd_child); + if (!IS_ERR(th) && uc) + th->th_ignore_quota = !!cap_raised(uc->uc_cap, CAP_SYS_RESOURCE); -int mdd_txn_commit_cb(const struct lu_env *env, struct thandle *txn, - void *cookie) -{ - return 0; + return th; } -void mdd_txn_param_build(const struct lu_env *env, struct mdd_device *mdd, - enum mdd_txn_op op, int changelog_cnt) +int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd, + struct thandle *th) { - LASSERT(0 <= op && op < MDD_TXN_LAST_OP); - - txn_param_init(&mdd_env_info(env)->mti_param, - mdd->mdd_tod[op].mod_credits); - if (changelog_cnt > 0) { - txn_param_credit_add(&mdd_env_info(env)->mti_param, - changelog_cnt * dto_txn_credits[DTO_LOG_REC]); - } + return mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, th); } -int mdd_create_txn_param_build(const struct lu_env *env, struct mdd_device *mdd, - struct lov_mds_md *lmm, enum mdd_txn_op op, - int changelog_cnt) +struct mdd_changelog_gc { + struct mdd_device *mcgc_mdd; + __u32 mcgc_id; + __u32 mcgc_mintime; + __u64 mcgc_minrec; + char mcgc_name[CHANGELOG_USER_NAMELEN_FULL]; +}; + +/* return first registered ChangeLog user idle since too long + * use ChangeLog's user plain LLOG mtime for this */ +static int mdd_changelog_gc_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) { - int stripes = 0; - ENTRY; - - LASSERT(op == MDD_TXN_CREATE_DATA_OP || op == MDD_TXN_MKDIR_OP); - - if (lmm == NULL) - GOTO(out, 0); - /* only replay create request will cause lov_objid update */ - if (!mdd->mdd_obd_dev->obd_recovering) - GOTO(out, 0); - - /* add possible orphan unlink rec credits used in lov_objid update */ - if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) { - stripes = le32_to_cpu(((struct lov_mds_md_v1*)lmm) - ->lmm_stripe_count); - } else if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3){ - stripes = le32_to_cpu(((struct lov_mds_md_v3*)lmm) - ->lmm_stripe_count); - } else { - CERROR("Unknown lmm type %X\n", le32_to_cpu(lmm->lmm_magic)); - LBUG(); - } -out: - mdd_txn_param_build(env, mdd, op, stripes + changelog_cnt); - RETURN(0); + struct llog_changelog_user_rec2 *rec; + struct mdd_changelog_gc *mcgc = data; + struct mdd_device *mdd = mcgc->mcgc_mdd; + + ENTRY; + + if ((llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) == 0) + RETURN(-ENXIO); + + rec = container_of(hdr, typeof(*rec), cur_hdr); + + if (rec->cur_endrec < mcgc->mcgc_minrec && + (mdd->mdd_changelog_emrg_gc || + mdd_changelog_is_too_idle(mdd, rec->cur_endrec, rec->cur_time))) { + mcgc->mcgc_mintime = rec->cur_time; + mcgc->mcgc_minrec = rec->cur_endrec; + mcgc->mcgc_id = rec->cur_id; + mdd_chlg_username(rec, mcgc->mcgc_name, + sizeof(mcgc->mcgc_name)); + } + RETURN(0); } -int mdd_log_txn_param_build(const struct lu_env *env, struct md_object *obj, - struct md_attr *ma, enum mdd_txn_op op, - int changelog_cnt) +/* recover space from long-term inactive ChangeLog users */ +static int mdd_chlg_garbage_collect(void *data) { - struct mdd_device *mdd = mdo2mdd(&md2mdd_obj(obj)->mod_obj); - int rc, stripe = 0; - ENTRY; - - if (S_ISDIR(lu_object_attr(&obj->mo_lu))) - GOTO(out, rc = 0); - - LASSERT(op == MDD_TXN_UNLINK_OP || op == MDD_TXN_RENAME_OP || - op == MDD_TXN_RENAME_TGT_OP); - rc = mdd_lmm_get_locked(env, md2mdd_obj(obj), ma); - if (rc || !(ma->ma_valid & MA_LOV)) - GOTO(out, rc); - - 
LASSERTF(le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V1 || - le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V3, - "%08x", le32_to_cpu(ma->ma_lmm->lmm_magic)); - - if ((int)le32_to_cpu(ma->ma_lmm->lmm_stripe_count) < 0) - stripe = mdd2obd_dev(mdd)->u.mds.mds_lov_desc.ld_tgt_count; - else - stripe = le32_to_cpu(ma->ma_lmm->lmm_stripe_count); - + struct mdd_device *mdd = data; + struct lu_env *env = NULL; + int rc; + struct llog_ctxt *ctxt; + + ENTRY; + + mdd->mdd_cl.mc_gc_task = current; + + CDEBUG(D_HA, "%s: ChangeLog garbage collect thread start with PID %d\n", + mdd2obd_dev(mdd)->obd_name, current->pid); + + OBD_ALLOC_PTR(env); + if (!env) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(env, LCT_MD_THREAD); + if (rc) + GOTO(out_free, rc); + + ctxt = llog_get_context(mdd2obd_dev(mdd), + LLOG_CHANGELOG_USER_ORIG_CTXT); + if (!ctxt) + GOTO(out_env, rc = -ENXIO); + if (!(ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) + GOTO(out_ctxt, rc = -ENXIO); + + for (;;) { + __u32 time_now = (__u32)ktime_get_real_seconds(); + struct mdd_changelog_gc mcgc = { + .mcgc_mdd = mdd, + .mcgc_minrec = mdd->mdd_cl.mc_index, + .mcgc_name = { 0 }, + }; + + rc = llog_cat_process(env, ctxt->loc_handle, + mdd_changelog_gc_cb, &mcgc, 0, 0); + if (rc) + GOTO(out_ctxt, rc); + + if (!mcgc.mcgc_name[0]) + break; + + CWARN("%s: force deregister of changelog user %s idle for %us with %llu unprocessed records\n", + mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_name, + time_now - mcgc.mcgc_mintime, + mdd->mdd_cl.mc_index - mcgc.mcgc_minrec); + + mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id); + + if (mdd->mdd_changelog_emrg_gc && + mdd_changelog_is_space_safe(env, mdd, ctxt->loc_handle, 0)) + mdd->mdd_changelog_emrg_gc = false; + + if (kthread_should_stop()) + GOTO(out_ctxt, rc = 0); + } + EXIT; +out_ctxt: + llog_ctxt_put(ctxt); +out_env: + lu_env_fini(env); +out_free: + OBD_FREE_PTR(env); out: - mdd_txn_param_build(env, mdd, op, stripe + changelog_cnt); - - RETURN(rc); -} - -int mdd_setattr_txn_param_build(const struct lu_env *env, struct md_object *obj, - struct md_attr *ma, enum mdd_txn_op op, - int changelog_cnt) -{ - struct mdd_device *mdd = mdo2mdd(&md2mdd_obj(obj)->mod_obj); - ENTRY; - - mdd_txn_param_build(env, mdd, op, changelog_cnt); - if (ma->ma_attr.la_valid & (LA_UID | LA_GID)) - txn_param_credit_add(&mdd_env_info(env)->mti_param, - dto_txn_credits[DTO_ATTR_SET_CHOWN]); - - /* permission changes may require sync operation */ - if (ma->ma_attr.la_valid & (LA_MODE|LA_UID|LA_GID) && - mdd->mdd_sync_permission == 1) - txn_param_sync(&mdd_env_info(env)->mti_param); - - RETURN(0); -} - -static void mdd_txn_init_dto_credits(const struct lu_env *env, - struct mdd_device *mdd, int *dto_credits) -{ - int op, credits; - for (op = 0; op < DTO_NR; op++) { - credits = mdd_child_ops(mdd)->dt_credit_get(env, mdd->mdd_child, - op); - LASSERT(credits >= 0); - dto_txn_credits[op] = credits; - } -} - -int mdd_txn_init_credits(const struct lu_env *env, struct mdd_device *mdd) -{ - int op; - - /* Init credits for each ops. */ - mdd_txn_init_dto_credits(env, mdd, dto_txn_credits); - - /* Calculate the mdd credits. 
*/
-	for (op = MDD_TXN_OBJECT_DESTROY_OP; op < MDD_TXN_LAST_OP; op++) {
-		int *c = &mdd->mdd_tod[op].mod_credits;
-		int *dt = dto_txn_credits;
-		mdd->mdd_tod[op].mod_op = op;
-		switch(op) {
-			case MDD_TXN_OBJECT_DESTROY_OP:
-				/* Unused now */
-				*c = dt[DTO_OBJECT_DELETE];
-				break;
-			case MDD_TXN_OBJECT_CREATE_OP:
-				/* OI INSERT + CREATE OBJECT */
-				*c = dt[DTO_INDEX_INSERT] +
-				     dt[DTO_OBJECT_CREATE];
-				break;
-			case MDD_TXN_ATTR_SET_OP:
-				/* ATTR set + XATTR(lsm, lmv) set */
-				*c = dt[DTO_ATTR_SET_BASE] +
-				     dt[DTO_XATTR_SET];
-				break;
-			case MDD_TXN_XATTR_SET_OP:
-				*c = dt[DTO_XATTR_SET];
-				break;
-			case MDD_TXN_INDEX_INSERT_OP:
-				*c = dt[DTO_INDEX_INSERT];
-				break;
-			case MDD_TXN_INDEX_DELETE_OP:
-				*c = dt[DTO_INDEX_DELETE];
-				break;
-			case MDD_TXN_LINK_OP:
-				*c = dt[DTO_INDEX_INSERT];
-				break;
-			case MDD_TXN_UNLINK_OP:
-				/* delete index + Unlink log +
-				 * mdd orphan handling */
-				*c = dt[DTO_INDEX_DELETE] +
-				     dt[DTO_INDEX_DELETE] +
-				     dt[DTO_INDEX_INSERT] * 2 +
-				     dt[DTO_XATTR_SET] * 3;
-				break;
-			case MDD_TXN_RENAME_OP:
-				/* 2 delete index + 1 insert + Unlink log */
-				*c = 2 * dt[DTO_INDEX_DELETE] +
-				     dt[DTO_INDEX_INSERT] +
-				     dt[DTO_INDEX_DELETE] +
-				     dt[DTO_INDEX_INSERT] * 2 +
-				     dt[DTO_XATTR_SET] * 3;
-				break;
-			case MDD_TXN_RENAME_TGT_OP:
-				/* index insert + index delete */
-				*c = dt[DTO_INDEX_DELETE] +
-				     dt[DTO_INDEX_INSERT] +
-				     dt[DTO_INDEX_DELETE] +
-				     dt[DTO_INDEX_INSERT] * 2 +
-				     dt[DTO_XATTR_SET] * 3;
-				break;
-			case MDD_TXN_CREATE_DATA_OP:
-				/* same as set xattr(lsm) */
-				*c = dt[DTO_XATTR_SET];
-				break;
-			case MDD_TXN_MKDIR_OP:
-				/* INDEX INSERT + OI INSERT +
-				 * CREATE_OBJECT_CREDITS
-				 * SET_MD CREDITS is already counted in
-				 * CREATE_OBJECT CREDITS
-				 */
-				*c = 2 * dt[DTO_INDEX_INSERT] +
-				     dt[DTO_OBJECT_CREATE];
-				break;
-			default:
-				CERROR("Invalid op %d init its credit\n", op);
-				LBUG();
-		}
-	}
-	RETURN(0);
-}
-
-struct thandle* mdd_trans_start(const struct lu_env *env,
-                                struct mdd_device *mdd)
-{
-	struct txn_param *p = &mdd_env_info(env)->mti_param;
-	struct thandle *th;
+	spin_lock(&mdd->mdd_cl.mc_lock);
+	mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+	spin_unlock(&mdd->mdd_cl.mc_lock);
 
-	th = mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, p);
-	return th;
+	return rc;
 }
 
-void mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
-                    int result, struct thandle *handle)
+int mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
+		   int result, struct thandle *handle)
 {
-	handle->th_result = result;
-	mdd_child_ops(mdd)->dt_trans_stop(env, handle);
+	int rc;
+
+	handle->th_result = result;
+	rc = mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
+	barrier_exit(mdd->mdd_bottom);
+
+	/* Bottom half of the changelog garbage-collection mechanism,
+	 * started from mdd_changelog_store(). The thread is spawned
+	 * here, after the transaction has been stopped, because forking
+	 * a kthread while a journal transaction is being filled can
+	 * deadlock: if memory reclaim is triggered by kthreadd when
+	 * forking the new thread, shrinkers may issue I/Os to the same
+	 * device that need a new journal transaction, which cannot be
+	 * started while the current one never completes (LU-10680).
+	 */
+	if (unlikely(mdd->mdd_cl.mc_flags & CLM_ON &&
+		     cmpxchg(&mdd->mdd_cl.mc_gc_task, MDD_CHLG_GC_NEED,
+			     MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED)) {
+		/* XXX we may want to cmpxchg() only if MDD_CHLG_GC_NEED
+		 * to save its cost in the frequent case and have an extra
+		 * if/test cost in the rare case where we need to spawn?
+ */ + struct task_struct *gc_task; + struct obd_device *obd = mdd2obd_dev(mdd); + + gc_task = kthread_run(mdd_chlg_garbage_collect, mdd, + "chlg_gc_thread"); + if (IS_ERR(gc_task)) { + CERROR("%s: cannot start ChangeLog garbage collection " + "thread: rc = %ld\n", obd->obd_name, + PTR_ERR(gc_task)); + mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE; + } else { + CDEBUG(D_HA, "%s: a ChangeLog garbage collection " + "thread has been started\n", obd->obd_name); + } + } + + /* if operation failed, return \a result, otherwise return status of + * dt_trans_stop */ + return result ?: rc; }
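
The helpers added by this patch follow the usual create/declare/start/stop transaction pattern used by MDD callers. The sketch below is illustrative only and not part of the patch: mdd_declare_change() is a hypothetical placeholder for the operation-specific declaration step, and the function assumes the usual mdd_internal.h declarations are in scope. What it shows is the intended pairing of mdd_trans_create() with mdd_trans_stop(), so that barrier_entry()/barrier_exit() stay balanced even on error paths.

/* Illustrative sketch only (not part of the patch): a typical caller of the
 * helpers above. mdd_declare_change() is a hypothetical placeholder for the
 * operation-specific dt_declare_*() calls.
 */
static int mdd_example_op(const struct lu_env *env, struct mdd_device *mdd,
			  struct mdd_object *obj)
{
	struct thandle *handle;
	int rc;

	/* may return ERR_PTR(-EINPROGRESS) while the write barrier is set */
	handle = mdd_trans_create(env, mdd);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rc = mdd_declare_change(env, mdd, obj, handle);	/* placeholder */
	if (rc)
		goto stop;

	rc = mdd_trans_start(env, mdd, handle);
	if (rc)
		goto stop;

	/* ... apply the change under the started transaction ... */

stop:
	/* always stop the handle: this drops the barrier reference taken in
	 * mdd_trans_create() and may spawn the changelog GC thread */
	rc = mdd_trans_stop(env, mdd, rc, handle);
	return rc;
}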