X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fmdd%2Fmdd_trans.c;h=075110d819b0e9276f23b3207c9dfd68211a36df;hp=96d7c88e2bb0d42403e4e7b248bcbdd3b70edc06;hb=3cce65712d94cffe8f1626545845b95b88aef672;hpb=b2fa3d79a26e6a161e6470386a90e9061482b930

diff --git a/lustre/mdd/mdd_trans.c b/lustre/mdd/mdd_trans.c
index 96d7c88..075110d 100644
--- a/lustre/mdd/mdd_trans.c
+++ b/lustre/mdd/mdd_trans.c
@@ -15,11 +15,7 @@
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
@@ -27,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -42,16 +38,33 @@
 
 #define DEBUG_SUBSYSTEM S_MDS
 
+#include <linux/kthread.h>
+
+#include <obd_class.h>
 #include <lprocfs_status.h>
 #include <lustre_mds.h>
+#include <lustre_barrier.h>
 #include "mdd_internal.h"
 
 struct thandle *mdd_trans_create(const struct lu_env *env,
				  struct mdd_device *mdd)
 {
-	return mdd_child_ops(mdd)->dt_trans_create(env, mdd->mdd_child);
+	struct thandle *th;
+	struct lu_ucred *uc = lu_ucred_check(env);
+
+	/* If blocked by the write barrier, return "-EINPROGRESS" to the
+	 * caller. Usually such an error is forwarded to the client, and
+	 * the expected behaviour is to retry the modify RPC some time
+	 * later, until the barrier is thawed or expires. */
+	if (unlikely(!barrier_entry(mdd->mdd_bottom)))
+		return ERR_PTR(-EINPROGRESS);
+
+	th = mdd_child_ops(mdd)->dt_trans_create(env, mdd->mdd_child);
+	if (!IS_ERR(th) && uc)
+		th->th_ignore_quota = !!md_capable(uc, CFS_CAP_SYS_RESOURCE);
+
+	return th;
 }
 
 int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd,
@@ -60,9 +73,195 @@ int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd,
 	return mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, th);
 }
 
+struct mdd_changelog_gc {
+	struct mdd_device *mcgc_mdd;
+	__u32 mcgc_id;
+	__u32 mcgc_maxtime;
+	__u64 mcgc_maxindexes;
+	bool mcgc_found;
+};
+
+/* return the first registered ChangeLog user that has been idle too long;
+ * use the ChangeLog user's plain LLOG mtime for this */
+static int mdd_changelog_gc_cb(const struct lu_env *env,
+			       struct llog_handle *llh,
+			       struct llog_rec_hdr *hdr, void *data)
+{
+	struct llog_changelog_user_rec *rec;
+	struct mdd_changelog_gc *mcgc = (struct mdd_changelog_gc *)data;
+	struct mdd_device *mdd = mcgc->mcgc_mdd;
+	ENTRY;
+
+	if ((llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) == 0)
+		RETURN(-ENXIO);
+
+	rec = container_of(hdr, struct llog_changelog_user_rec, cur_hdr);
+
+	/* find the oldest idle user, based on the last record update/cancel
+	 * time (new behavior) or, for old user records, on the last record
+	 * index vs the current ChangeLog index.
+	 * Users with the old record format are treated first, as we assume
+	 * they have been idle longer
+	 */
+	if (rec->cur_time != 0) {
+		__u32 time_now = (__u32)ktime_get_real_seconds();
+		__u32 time_out = rec->cur_time +
+				 mdd->mdd_changelog_max_idle_time;
+		__u32 idle_time = time_now - rec->cur_time;
+
+		/* treat the oldest idle user first, and only if no old-format
+		 * user has already been selected
+		 */
+		if (time_after32(time_now, time_out) &&
+		    idle_time > mcgc->mcgc_maxtime &&
+		    mcgc->mcgc_maxindexes == 0) {
+			mcgc->mcgc_maxtime = idle_time;
+			mcgc->mcgc_id = rec->cur_id;
+			mcgc->mcgc_found = true;
+		}
+	} else {
+		/* old user record with no idle time stamp, so use an
+		 * empirical method based on its current index/position
+		 */
+		__u64 idle_indexes;
+
+		idle_indexes = mdd->mdd_cl.mc_index - rec->cur_endrec;
+
+		/* treat the user with the oldest/smallest current index
+		 * first */
+		if (idle_indexes >= mdd->mdd_changelog_max_idle_indexes &&
+		    idle_indexes > mcgc->mcgc_maxindexes) {
+			mcgc->mcgc_maxindexes = idle_indexes;
+			mcgc->mcgc_id = rec->cur_id;
+			mcgc->mcgc_found = true;
+		}
+	}
+	RETURN(0);
+}
+
+/* recover space from long-term inactive ChangeLog users */
+static int mdd_chlg_garbage_collect(void *data)
+{
+	struct mdd_device *mdd = (struct mdd_device *)data;
+	struct lu_env *env = NULL;
+	int rc;
+	struct llog_ctxt *ctxt;
+	struct mdd_changelog_gc mcgc = {
+		.mcgc_mdd = mdd,
+		.mcgc_found = false,
+		.mcgc_maxtime = 0,
+		.mcgc_maxindexes = 0,
+	};
+	ENTRY;
+
+	mdd->mdd_cl.mc_gc_task = current;
+
+	CDEBUG(D_HA, "%s: ChangeLog garbage collection thread started, "
+	       "PID %d\n", mdd2obd_dev(mdd)->obd_name, current->pid);
+
+	OBD_ALLOC_PTR(env);
+	if (env == NULL)
+		GOTO(out, rc = -ENOMEM);
+
+	rc = lu_env_init(env, LCT_MD_THREAD);
+	if (rc)
+		GOTO(out, rc);
+
+	for (;;) {
+		ctxt = llog_get_context(mdd2obd_dev(mdd),
+					LLOG_CHANGELOG_USER_ORIG_CTXT);
+		if (ctxt == NULL ||
+		    (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0)
+			GOTO(out_ctxt, rc = -ENXIO);
+
+		rc = llog_cat_process(env, ctxt->loc_handle,
+				      mdd_changelog_gc_cb, &mcgc, 0, 0);
+		if (rc != 0 || mcgc.mcgc_found == false)
+			break;
+		llog_ctxt_put(ctxt);
+
+		if (mcgc.mcgc_maxindexes != 0)
+			CWARN("%s: Force deregister of ChangeLog user cl%d "
+			      "idle with more than %llu unprocessed records\n",
+			      mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
+			      mcgc.mcgc_maxindexes);
+		else
+			CWARN("%s: Force deregister of ChangeLog user cl%d "
+			      "idle for more than %us\n",
+			      mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
+			      mcgc.mcgc_maxtime);
+
+		mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
+
+		if (kthread_should_stop())
+			GOTO(out_env, rc = 0);
+
+		/* try again to search for another candidate */
+		mcgc.mcgc_found = false;
+		mcgc.mcgc_maxtime = 0;
+		mcgc.mcgc_maxindexes = 0;
+	}
+
+out_ctxt:
+	if (ctxt != NULL)
+		llog_ctxt_put(ctxt);
+
+out_env:
+	lu_env_fini(env);
+	GOTO(out, rc);
+out:
+	if (env)
+		OBD_FREE_PTR(env);
+
+	spin_lock(&mdd->mdd_cl.mc_lock);
+	mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+	spin_unlock(&mdd->mdd_cl.mc_lock);
+
+	return rc;
+}
+
 int mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
 		   int result, struct thandle *handle)
 {
+	int rc;
+
 	handle->th_result = result;
-	return mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
+	rc = mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
+	barrier_exit(mdd->mdd_bottom);
+
+	/* bottom half of the changelog garbage-collection mechanism, started
+	 * from mdd_changelog_store().
+	 * This is required because a kthread cannot be forked while a
+	 * journal transaction is being filled: a deadlock can happen if
+	 * memory reclaim is triggered by kthreadd when forking the new
+	 * thread, since I/Os could then be attempted against the same
+	 * device from shrinkers, requiring a new journal transaction to
+	 * be started while the current one could never complete
+	 * (LU-10680).
+	 */
+	if (unlikely(mdd->mdd_cl.mc_flags & CLM_ON &&
+		     cmpxchg(&mdd->mdd_cl.mc_gc_task, MDD_CHLG_GC_NEED,
+			     MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED)) {
+		/* XXX we may want to cmpxchg() only if MDD_CHLG_GC_NEED,
+		 * to save its cost in the frequent case, at the price of
+		 * an extra if/test in the rare case where we do need to
+		 * spawn?
+		 */
+		struct task_struct *gc_task;
+		struct obd_device *obd = mdd2obd_dev(mdd);
+
+		gc_task = kthread_run(mdd_chlg_garbage_collect, mdd,
+				      "chlg_gc_thread");
+		if (IS_ERR(gc_task)) {
+			CERROR("%s: cannot start ChangeLog garbage collection "
+			       "thread: rc = %ld\n", obd->obd_name,
+			       PTR_ERR(gc_task));
+			mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+		} else {
+			CDEBUG(D_HA, "%s: a ChangeLog garbage collection "
+			       "thread has been started\n", obd->obd_name);
+		}
+	}
+
+	/* if the operation failed, return \a result, otherwise return the
+	 * status of dt_trans_stop */
+	return result ?: rc;
 }
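
The -EINPROGRESS path in mdd_trans_create() relies on the caller retrying
rather than failing the operation. A minimal userland sketch of that retry
contract; do_modify_rpc() is a hypothetical stand-in, not a Lustre function:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* stand-in for sending a modify RPC; pretends the write barrier stays
 * raised for the first two attempts, then succeeds */
static int do_modify_rpc(void)
{
	static int calls;

	return ++calls < 3 ? -EINPROGRESS : 0;
}

int main(void)
{
	int rc;

	/* retry some time later until the barrier is thawed or expires */
	while ((rc = do_modify_rpc()) == -EINPROGRESS) {
		fprintf(stderr, "write barrier raised, retrying\n");
		sleep(1);
	}
	printf("rc = %d\n", rc);
	return rc ? 1 : 0;
}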
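
mdd_changelog_gc_cb() guards the idle-time expiry test with time_after32()
instead of a plain compare because the __u32 timestamps can wrap. A small
self-contained sketch of why; u32_after() is a local analogue of the kernel
macro, which is the standard ((s32)(b - a) < 0) construct:

#include <stdint.h>
#include <stdio.h>

/* mirrors the casting trick behind the kernel's time_after32(a, b):
 * true if a is after b, even when the 32-bit counter has wrapped */
static int u32_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t timeout = 0xfffffff8u;	/* expiry just before wraparound */
	uint32_t now = 0x00000010u;	/* clock already wrapped past it */

	/* a naive 'now > timeout' is false here and would miss the expiry */
	printf("naive: %d, wrap-safe: %d\n", now > timeout,
	       u32_after(now, timeout));	/* naive: 0, wrap-safe: 1 */
	return 0;
}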
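
The spawn guard in mdd_trans_stop() lets exactly one of the racing
transaction stops fork the GC kthread: only the caller whose cmpxchg()
observes MDD_CHLG_GC_NEED wins. A C11 sketch of that compare-and-swap gate;
the names gc_state and gc_try_spawn() are illustrative only, and in the
kernel code mc_gc_task is a task_struct pointer overloaded with the
MDD_CHLG_GC_* sentinel values rather than a plain enum:

#include <stdatomic.h>
#include <stdio.h>

enum { GC_NONE, GC_NEED, GC_START };

static _Atomic int gc_state = GC_NEED;

/* returns 1 only for the caller that wins the NEED -> START transition,
 * the userland analogue of
 * cmpxchg(&mc_gc_task, MDD_CHLG_GC_NEED, MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED */
static int gc_try_spawn(void)
{
	int expected = GC_NEED;

	return atomic_compare_exchange_strong(&gc_state, &expected, GC_START);
}

int main(void)
{
	printf("first caller spawns: %d\n", gc_try_spawn());	/* 1 */
	printf("second caller spawns: %d\n", gc_try_spawn());	/* 0 */
	return 0;
}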