4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
32 * lustre/mdd/mdd_trans.c
34 * Lustre Metadata Server (mdd) routines
36 * Author: Wang Di <wangdi@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_MDS
41 #include <linux/kthread.h>
43 #include <obd_class.h>
44 #include <lprocfs_status.h>
45 #include <lustre_mds.h>
46 #include <lustre_barrier.h>
48 #include "mdd_internal.h"
/* Create a transaction handle for a metadata modification on \a mdd.
 *
 * Enters the write barrier first; if the barrier is currently frozen the
 * caller gets ERR_PTR(-EINPROGRESS) and is expected to retry (see the
 * comment below).  The handle itself is created by the underlying (child)
 * OSD device.  When the caller's credentials carry CFS_CAP_SYS_RESOURCE,
 * quota enforcement is disabled for this transaction.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] mdd	MDD device the transaction is created on
 *
 * \retval		transaction handle on success
 * \retval		ERR_PTR(-EINPROGRESS) when blocked by the write barrier
 *
 * NOTE(review): this excerpt is truncated — the declaration of 'th' and the
 * final 'return th;' are not visible here; confirm against the full file.
 */
50 struct thandle *mdd_trans_create(const struct lu_env *env,
51 struct mdd_device *mdd)
/* uc may be NULL when no user credential is attached to this env */
54 struct lu_ucred *uc = lu_ucred_check(env);
56 /* If blocked by the write barrier, then return "-EINPROGRESS"
57 * to the caller. Usually, such error will be forwarded to the
58 * client, and the expected behaviour is to re-try such modify
59 * RPC some time later until the barrier is thawed or expired. */
60 if (unlikely(!barrier_entry(mdd->mdd_bottom)))
61 return ERR_PTR(-EINPROGRESS);
/* delegate handle creation to the bottom (child) device */
63 th = mdd_child_ops(mdd)->dt_trans_create(env, mdd->mdd_child);
/* CFS_CAP_SYS_RESOURCE lets a privileged caller bypass quota checks */
64 if (!IS_ERR(th) && uc)
65 th->th_ignore_quota = !!md_capable(uc, CFS_CAP_SYS_RESOURCE);
/* Start the given transaction: pass it straight through to the child
 * (bottom) device after all declarations have been made on the handle.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] mdd	MDD device the transaction runs on
 *
 * \retval		result of the child device's dt_trans_start()
 */
70 int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd,
73 return mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, th);
/* Scan state for one ChangeLog garbage-collection pass.  Filled in by
 * mdd_changelog_gc_cb() while iterating over all registered ChangeLog
 * users to pick the best candidate for forced deregistration. */
76 struct mdd_changelog_gc {
77 struct mdd_device *mcgc_mdd;	/* device whose ChangeLog is scanned */
80 __u64 mcgc_maxindexes;	/* largest backlog of unprocessed records seen
				 * for an old-format user record (one with no
				 * idle-time stamp); non-zero means an
				 * old-format candidate has been selected */
84 /* return first registered ChangeLog user idle since too long
85 * use ChangeLog's user plain LLOG mtime for this */
/* llog_cat_process() callback: examine one ChangeLog user record and,
 * if this user has been idle longer than the currently selected
 * candidate, remember it in the mdd_changelog_gc scan state.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] llh	plain llog handle the record belongs to
 * \param[in] hdr	header of the llog_changelog_user_rec being visited
 * \param[in] data	opaque pointer to the struct mdd_changelog_gc state
 */
86 static int mdd_changelog_gc_cb(const struct lu_env *env,
87 struct llog_handle *llh,
88 struct llog_rec_hdr *hdr, void *data)
90 struct llog_changelog_user_rec *rec;
91 struct mdd_changelog_gc *mcgc = (struct mdd_changelog_gc *)data;
92 struct mdd_device *mdd = mcgc->mcgc_mdd;
/* only records inside a plain llog are user records; skip otherwise */
95 if ((llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) == 0)
98 rec = container_of(hdr, struct llog_changelog_user_rec,
101 /* find oldest idle user, based on last record update/cancel time (new
102 * behavior), or for old user records, last record index vs current
103 * ChangeLog index. Late users with old record format will be treated
104 * first as we assume they could be idle since longer
/* cur_time != 0 means a new-format record carrying an idle-time stamp */
106 if (rec->cur_time != 0) {
107 __u32 time_now = (__u32)ktime_get_real_seconds();
108 __u32 time_out = rec->cur_time +
109 mdd->mdd_changelog_max_idle_time;
/* 32-bit wraparound-safe thanks to time_after32() below */
110 __u32 idle_time = time_now - rec->cur_time;
112 /* treat oldest idle user first, and if no old format user
113 * has been already selected (old-format candidates, tracked by
114 * mcgc_maxindexes != 0, take precedence)
115 if (time_after32(time_now, time_out) &&
116 idle_time > mcgc->mcgc_maxtime &&
117 mcgc->mcgc_maxindexes == 0) {
118 mcgc->mcgc_maxtime = idle_time;
119 mcgc->mcgc_id = rec->cur_id;
120 mcgc->mcgc_found = true;
123 /* old user record with no idle time stamp, so use empirical
124 * method based on its current index/position
/* backlog = records published since this user last consumed one */
128 idle_indexes = mdd->mdd_cl.mc_index - rec->cur_endrec;
130 /* treat user with the oldest/smallest current index first */
131 if (idle_indexes >= mdd->mdd_changelog_max_idle_indexes &&
132 idle_indexes > mcgc->mcgc_maxindexes) {
133 mcgc->mcgc_maxindexes = idle_indexes;
134 mcgc->mcgc_id = rec->cur_id;
135 mcgc->mcgc_found = true;
142 /* recover space from long-term inactive ChangeLog users */
/* kthread main routine: repeatedly scan the ChangeLog user catalog for
 * the most-idle user (via mdd_changelog_gc_cb) and force-deregister it,
 * until no further candidate is found or the thread is asked to stop.
 * Spawned from mdd_trans_stop() — see the LU-10680 comment there.
 *
 * \param[in] data	the struct mdd_device to garbage-collect, passed
 *			through kthread_run()
 */
143 static int mdd_chlg_garbage_collect(void *data)
145 struct mdd_device *mdd = (struct mdd_device *)data;
146 struct lu_env *env = NULL;
148 struct llog_ctxt *ctxt;
149 struct mdd_changelog_gc mcgc = {
153 .mcgc_maxindexes = 0,
/* advertise ourselves as the running GC task (was MDD_CHLG_GC_START) */
157 mdd->mdd_cl.mc_gc_task = current;
159 CDEBUG(D_HA, "%s: ChangeLog garbage collect thread start with PID %d\n",
160 mdd2obd_dev(mdd)->obd_name, current->pid);
/* env allocation failed — nothing can be done without one */
164 GOTO(out, rc = -ENOMEM);
166 rc = lu_env_init(env, LCT_MD_THREAD);
171 ctxt = llog_get_context(mdd2obd_dev(mdd),
172 LLOG_CHANGELOG_USER_ORIG_CTXT);
/* the users llog must be a catalog; bail out otherwise */
174 (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0)
175 GOTO(out_ctxt, rc = -ENXIO);
/* walk every registered user record, selecting the best GC candidate */
177 rc = llog_cat_process(env, ctxt->loc_handle,
178 mdd_changelog_gc_cb, &mcgc, 0, 0);
179 if (rc != 0 || mcgc.mcgc_found == false)
/* mcgc_maxindexes != 0 means an old-format (no timestamp) user won */
183 if (mcgc.mcgc_maxindexes != 0)
184 CWARN("%s: Force deregister of ChangeLog user cl%d "
185 "idle with more than %llu unprocessed records\n",
186 mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
187 mcgc.mcgc_maxindexes);
189 CWARN("%s: Force deregister of ChangeLog user cl%d "
190 "idle since more than %us\n",
191 mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
/* deregister the chosen user and release its changelog records */
194 mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
196 if (kthread_should_stop())
197 GOTO(out_env, rc = 0);
199 /* try again to search for another candidate */
200 mcgc.mcgc_found = false;
201 mcgc.mcgc_maxtime = 0;
202 mcgc.mcgc_maxindexes = 0;
/* mark GC as no longer running so mdd_trans_stop() may respawn it;
 * mc_lock serializes against the spawner's cmpxchg() on mc_gc_task */
216 spin_lock(&mdd->mdd_cl.mc_lock);
217 mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
218 spin_unlock(&mdd->mdd_cl.mc_lock);
/* Stop a transaction previously created by mdd_trans_create(), then leave
 * the write barrier.  May also spawn the ChangeLog garbage-collection
 * kthread if one was requested while the transaction was open (LU-10680,
 * see comment below).
 *
 * \param[in] env	execution environment for this thread
 * \param[in] mdd	MDD device the transaction ran on
 * \param[in] result	result of the operation, stored in th_result
 * \param[in] handle	the transaction handle to stop
 */
223 int mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
224 int result, struct thandle *handle)
228 handle->th_result = result;
229 rc = mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
/* release the barrier reference taken in mdd_trans_create() */
230 barrier_exit(mdd->mdd_bottom);
232 /* bottom half of changelog garbage-collection mechanism, started
233 * from mdd_changelog_store(). This is required, as running a
234 * kthread can't occur while a journal transaction is being filled
235 * because otherwise a deadlock can happen if memory reclaim is
236 * triggered by kthreadd when forking the new thread, and thus
237 * I/Os could be attempted to the same device from shrinkers
238 * requiring a new journal transaction to be started when current
239 * could never complete (LU-10680).
/* cmpxchg() claims the NEED->START transition exactly once, so only a
 * single caller ever spawns the GC thread for a given request */
241 if (unlikely(mdd->mdd_cl.mc_flags & CLM_ON &&
242 cmpxchg(&mdd->mdd_cl.mc_gc_task, MDD_CHLG_GC_NEED,
243 MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED)) {
244 /* XXX we may want to cmpxchg() only if MDD_CHLG_GC_NEED
245 * to save its cost in the frequent case and have an extra
246 * if/test cost in the rare case where we need to spawn?
248 struct task_struct *gc_task;
249 struct obd_device *obd = mdd2obd_dev(mdd);
251 gc_task = kthread_run(mdd_chlg_garbage_collect, mdd,
/* spawn failure: log it and reset the state so GC can be retried */
253 if (IS_ERR(gc_task)) {
254 CERROR("%s: cannot start ChangeLog garbage collection "
255 "thread: rc = %ld\n", obd->obd_name,
257 mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
259 CDEBUG(D_HA, "%s: a ChangeLog garbage collection "
260 "thread has been started\n", obd->obd_name);
264 /* if operation failed, return \a result, otherwise return status of