4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
32 * lustre/mdd/mdd_trans.c
34 * Lustre Metadata Server (mdd) routines
36 * Author: Wang Di <wangdi@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_MDS
41 #include <linux/kthread.h>
43 #include <obd_class.h>
44 #include <lprocfs_status.h>
45 #include <lustre_mds.h>
46 #include <lustre_barrier.h>
48 #include "mdd_internal.h"
/* Create a transaction handle for a metadata modification on \a mdd.
 *
 * Enters the write barrier first; if the barrier is currently frozen the
 * caller gets ERR_PTR(-EINPROGRESS) and is expected to retry (see the
 * comment below).  The handle itself is created by the underlying (child)
 * OSD device.  When the caller's credentials carry CFS_CAP_SYS_RESOURCE,
 * quota enforcement is disabled for this transaction.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] mdd	MDD device the transaction is created on
 *
 * \retval		transaction handle on success
 * \retval		ERR_PTR(-EINPROGRESS) when blocked by the write barrier
 *
 * NOTE(review): this excerpt is truncated — the declaration of 'th' and the
 * final 'return th;' are not visible here; confirm against the full file.
 */
50 struct thandle *mdd_trans_create(const struct lu_env *env,
51 struct mdd_device *mdd)
/* uc may be NULL when no user credential is attached to this env */
54 struct lu_ucred *uc = lu_ucred_check(env);
56 /* If blocked by the write barrier, then return "-EINPROGRESS"
57 * to the caller. Usually, such error will be forwarded to the
58 * client, and the expected behaviour is to re-try such modify
59 * RPC some time later until the barrier is thawed or expired. */
60 if (unlikely(!barrier_entry(mdd->mdd_bottom)))
61 return ERR_PTR(-EINPROGRESS);
/* delegate handle creation to the bottom (child) device */
63 th = mdd_child_ops(mdd)->dt_trans_create(env, mdd->mdd_child);
/* CFS_CAP_SYS_RESOURCE lets a privileged caller bypass quota checks */
64 if (!IS_ERR(th) && uc)
65 th->th_ignore_quota = !!md_capable(uc, CFS_CAP_SYS_RESOURCE);
/* Start the given transaction: pass it straight through to the child
 * (bottom) device after all declarations have been made on the handle.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] mdd	MDD device the transaction runs on
 *
 * \retval		result of the child device's dt_trans_start()
 */
70 int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd,
73 return mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, th);
/* Scan state for one ChangeLog garbage-collection pass.  Filled in by
 * mdd_changelog_gc_cb() while iterating over all registered ChangeLog
 * users to pick the best candidate for forced deregistration. */
76 struct mdd_changelog_gc {
77 struct mdd_device *mcgc_mdd;	/* device whose ChangeLog is scanned */
80 __u64 mcgc_maxindexes;	/* largest backlog of unprocessed records seen
				 * for an old-format user record (one with no
				 * idle-time stamp); non-zero means an
				 * old-format candidate has been selected */
84 /* return first registered ChangeLog user idle since too long
85 * use ChangeLog's user plain LLOG mtime for this */
/* llog_cat_process() callback: examine one ChangeLog user record and,
 * if this user has been idle longer than the currently selected
 * candidate, remember it in the mdd_changelog_gc scan state.
 *
 * \param[in] env	execution environment for this thread
 * \param[in] llh	plain llog handle the record belongs to
 * \param[in] hdr	header of the llog_changelog_user_rec being visited
 * \param[in] data	opaque pointer to the struct mdd_changelog_gc state
 */
86 static int mdd_changelog_gc_cb(const struct lu_env *env,
87 struct llog_handle *llh,
88 struct llog_rec_hdr *hdr, void *data)
90 struct llog_changelog_user_rec *rec;
91 struct mdd_changelog_gc *mcgc = (struct mdd_changelog_gc *)data;
92 struct mdd_device *mdd = mcgc->mcgc_mdd;
/* only records inside a plain llog are user records; skip otherwise */
95 if ((llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) == 0)
98 rec = container_of(hdr, struct llog_changelog_user_rec,
101 /* find oldest idle user, based on last record update/cancel time (new
102 * behavior), or for old user records, last record index vs current
103 * ChangeLog index. Late users with old record format will be treated
104 * first as we assume they could be idle since longer
/* cur_time != 0 means a new-format record carrying an idle-time stamp */
106 if (rec->cur_time != 0) {
107 __u32 time_now = (__u32)ktime_get_real_seconds();
108 __u32 time_out = rec->cur_time +
109 mdd->mdd_changelog_max_idle_time;
/* 32-bit wraparound-safe thanks to time_after32() below */
110 __u32 idle_time = time_now - rec->cur_time;
112 /* treat oldest idle user first, and if no old format user
113 * has been already selected (old-format candidates, tracked by
114 * mcgc_maxindexes != 0, take precedence)
115 if (time_after32(time_now, time_out) &&
116 idle_time > mcgc->mcgc_maxtime &&
117 mcgc->mcgc_maxindexes == 0) {
118 mcgc->mcgc_maxtime = idle_time;
119 mcgc->mcgc_id = rec->cur_id;
120 mcgc->mcgc_found = true;
123 /* old user record with no idle time stamp, so use empirical
124 * method based on its current index/position
/* backlog = records published since this user last consumed one */
128 idle_indexes = mdd->mdd_cl.mc_index - rec->cur_endrec;
130 /* treat user with the oldest/smallest current index first */
131 if (idle_indexes >= mdd->mdd_changelog_max_idle_indexes &&
132 idle_indexes > mcgc->mcgc_maxindexes) {
133 mcgc->mcgc_maxindexes = idle_indexes;
134 mcgc->mcgc_id = rec->cur_id;
135 mcgc->mcgc_found = true;
142 /* recover space from long-term inactive ChangeLog users */
/* kthread main routine: repeatedly scan the ChangeLog user catalog for
 * the most-idle user (via mdd_changelog_gc_cb) and force-deregister it,
 * until no further candidate is found or the thread is asked to stop.
 * Spawned from mdd_trans_stop() — see the LU-10680 comment there.
 *
 * \param[in] data	the struct mdd_device to garbage-collect, passed
 *			through kthread_run()
 */
143 static int mdd_chlg_garbage_collect(void *data)
145 struct mdd_device *mdd = (struct mdd_device *)data;
146 struct lu_env *env = NULL;
148 struct llog_ctxt *ctxt;
149 struct mdd_changelog_gc mcgc = {
153 .mcgc_maxindexes = 0,
/* advertise ourselves as the running GC task (was MDD_CHLG_GC_START) */
157 mdd->mdd_cl.mc_gc_task = current;
159 CDEBUG(D_HA, "%s: ChangeLog garbage collect thread start with PID %d\n",
160 mdd2obd_dev(mdd)->obd_name, current->pid);
/* env allocation failed — nothing can be done without one */
164 GOTO(out, rc = -ENOMEM);
166 rc = lu_env_init(env, LCT_MD_THREAD);
171 ctxt = llog_get_context(mdd2obd_dev(mdd),
172 LLOG_CHANGELOG_USER_ORIG_CTXT);
/* the users llog must be a catalog; bail out otherwise */
174 (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0)
175 GOTO(out_ctxt, rc = -ENXIO);
/* walk every registered user record, selecting the best GC candidate */
177 rc = llog_cat_process(env, ctxt->loc_handle,
178 mdd_changelog_gc_cb, &mcgc, 0, 0);
179 if (rc != 0 || mcgc.mcgc_found == false)
/* mcgc_maxindexes != 0 means an old-format (no timestamp) user won */
183 if (mcgc.mcgc_maxindexes != 0)
184 CWARN("%s: Force deregister of ChangeLog user cl%d "
185 "idle with more than %llu unprocessed records\n",
186 mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
187 mcgc.mcgc_maxindexes);
189 CWARN("%s: Force deregister of ChangeLog user cl%d "
190 "idle since more than %us\n",
191 mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
/* deregister the chosen user and release its changelog records */
194 mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
196 if (kthread_should_stop())
197 GOTO(out_env, rc = 0);
199 /* try again to search for another candidate */
200 mcgc.mcgc_found = false;
201 mcgc.mcgc_maxtime = 0;
202 mcgc.mcgc_maxindexes = 0;
/* mark GC as no longer running so mdd_trans_stop() may respawn it;
 * mc_lock serializes against the spawner's cmpxchg() on mc_gc_task */
216 spin_lock(&mdd->mdd_cl.mc_lock);
217 mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
218 spin_unlock(&mdd->mdd_cl.mc_lock);
/* Stop a transaction previously created by mdd_trans_create(), then leave
 * the write barrier.  May also spawn the ChangeLog garbage-collection
 * kthread if one was requested while the transaction was open (LU-10680,
 * see comment below).
 *
 * \param[in] env	execution environment for this thread
 * \param[in] mdd	MDD device the transaction ran on
 * \param[in] result	result of the operation, stored in th_result
 * \param[in] handle	the transaction handle to stop
 */
223 int mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
224 int result, struct thandle *handle)
228 handle->th_result = result;
229 rc = mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
/* release the barrier reference taken in mdd_trans_create() */
230 barrier_exit(mdd->mdd_bottom);
232 /* bottom half of changelog garbage-collection mechanism, started
233 * from mdd_changelog_store(). This is required, as running a
234 * kthread can't occur while a journal transaction is being filled
235 * because otherwise a deadlock can happen if memory reclaim is
236 * triggered by kthreadd when forking the new thread, and thus
237 * I/Os could be attempted to the same device from shrinkers
238 * requiring a new journal transaction to be started when current
239 * could never complete (LU-10680).
/* cmpxchg() claims the NEED->START transition exactly once, so only a
 * single caller ever spawns the GC thread for a given request */
241 if (unlikely(mdd->mdd_cl.mc_flags & CLM_ON &&
242 cmpxchg(&mdd->mdd_cl.mc_gc_task, MDD_CHLG_GC_NEED,
243 MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED)) {
244 /* XXX we may want to cmpxchg() only if MDD_CHLG_GC_NEED
245 * to save its cost in the frequent case and have an extra
246 * if/test cost in the rare case where we need to spawn?
248 struct task_struct *gc_task;
249 struct obd_device *obd = mdd2obd_dev(mdd);
251 gc_task = kthread_run(mdd_chlg_garbage_collect, mdd,
/* spawn failure: log it and reset the state so GC can be retried */
253 if (IS_ERR(gc_task)) {
254 CERROR("%s: cannot start ChangeLog garbage collection "
255 "thread: rc = %ld\n", obd->obd_name,
257 mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
259 CDEBUG(D_HA, "%s: a ChangeLog garbage collection "
260 "thread has been started\n", obd->obd_name);
264 /* if operation failed, return \a result, otherwise return status of