Whamcloud - gitweb
LU-12616 obclass: fix MDS start/stop race
[fs/lustre-release.git] / lustre / mdd / mdd_trans.c
index 46a8b9b..075110d 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  * Author: Wang Di <wangdi@clusterfs.com>
  */
 
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
 #define DEBUG_SUBSYSTEM S_MDS
 
-#include <linux/module.h>
-#ifdef HAVE_EXT4_LDISKFS
-#include <ldiskfs/ldiskfs_jbd2.h>
-#else
-#include <linux/jbd.h>
-#endif
-#include <obd.h>
+#include <linux/kthread.h>
+
 #include <obd_class.h>
-#include <lustre_ver.h>
-#include <obd_support.h>
 #include <lprocfs_status.h>
-
-#ifdef HAVE_EXT4_LDISKFS
-#include <ldiskfs/ldiskfs.h>
-#else
-#include <linux/ldiskfs_fs.h>
-#endif
 #include <lustre_mds.h>
-#include <lustre/lustre_idl.h>
+#include <lustre_barrier.h>
 
 #include "mdd_internal.h"
 
-static int dto_txn_credits[DTO_NR];
-
-int mdd_txn_start_cb(const struct lu_env *env, struct txn_param *param,
-                     void *cookie)
+struct thandle *mdd_trans_create(const struct lu_env *env,
+                                 struct mdd_device *mdd)
 {
-        struct mdd_device *mdd = cookie;
-        struct obd_device *obd = mdd2obd_dev(mdd);
-        /* Each transaction updates lov objids, the credits should be added for
-         * this */
-        int blk, shift = mdd->mdd_dt_conf.ddp_block_shift;
-        blk = ((obd->u.mds.mds_lov_desc.ld_tgt_count * sizeof(obd_id) +
-               (1 << shift) - 1) >> shift) + 1;
-
-        /* add lov objids credits */
-        param->tp_credits += blk * dto_txn_credits[DTO_WRITE_BLOCK] +
-                             dto_txn_credits[DTO_WRITE_BASE];
-
-        return 0;
-}
+       struct thandle *th;
+       struct lu_ucred *uc = lu_ucred_check(env);
 
-int mdd_txn_stop_cb(const struct lu_env *env, struct thandle *txn,
-                    void *cookie)
-{
-        struct mdd_device *mdd = cookie;
-        struct obd_device *obd = mdd2obd_dev(mdd);
+       /* If blocked by the write barrier, then return "-EINPROGRESS"
+        * to the caller. Usually, such an error will be forwarded to the
+        * client, and the expected behaviour is to retry the modifying
+        * RPC some time later, until the barrier is thawed or expired. */
+       if (unlikely(!barrier_entry(mdd->mdd_bottom)))
+               return ERR_PTR(-EINPROGRESS);
+
+       th = mdd_child_ops(mdd)->dt_trans_create(env, mdd->mdd_child);
+       if (!IS_ERR(th) && uc)
+               th->th_ignore_quota = !!md_capable(uc, CFS_CAP_SYS_RESOURCE);
 
-        LASSERT(obd);
-        return mds_lov_write_objids(obd);
+       return th;
 }
 
-int mdd_txn_commit_cb(const struct lu_env *env, struct thandle *txn,
-                      void *cookie)
+int mdd_trans_start(const struct lu_env *env, struct mdd_device *mdd,
+                    struct thandle *th)
 {
-        return 0;
+        return mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, th);
 }
 
-void mdd_txn_param_build(const struct lu_env *env, struct mdd_device *mdd,
-                         enum mdd_txn_op op)
+struct mdd_changelog_gc {
+       struct mdd_device *mcgc_mdd;
+       __u32 mcgc_id;
+       __u32 mcgc_maxtime;
+       __u64 mcgc_maxindexes;
+       bool mcgc_found;
+};
+
+/* Return the first registered ChangeLog user that has been idle for too
+ * long; use the mtime of the ChangeLog user's plain LLOG for this. */
+static int mdd_changelog_gc_cb(const struct lu_env *env,
+                              struct llog_handle *llh,
+                              struct llog_rec_hdr *hdr, void *data)
 {
-        LASSERT(0 <= op && op < MDD_TXN_LAST_OP);
+       struct llog_changelog_user_rec *rec;
+       struct mdd_changelog_gc *mcgc = (struct mdd_changelog_gc *)data;
+       struct mdd_device *mdd = mcgc->mcgc_mdd;
+       ENTRY;
+
+       if ((llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) == 0)
+               RETURN(-ENXIO);
+
+       rec = container_of(hdr, struct llog_changelog_user_rec,
+                          cur_hdr);
+
+       /* find oldest idle user, based on last record update/cancel time (new
+        * behavior), or for old user records, last record index vs current
+        * ChangeLog index. Late users with old record format will be treated
+        * first, as we assume they may have been idle for longer
+        */
+       if (rec->cur_time != 0) {
+               __u32 time_now = (__u32)ktime_get_real_seconds();
+               __u32 time_out = rec->cur_time +
+                                mdd->mdd_changelog_max_idle_time;
+               __u32 idle_time = time_now - rec->cur_time;
+
+               /* treat the oldest idle user first, but only if no
+                * old-format user has already been selected
+                */
+               if (time_after32(time_now, time_out) &&
+                   idle_time > mcgc->mcgc_maxtime &&
+                   mcgc->mcgc_maxindexes == 0) {
+                       mcgc->mcgc_maxtime = idle_time;
+                       mcgc->mcgc_id = rec->cur_id;
+                       mcgc->mcgc_found = true;
+               }
+       } else {
+               /* old user record with no idle time stamp, so use empirical
+                * method based on its current index/position
+                */
+               __u64 idle_indexes;
 
-        txn_param_init(&mdd_env_info(env)->mti_param,
-                       mdd->mdd_tod[op].mod_credits);
+               idle_indexes = mdd->mdd_cl.mc_index - rec->cur_endrec;
+
+               /* treat user with the oldest/smallest current index first */
+               if (idle_indexes >= mdd->mdd_changelog_max_idle_indexes &&
+                   idle_indexes > mcgc->mcgc_maxindexes) {
+                       mcgc->mcgc_maxindexes = idle_indexes;
+                       mcgc->mcgc_id = rec->cur_id;
+                       mcgc->mcgc_found = true;
+               }
+
+       }
+       RETURN(0);
 }
 
-int mdd_log_txn_param_build(const struct lu_env *env, struct md_object *obj,
-                            struct md_attr *ma, enum mdd_txn_op op)
+/* recover space from long-term inactive ChangeLog users */
+static int mdd_chlg_garbage_collect(void *data)
 {
-        struct mdd_device *mdd = mdo2mdd(&md2mdd_obj(obj)->mod_obj);
-        int rc, log_credits, stripe;
-        ENTRY;
+       struct mdd_device *mdd = (struct mdd_device *)data;
+       struct lu_env             *env = NULL;
+       int                        rc;
+       struct llog_ctxt *ctxt;
+       struct mdd_changelog_gc mcgc = {
+               .mcgc_mdd = mdd,
+               .mcgc_found = false,
+               .mcgc_maxtime = 0,
+               .mcgc_maxindexes = 0,
+       };
+       ENTRY;
 
-        mdd_txn_param_build(env, mdd, op);
+       mdd->mdd_cl.mc_gc_task = current;
 
-        if (S_ISDIR(lu_object_attr(&obj->mo_lu)))
-                RETURN(0);
+       CDEBUG(D_HA, "%s: ChangeLog garbage collect thread start with PID %d\n",
+              mdd2obd_dev(mdd)->obd_name, current->pid);
 
-        LASSERT(op == MDD_TXN_UNLINK_OP || op == MDD_TXN_RENAME_OP);
-        rc = mdd_lmm_get_locked(env, md2mdd_obj(obj), ma);
-        if (rc || !(ma->ma_valid & MA_LOV))
-                RETURN(rc);
+       OBD_ALLOC_PTR(env);
+       if (env == NULL)
+               GOTO(out, rc = -ENOMEM);
 
-        LASSERTF(le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V1 ||
-                 le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V3,
-                 "%08x", le32_to_cpu(ma->ma_lmm->lmm_magic));
+       rc = lu_env_init(env, LCT_MD_THREAD);
+       if (rc)
+               GOTO(out, rc);
 
-        if ((int)le32_to_cpu(ma->ma_lmm->lmm_stripe_count) < 0)
-                stripe = mdd2obd_dev(mdd)->u.mds.mds_lov_desc.ld_tgt_count;
-        else
-                stripe = le32_to_cpu(ma->ma_lmm->lmm_stripe_count);
+       for (;;) {
+               ctxt = llog_get_context(mdd2obd_dev(mdd),
+                                       LLOG_CHANGELOG_USER_ORIG_CTXT);
+               if (ctxt == NULL ||
+                   (ctxt->loc_handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) == 0)
+                       GOTO(out_ctxt, rc = -ENXIO);
 
-        log_credits = stripe * dto_txn_credits[DTO_LOG_REC];
-        txn_param_credit_add(&mdd_env_info(env)->mti_param, log_credits);
-        RETURN(rc);
-}
+               rc = llog_cat_process(env, ctxt->loc_handle,
+                                     mdd_changelog_gc_cb, &mcgc, 0, 0);
+               if (rc != 0 || mcgc.mcgc_found == false)
+                       break;
+               llog_ctxt_put(ctxt);
 
-int mdd_setattr_txn_param_build(const struct lu_env *env, struct md_object *obj,
-                                struct md_attr *ma, enum mdd_txn_op op)
-{
-        struct mdd_device *mdd = mdo2mdd(&md2mdd_obj(obj)->mod_obj);
-        ENTRY;
+               if (mcgc.mcgc_maxindexes != 0)
+                       CWARN("%s: Force deregister of ChangeLog user cl%d "
+                             "idle with more than %llu unprocessed records\n",
+                             mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
+                             mcgc.mcgc_maxindexes);
+               else
+                       CWARN("%s: Force deregister of ChangeLog user cl%d "
+                             "idle since more than %us\n",
+                             mdd2obd_dev(mdd)->obd_name, mcgc.mcgc_id,
+                             mcgc.mcgc_maxtime);
 
-        mdd_txn_param_build(env, mdd, op);
-        if (ma->ma_attr.la_valid & (LA_UID | LA_GID))
-                txn_param_credit_add(&mdd_env_info(env)->mti_param,
-                                     dto_txn_credits[DTO_ATTR_SET_CHOWN]);
+               mdd_changelog_user_purge(env, mdd, mcgc.mcgc_id);
 
-        /* permission changes may require sync operation */
-        if (ma->ma_attr.la_valid & (LA_MODE|LA_UID|LA_GID) &&
-            mdd->mdd_sync_permission == 1)
-                txn_param_sync(&mdd_env_info(env)->mti_param);
+               if (kthread_should_stop())
+                       GOTO(out_env, rc = 0);
 
-        RETURN(0);
-}
+               /* try again to search for another candidate */
+               mcgc.mcgc_found = false;
+               mcgc.mcgc_maxtime = 0;
+               mcgc.mcgc_maxindexes = 0;
+       }
 
-static void mdd_txn_init_dto_credits(const struct lu_env *env,
-                                     struct mdd_device *mdd, int *dto_credits)
-{
-        int op, credits;
-        for (op = 0; op < DTO_NR; op++) {
-                credits = mdd_child_ops(mdd)->dt_credit_get(env, mdd->mdd_child,
-                                                            op);
-                LASSERT(credits >= 0);
-                dto_txn_credits[op] = credits;
-        }
-}
+out_ctxt:
+       if (ctxt != NULL)
+               llog_ctxt_put(ctxt);
 
-int mdd_txn_init_credits(const struct lu_env *env, struct mdd_device *mdd)
-{
-        int op;
-
-        /* Init credits for each ops. */
-        mdd_txn_init_dto_credits(env, mdd, dto_txn_credits);
-
-        /* Calculate the mdd credits. */
-        for (op = MDD_TXN_OBJECT_DESTROY_OP; op < MDD_TXN_LAST_OP; op++) {
-                int *c = &mdd->mdd_tod[op].mod_credits;
-                int *dt = dto_txn_credits;
-                mdd->mdd_tod[op].mod_op = op;
-                switch(op) {
-                        case MDD_TXN_OBJECT_DESTROY_OP:
-                                /* Unused now */
-                                *c = dt[DTO_OBJECT_DELETE];
-                                break;
-                        case MDD_TXN_OBJECT_CREATE_OP:
-                                /* OI INSERT + CREATE OBJECT */
-                                *c = dt[DTO_INDEX_INSERT] +
-                                     dt[DTO_OBJECT_CREATE];
-                                break;
-                        case MDD_TXN_ATTR_SET_OP:
-                                /* ATTR set + XATTR(lsm, lmv) set */
-                                *c = dt[DTO_ATTR_SET_BASE] +
-                                     dt[DTO_XATTR_SET];
-                                break;
-                        case MDD_TXN_XATTR_SET_OP:
-                                *c = dt[DTO_XATTR_SET];
-                                break;
-                        case MDD_TXN_INDEX_INSERT_OP:
-                                *c = dt[DTO_INDEX_INSERT];
-                                break;
-                        case MDD_TXN_INDEX_DELETE_OP:
-                                *c = dt[DTO_INDEX_DELETE];
-                                break;
-                        case MDD_TXN_LINK_OP:
-                                *c = dt[DTO_INDEX_INSERT];
-                                break;
-                        case MDD_TXN_UNLINK_OP:
-                                /* delete index + Unlink log +
-                                 * mdd orphan handling */
-                                *c = dt[DTO_INDEX_DELETE] +
-                                        dt[DTO_INDEX_DELETE] +
-                                        dt[DTO_INDEX_INSERT] * 2 +
-                                        dt[DTO_XATTR_SET] * 3;
-                                break;
-                        case MDD_TXN_RENAME_OP:
-                                /* 2 delete index + 1 insert + Unlink log */
-                                *c = 2 * dt[DTO_INDEX_DELETE] +
-                                        dt[DTO_INDEX_INSERT] +
-                                        dt[DTO_INDEX_DELETE] +
-                                        dt[DTO_INDEX_INSERT] * 2 +
-                                        dt[DTO_XATTR_SET] * 3;
-                                break;
-                        case MDD_TXN_RENAME_TGT_OP:
-                                /* index insert + index delete */
-                                *c = dt[DTO_INDEX_DELETE] +
-                                        dt[DTO_INDEX_INSERT] +
-                                        dt[DTO_INDEX_DELETE] +
-                                        dt[DTO_INDEX_INSERT] * 2 +
-                                        dt[DTO_XATTR_SET] * 3;
-                                break;
-                        case MDD_TXN_CREATE_DATA_OP:
-                                /* same as set xattr(lsm) */
-                                *c = dt[DTO_XATTR_SET];
-                                break;
-                        case MDD_TXN_MKDIR_OP:
-                                /* INDEX INSERT + OI INSERT +
-                                 * CREATE_OBJECT_CREDITS
-                                 * SET_MD CREDITS is already counted in
-                                 * CREATE_OBJECT CREDITS
-                                 */
-                                 *c = 2 * dt[DTO_INDEX_INSERT] +
-                                          dt[DTO_OBJECT_CREATE];
-                                break;
-                        default:
-                                CERROR("Invalid op %d init its credit\n", op);
-                                LBUG();
-                }
-        }
-        RETURN(0);
-}
+out_env:
+       lu_env_fini(env);
+       GOTO(out, rc);
+out:
+       if (env)
+               OBD_FREE_PTR(env);
 
-struct thandle* mdd_trans_start(const struct lu_env *env,
-                                struct mdd_device *mdd)
-{
-        struct txn_param *p = &mdd_env_info(env)->mti_param;
-        struct thandle *th;
+       spin_lock(&mdd->mdd_cl.mc_lock);
+       mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+       spin_unlock(&mdd->mdd_cl.mc_lock);
 
-        th = mdd_child_ops(mdd)->dt_trans_start(env, mdd->mdd_child, p);
-        return th;
+       return rc;
 }
 
-void mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
-                    int result, struct thandle *handle)
+int mdd_trans_stop(const struct lu_env *env, struct mdd_device *mdd,
+                  int result, struct thandle *handle)
 {
-        handle->th_result = result;
-        mdd_child_ops(mdd)->dt_trans_stop(env, handle);
+       int rc;
+
+       handle->th_result = result;
+       rc = mdd_child_ops(mdd)->dt_trans_stop(env, mdd->mdd_child, handle);
+       barrier_exit(mdd->mdd_bottom);
+
+       /* bottom half of changelog garbage-collection mechanism, started
+        * from mdd_changelog_store(). This is required, as running a
+        * kthread can't occur while a journal transaction is being
+        * filled, because otherwise a deadlock can happen if memory reclaim
+        * is triggered by kthreadd when forking the new thread, and thus
+        * I/Os could be attempted to the same device from shrinkers
+        * requiring a new journal transaction to be started when the
+        * current one could never complete (LU-10680).
+        */
+       if (unlikely(mdd->mdd_cl.mc_flags & CLM_ON &&
+                    cmpxchg(&mdd->mdd_cl.mc_gc_task, MDD_CHLG_GC_NEED,
+                            MDD_CHLG_GC_START) == MDD_CHLG_GC_NEED)) {
+               /* XXX we may want to cmpxchg() only if MDD_CHLG_GC_NEED
+                * to save its cost in the frequent case and have an extra
+                * if/test cost in the rare case where we need to spawn?
+                */
+               struct task_struct *gc_task;
+               struct obd_device *obd = mdd2obd_dev(mdd);
+
+               gc_task = kthread_run(mdd_chlg_garbage_collect, mdd,
+                                     "chlg_gc_thread");
+               if (IS_ERR(gc_task)) {
+                       CERROR("%s: cannot start ChangeLog garbage collection "
+                              "thread: rc = %ld\n", obd->obd_name,
+                              PTR_ERR(gc_task));
+                       mdd->mdd_cl.mc_gc_task = MDD_CHLG_GC_NONE;
+               } else {
+                       CDEBUG(D_HA, "%s: a ChangeLog garbage collection "
+                              "thread has been started\n", obd->obd_name);
+               }
+       }
+
+       /* if operation failed, return \a result, otherwise return status of
+        * dt_trans_stop */
+       return result ?: rc;
 }