Whamcloud - gitweb
LU-15283 quota: deadlock between reint & lquota_wb
[fs/lustre-release.git] / lustre / quota / qsd_reint.c
index 08e8f93..666df96 100644 (file)
@@ -21,7 +21,7 @@
  * GPL HEADER END
  */
 /*
- * Copyright (c) 2012, 2013, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
  * Use is subject to license terms.
  *
  * Author: Johann Lombardi <johann.lombardi@intel.com>
@@ -30,6 +30,8 @@
 
 #define DEBUG_SUBSYSTEM S_LQUOTA
 
+#include <linux/kthread.h>
+#include <lustre_swab.h>
 #include "qsd_internal.h"
 
 /*
@@ -56,7 +58,7 @@ static void qsd_reint_completion(const struct lu_env *env,
        }
 
        CDEBUG(D_QUOTA, "%s: global quota lock successfully acquired, glb "
-              "fid:"DFID", glb ver:"LPU64", slv fid:"DFID", slv ver:"LPU64"\n",
+              "fid:"DFID", glb ver:%llu, slv fid:"DFID", slv ver:%llu\n",
               qsd->qsd_svname, PFID(&req_qbody->qb_fid),
               lvb->lvb_glb_ver, PFID(&rep_qbody->qb_slv_fid),
               rep_qbody->qb_slv_ver);
@@ -87,6 +89,14 @@ static int qsd_reint_qid(const struct lu_env *env, struct qsd_qtype_info *qqi,
 
        rc = qsd_update_index(env, qqi, qid, global, 0, rec);
 out:
+
+       if (global && qid->qid_uid == 0) {
+               struct lquota_glb_rec *glb_rec = (struct lquota_glb_rec *)rec;
+               qsd_update_default_quota(qqi, glb_rec->qbr_hardlimit,
+                                        glb_rec->qbr_softlimit,
+                                        glb_rec->qbr_time);
+       }
+
        lqe_putref(lqe);
        RETURN(rc);
 }
@@ -194,14 +204,14 @@ static int qsd_reint_index(const struct lu_env *env, struct qsd_qtype_info *qqi,
 
        /* let's do a 1MB bulk */
        npages = min_t(unsigned int, OFD_MAX_BRW_SIZE, 1 << 20);
-       npages /= PAGE_CACHE_SIZE;
+       npages /= PAGE_SIZE;
 
        /* allocate pages for bulk index read */
-       OBD_ALLOC(pages, npages * sizeof(*pages));
+       OBD_ALLOC_PTR_ARRAY(pages, npages);
        if (pages == NULL)
                GOTO(out, rc = -ENOMEM);
        for (i = 0; i < npages; i++) {
-               pages[i] = alloc_page(GFP_IOFS);
+               pages[i] = alloc_page(GFP_NOFS);
                if (pages[i] == NULL)
                        GOTO(out, rc = -ENOMEM);
        }
@@ -254,7 +264,7 @@ repeat:
                ver = ii->ii_version;
 
        pg_cnt = (ii->ii_count + (LU_PAGE_COUNT) - 1);
-       pg_cnt >>= PAGE_CACHE_SHIFT - LU_PAGE_SHIFT;
+       pg_cnt >>= PAGE_SHIFT - LU_PAGE_SHIFT;
 
        if (pg_cnt > npages) {
                CERROR("%s: master returned more pages than expected, %u > %u"
@@ -275,15 +285,15 @@ out:
                for (i = 0; i < npages; i++)
                        if (pages[i] != NULL)
                                __free_page(pages[i]);
-               OBD_FREE(pages, npages * sizeof(*pages));
+               OBD_FREE_PTR_ARRAY(pages, npages);
        }
 
        /* Update index version */
        if (rc == 0) {
                rc = qsd_write_version(env, qqi, ver, global);
                if (rc)
-                       CERROR("%s: write version "LPU64" to "DFID" failed. "
-                              "%d\n", qsd->qsd_svname, ver, PFID(fid), rc);
+                       CERROR("%s: write version %llu to "DFID" failed : rc = %d\n",
+                              qsd->qsd_svname, ver, PFID(fid), rc);
        }
 
        RETURN(rc);
@@ -305,7 +315,7 @@ static int qsd_reconciliation(const struct lu_env *env,
        LASSERT(qqi->qqi_glb_obj != NULL);
        iops = &qqi->qqi_glb_obj->do_index_ops->dio_it;
 
-       it = iops->init(env, qqi->qqi_glb_obj, 0, BYPASS_CAPA);
+       it = iops->init(env, qqi->qqi_glb_obj, 0);
        if (IS_ERR(it)) {
                CWARN("%s: Initialize it for "DFID" failed. %ld\n",
                      qsd->qsd_svname, PFID(&qqi->qqi_fid), PTR_ERR(it));
@@ -399,17 +409,26 @@ static int qsd_started(struct qsd_instance *qsd)
        return started;
 }
 
+struct qsd_reint_args {
+       struct qsd_qtype_info   *qra_qqi;
+       struct lu_env            qra_env;
+       struct completion       *qra_started;
+};
+
+#ifndef TASK_IDLE
+#define TASK_IDLE TASK_INTERRUPTIBLE
+#endif
+
 /*
  * Routine executed by the reintegration thread.
  */
-static int qsd_reint_main(void *args)
+static int qsd_reint_main(void *_args)
 {
-       struct lu_env           *env;
+       struct qsd_reint_args   *args = _args;
+       struct lu_env           *env = &args->qra_env;
        struct qsd_thread_info  *qti;
-       struct qsd_qtype_info   *qqi = (struct qsd_qtype_info *)args;
+       struct qsd_qtype_info   *qqi = args->qra_qqi;
        struct qsd_instance     *qsd = qqi->qqi_qsd;
-       struct ptlrpc_thread    *thread = &qqi->qqi_reint_thread;
-       struct l_wait_info       lwi = { 0 };
        int                      rc;
        ENTRY;
 
@@ -417,27 +436,19 @@ static int qsd_reint_main(void *args)
               qsd->qsd_svname, PFID(&qqi->qqi_fid));
 
        qqi_getref(qqi);
-       lu_ref_add(&qqi->qqi_reference, "reint_thread", thread);
-
-       thread_set_flags(thread, SVC_RUNNING);
-       wake_up(&thread->t_ctl_waitq);
-
-       OBD_ALLOC_PTR(env);
-       if (env == NULL)
-               GOTO(out, rc = -ENOMEM);
-
-       /* initialize environment */
-       rc = lu_env_init(env, LCT_DT_THREAD);
-       if (rc)
-               GOTO(out_env, rc);
+       lu_ref_add(&qqi->qqi_reference, "reint_thread", current);
        qti = qsd_info(env);
 
+       complete(args->qra_started);
+
        /* wait for the connection to the master to be established */
-       l_wait_event(thread->t_ctl_waitq,
-                    qsd_connected(qsd) || !thread_is_running(thread), &lwi);
+       while (({set_current_state(TASK_IDLE);
+                !qsd_connected(qsd) && !kthread_should_stop(); }))
+               schedule();
+       __set_current_state(TASK_RUNNING);
 
        /* Step 1: enqueue global index lock */
-       if (!thread_is_running(thread))
+       if (kthread_should_stop())
                GOTO(out_env_init, rc = 0);
 
        LASSERT(qsd->qsd_exp != NULL);
@@ -467,14 +478,14 @@ static int qsd_reint_main(void *args)
                if (rc)
                        GOTO(out_env_init, rc);
 
-               CDEBUG(D_QUOTA, "%s: glb_ver:"LPU64"/"LPU64",slv_ver:"LPU64"/"
-                      LPU64"\n", qsd->qsd_svname,
+               CDEBUG(D_QUOTA, "%s: glb_ver:%llu/%llu,slv_ver:%llu/"
+                      "%llu\n", qsd->qsd_svname,
                       qti->qti_lvb.lvb_glb_ver, qqi->qqi_glb_ver,
                       qti->qti_slv_ver, qqi->qqi_slv_ver);
        }
 
        /* Step 2: reintegrate global index */
-       if (!thread_is_running(thread))
+       if (kthread_should_stop())
                GOTO(out_lock, rc = 0);
 
        OBD_FAIL_TIMEOUT(OBD_FAIL_QUOTA_DELAY_REINT, 10);
@@ -491,13 +502,13 @@ static int qsd_reint_main(void *args)
        }
 
        /* Step 3: reintegrate slave index */
-       if (!thread_is_running(thread))
+       if (kthread_should_stop())
                GOTO(out_lock, rc = 0);
 
        if (qqi->qqi_slv_ver != qti->qti_slv_ver) {
                rc = qsd_reint_index(env, qqi, false);
                if (rc) {
-                       CWARN("%s: Reint slave for "DFID" failed. %d\n",
+                       CWARN("%s: reintegration for "DFID" failed with %d\n",
                              qsd->qsd_svname, PFID(&qqi->qqi_slv_fid), rc);
                        GOTO(out_lock, rc);
                }
@@ -506,59 +517,55 @@ static int qsd_reint_main(void *args)
        }
 
        /* wait for the qsd instance to be started (target recovery done) */
-       l_wait_event(thread->t_ctl_waitq,
-                    qsd_started(qsd) || !thread_is_running(thread), &lwi);
+       while (({set_current_state(TASK_IDLE);
+                !qsd_started(qsd) && !kthread_should_stop(); }))
+               schedule();
+       __set_current_state(TASK_RUNNING);
 
-       if (!thread_is_running(thread))
+       if (kthread_should_stop())
                GOTO(out_lock, rc = 0);
 
        /* Step 4: start reconciliation for each enforced ID */
        rc = qsd_reconciliation(env, qqi);
        if (rc)
-               CWARN("%s: reconciliation failed. "DFID", %d\n",
-                     qsd->qsd_svname, PFID(&qti->qti_fid), rc);
+               CWARN("%s: reconciliation for "DFID" failed with %d\n",
+                     qsd->qsd_svname, PFID(&qqi->qqi_slv_fid), rc);
 
        EXIT;
 out_lock:
        ldlm_lock_decref(&qqi->qqi_lockh, qsd_glb_einfo.ei_mode);
 out_env_init:
        lu_env_fini(env);
-out_env:
-       OBD_FREE_PTR(env);
-out:
+       OBD_FREE_PTR(args);
        write_lock(&qsd->qsd_lock);
        qqi->qqi_reint = 0;
        write_unlock(&qsd->qsd_lock);
 
+       if (xchg(&qqi->qqi_reint_task, NULL) == NULL)
+               wait_var_event(qqi, kthread_should_stop());
+
+       lu_ref_del(&qqi->qqi_reference, "reint_thread", current);
        qqi_putref(qqi);
-       lu_ref_del(&qqi->qqi_reference, "reint_thread", thread);
 
-       thread_set_flags(thread, SVC_STOPPED);
-       wake_up(&thread->t_ctl_waitq);
        return rc;
 }
 
 void qsd_stop_reint_thread(struct qsd_qtype_info *qqi)
 {
-       struct ptlrpc_thread    *thread = &qqi->qqi_reint_thread;
-       struct l_wait_info       lwi = { 0 };
-
-       if (!thread_is_stopped(thread)) {
-               thread_set_flags(thread, SVC_STOPPING);
-               wake_up(&thread->t_ctl_waitq);
+       struct task_struct *task;
 
-               l_wait_event(thread->t_ctl_waitq,
-                            thread_is_stopped(thread), &lwi);
-       }
+       task = xchg(&qqi->qqi_reint_task, NULL);
+       if (task)
+               kthread_stop(task);
 }
 
-static int qsd_entry_iter_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
-                            cfs_hlist_node_t *hnode, void *data)
+static int qsd_entry_iter_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+                            struct hlist_node *hnode, void *data)
 {
        struct lquota_entry     *lqe;
        int                     *pending = (int *)data;
 
-       lqe = cfs_hlist_entry(hnode, struct lquota_entry, lqe_hash);
+       lqe = hlist_entry(hnode, struct lquota_entry, lqe_hash);
        LASSERT(atomic_read(&lqe->lqe_ref) > 0);
 
        lqe_read_lock(lqe);
@@ -568,115 +575,122 @@ static int qsd_entry_iter_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
        return 0;
 }
 
-static bool qsd_pending_updates(struct qsd_qtype_info *qqi)
+static bool qqi_reint_delayed(struct qsd_qtype_info *qqi)
 {
        struct qsd_instance     *qsd = qqi->qqi_qsd;
        struct qsd_upd_rec      *upd;
        struct lquota_entry     *lqe, *n;
        int                      dqacq = 0;
-       bool                     updates = false;
+       bool                     delay = false;
        ENTRY;
 
        /* any pending quota adjust? */
        spin_lock(&qsd->qsd_adjust_lock);
-       cfs_list_for_each_entry_safe(lqe, n, &qsd->qsd_adjust_list, lqe_link) {
+       list_for_each_entry_safe(lqe, n, &qsd->qsd_adjust_list, lqe_link) {
                if (lqe2qqi(lqe) == qqi) {
-                       cfs_list_del_init(&lqe->lqe_link);
+                       list_del_init(&lqe->lqe_link);
                        lqe_putref(lqe);
                }
        }
        spin_unlock(&qsd->qsd_adjust_lock);
 
+       /* any pending quota request? */
+       cfs_hash_for_each_safe(qqi->qqi_site->lqs_hash, qsd_entry_iter_cb,
+                              &dqacq);
+       if (dqacq) {
+               CDEBUG(D_QUOTA, "%s: pending dqacq for type:%d.\n",
+                      qsd->qsd_svname, qqi->qqi_qtype);
+               GOTO(out, delay = true);
+       }
+
        /* any pending updates? */
-       read_lock(&qsd->qsd_lock);
-       cfs_list_for_each_entry(upd, &qsd->qsd_upd_list, qur_link) {
+       write_lock(&qsd->qsd_lock);
+
+       /* check if the reintegration has already started or finished */
+       if ((qqi->qqi_glb_uptodate && qqi->qqi_slv_uptodate) ||
+            qqi->qqi_reint || qsd->qsd_stopping || qsd->qsd_updating)
+               GOTO(out_lock, delay = true);
+
+       /* there could be some unfinished global or index entry updates
+        * (very unlikely); to keep them from interfering with the reint
+        * procedure, we just return and try to restart reint later. */
+       list_for_each_entry(upd, &qsd->qsd_upd_list, qur_link) {
                if (upd->qur_qqi == qqi) {
-                       read_unlock(&qsd->qsd_lock);
                        CDEBUG(D_QUOTA, "%s: pending %s updates for type:%d.\n",
                               qsd->qsd_svname,
                               upd->qur_global ? "global" : "slave",
                               qqi->qqi_qtype);
-                       GOTO(out, updates = true);
+                       GOTO(out_lock, delay = true);
                }
        }
-       read_unlock(&qsd->qsd_lock);
+       qqi->qqi_reint = 1;
 
-       /* any pending quota request? */
-       cfs_hash_for_each_safe(qqi->qqi_site->lqs_hash, qsd_entry_iter_cb,
-                              &dqacq);
-       if (dqacq) {
-               CDEBUG(D_QUOTA, "%s: pending dqacq for type:%d.\n",
-                      qsd->qsd_svname, qqi->qqi_qtype);
-               updates = true;
-       }
        EXIT;
+out_lock:
+       write_unlock(&qsd->qsd_lock);
 out:
-       if (updates)
+       if (delay)
                CERROR("%s: Delaying reintegration for qtype:%d until pending "
                       "updates are flushed.\n",
                       qsd->qsd_svname, qqi->qqi_qtype);
-       return updates;
+       return delay;
 }
 
 int qsd_start_reint_thread(struct qsd_qtype_info *qqi)
 {
-       struct ptlrpc_thread    *thread = &qqi->qqi_reint_thread;
        struct qsd_instance     *qsd = qqi->qqi_qsd;
-       struct l_wait_info       lwi = { 0 };
+       struct task_struct      *task;
+       struct qsd_reint_args   *args;
+       DECLARE_COMPLETION_ONSTACK(started);
        int                      rc;
-       char                    *name;
        ENTRY;
 
-       /* don't bother to do reintegration when quota isn't enabled */
-       if (!qsd_type_enabled(qsd, qqi->qqi_qtype))
+       /* do not try to start a new thread as this can lead to a deadlock */
+       if (current->flags & (PF_MEMALLOC | PF_KSWAPD))
                RETURN(0);
 
-       if (qsd->qsd_acct_failed)
-               /* no space accounting support, can't enable enforcement */
+       if (qsd->qsd_dev->dd_rdonly)
                RETURN(0);
 
-       /* check if the reintegration has already started or finished */
-       write_lock(&qsd->qsd_lock);
-
-       if ((qqi->qqi_glb_uptodate && qqi->qqi_slv_uptodate) ||
-            qqi->qqi_reint || qsd->qsd_stopping) {
-               write_unlock(&qsd->qsd_lock);
+       /* don't bother to do reintegration when quota isn't enabled */
+       if (!qsd_type_enabled(qsd, qqi->qqi_qtype))
                RETURN(0);
-       }
-       qqi->qqi_reint = 1;
-
-       write_unlock(&qsd->qsd_lock);
 
-       /* there could be some unfinished global or index entry updates
-        * (very unlikely), to avoid them messing up with the reint
-        * procedure, we just return and try to re-start reint later. */
-       if (qsd_pending_updates(qqi)) {
-               write_lock(&qsd->qsd_lock);
-               qqi->qqi_reint = 0;
-               write_unlock(&qsd->qsd_lock);
+       if (qqi->qqi_acct_failed)
+               /* no space accounting support, can't enable enforcement */
                RETURN(0);
-       }
-
-       OBD_ALLOC(name, MTI_NAME_MAXLEN);
-       if (name == NULL)
-               RETURN(-ENOMEM);
 
-       snprintf(name, MTI_NAME_MAXLEN, "qsd_reint_%d.%s",
-                qqi->qqi_qtype, qsd->qsd_svname);
+       if (qqi_reint_delayed(qqi))
+               RETURN(0);
 
-       rc = PTR_ERR(kthread_run(qsd_reint_main, (void *)qqi, name));
-       OBD_FREE(name, MTI_NAME_MAXLEN);
+       OBD_ALLOC_PTR(args);
+       if (args == NULL)
+               GOTO(out, rc = -ENOMEM);
 
-       if (IS_ERR_VALUE(rc)) {
-               thread_set_flags(thread, SVC_STOPPED);
+       args->qra_started = &started;
+       args->qra_qqi = qqi;
+       /* initialize environment */
+       rc = lu_env_init(&args->qra_env, LCT_DT_THREAD);
+       if (rc)
+               GOTO(out_args, rc);
+       task = kthread_create(qsd_reint_main, args, "qsd_reint_%d.%s",
+                             qqi->qqi_qtype, qsd->qsd_svname);
+
+       if (IS_ERR(task)) {
+               rc = PTR_ERR(task);
+               lu_env_fini(&args->qra_env);
+out_args:
+               OBD_FREE_PTR(args);
+out:
                write_lock(&qsd->qsd_lock);
                qqi->qqi_reint = 0;
                write_unlock(&qsd->qsd_lock);
                RETURN(rc);
        }
 
-       l_wait_event(thread->t_ctl_waitq,
-                    thread_is_running(thread) || thread_is_stopped(thread),
-                    &lwi);
+       qqi->qqi_reint_task = task;
+       wake_up_process(task);
+       wait_for_completion(&started);
+
        RETURN(0);
 }