From d527e812461baf9db2f6ed960a3b6cc12d4ab37c Mon Sep 17 00:00:00 2001 From: Yang Sheng Date: Mon, 29 Nov 2021 23:00:03 +0800 Subject: [PATCH] LU-15283 quota: deadlock between reint & lquota_wb The reintegration thread may still be running while the lquota_wb thread processes the update record. The reint thread will hold the dynlock and start a transaction, lquota_wb thread will start a transaction then try to grab the dynlock. So we must avoid the reint & writeback thread running in parallel. This issue only occurs in the ldiskfs case. COMMAND: "qsd_reint_2.wor" __schedule schedule wait_transaction_locked [jbd2] add_transaction_credits [jbd2] start_this_handle [jbd2] jbd2__journal_start [jbd2] __ldiskfs_journal_start_sb [ldiskfs] ldiskfs_release_dquot [ldiskfs] dqput dquot_get_dqblk osd_acct_index_lookup [osd_ldiskfs] lquota_disk_read [lquota] qsd_refresh_usage [lquota] qsd_reconciliation [lquota] qsd_reint_main [lquota] kthread ret_from_fork COMMAND: "lquota_wb_work-" __schedule schedule dynlock_lock [osd_ldiskfs] __iam_it_get [osd_ldiskfs] iam_it_get [osd_ldiskfs] osd_index_iam_lookup [osd_ldiskfs] lquota_disk_write [lquota] qsd_update_index [lquota] qsd_upd_thread [lquota] kthread ret_from_fork Signed-off-by: Yang Sheng Change-Id: I8cdd6227d3b0c5d4f67c432c3129da42a83c0ef2 Reviewed-on: https://review.whamcloud.com/45667 Tested-by: jenkins Reviewed-by: Sergey Cheremencev Tested-by: Maloo Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- lustre/include/lustre_quota.h | 2 +- lustre/osd-ldiskfs/osd_handler.c | 6 ++-- lustre/osd-zfs/osd_handler.c | 6 ++-- lustre/quota/qsd_internal.h | 4 ++- lustre/quota/qsd_lib.c | 5 +++- lustre/quota/qsd_reint.c | 65 +++++++++++++++++----------------------- lustre/quota/qsd_writeback.c | 46 ++++++++++++++++++++++++---- 7 files changed, 82 insertions(+), 52 deletions(-) diff --git a/lustre/include/lustre_quota.h b/lustre/include/lustre_quota.h index 9844dbf..4b674d8 100644 --- a/lustre/include/lustre_quota.h +++ 
b/lustre/include/lustre_quota.h @@ -185,7 +185,7 @@ enum osd_quota_local_flags { }; struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, - struct proc_dir_entry *, bool is_md); + struct proc_dir_entry *, bool is_md, bool excl); int qsd_prepare(const struct lu_env *, struct qsd_instance *); int qsd_start(const struct lu_env *, struct qsd_instance *); void qsd_fini(const struct lu_env *, struct qsd_instance *); diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 55403b4..d206292 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -8207,8 +8207,8 @@ static int osd_device_init0(const struct lu_env *env, /* currently it's no need to prepare qsd_instance_md for OST */ if (!o->od_is_ost) { o->od_quota_slave_md = qsd_init(env, o->od_svname, - &o->od_dt_dev, - o->od_proc_entry, true); + &o->od_dt_dev, o->od_proc_entry, + true, true); if (IS_ERR(o->od_quota_slave_md)) { rc = PTR_ERR(o->od_quota_slave_md); o->od_quota_slave_md = NULL; @@ -8217,7 +8217,7 @@ static int osd_device_init0(const struct lu_env *env, } o->od_quota_slave_dt = qsd_init(env, o->od_svname, &o->od_dt_dev, - o->od_proc_entry, false); + o->od_proc_entry, false, true); if (IS_ERR(o->od_quota_slave_dt)) { if (o->od_quota_slave_md != NULL) { diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index 8a9d4c2..f4f3ea8 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -1208,8 +1208,8 @@ static int osd_mount(const struct lu_env *env, /* currently it's no need to prepare qsd_instance_md for OST */ if (!o->od_is_ost) { o->od_quota_slave_md = qsd_init(env, o->od_svname, - &o->od_dt_dev, - o->od_proc_entry, true); + &o->od_dt_dev, o->od_proc_entry, + true, false); if (IS_ERR(o->od_quota_slave_md)) { rc = PTR_ERR(o->od_quota_slave_md); o->od_quota_slave_md = NULL; @@ -1218,7 +1218,7 @@ static int osd_mount(const struct lu_env *env, } o->od_quota_slave_dt = qsd_init(env, 
o->od_svname, &o->od_dt_dev, - o->od_proc_entry, false); + o->od_proc_entry, false, false); if (IS_ERR(o->od_quota_slave_dt)) { if (o->od_quota_slave_md != NULL) { diff --git a/lustre/quota/qsd_internal.h b/lustre/quota/qsd_internal.h index 7144e2e..b590b84 100644 --- a/lustre/quota/qsd_internal.h +++ b/lustre/quota/qsd_internal.h @@ -111,7 +111,9 @@ struct qsd_instance { qsd_prepared:1, /* qsd_prepare() successfully * called */ qsd_exp_valid:1,/* qsd_exp is now valid */ - qsd_stopping:1; /* qsd_instance is stopping */ + qsd_stopping:1, /* qsd_instance is stopping */ + qsd_updating:1, /* qsd is updating record */ + qsd_exclusive:1; /* upd exclusive with reint */ }; diff --git a/lustre/quota/qsd_lib.c b/lustre/quota/qsd_lib.c index 03e6f79..6b827cb 100644 --- a/lustre/quota/qsd_lib.c +++ b/lustre/quota/qsd_lib.c @@ -651,7 +651,8 @@ EXPORT_SYMBOL(qsd_fini); */ struct qsd_instance *qsd_init(const struct lu_env *env, char *svname, struct dt_device *dev, - struct proc_dir_entry *osd_proc, bool is_md) + struct proc_dir_entry *osd_proc, + bool is_md, bool excl) { struct qsd_thread_info *qti = qsd_info(env); struct qsd_instance *qsd; @@ -677,6 +678,8 @@ struct qsd_instance *qsd_init(const struct lu_env *env, char *svname, qsd->qsd_prepared = false; qsd->qsd_started = false; qsd->qsd_is_md = is_md; + qsd->qsd_updating = false; + qsd->qsd_exclusive = excl; /* copy service name */ if (strlcpy(qsd->qsd_svname, svname, sizeof(qsd->qsd_svname)) diff --git a/lustre/quota/qsd_reint.c b/lustre/quota/qsd_reint.c index ac9b86c..666df96 100644 --- a/lustre/quota/qsd_reint.c +++ b/lustre/quota/qsd_reint.c @@ -575,13 +575,13 @@ static int qsd_entry_iter_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd, return 0; } -static bool qsd_pending_updates(struct qsd_qtype_info *qqi) +static bool qqi_reint_delayed(struct qsd_qtype_info *qqi) { struct qsd_instance *qsd = qqi->qqi_qsd; struct qsd_upd_rec *upd; struct lquota_entry *lqe, *n; int dqacq = 0; - bool updates = false; + bool delay = false; 
ENTRY; /* any pending quota adjust? */ @@ -594,35 +594,46 @@ static bool qsd_pending_updates(struct qsd_qtype_info *qqi) } spin_unlock(&qsd->qsd_adjust_lock); + /* any pending quota request? */ + cfs_hash_for_each_safe(qqi->qqi_site->lqs_hash, qsd_entry_iter_cb, + &dqacq); + if (dqacq) { + CDEBUG(D_QUOTA, "%s: pending dqacq for type:%d.\n", + qsd->qsd_svname, qqi->qqi_qtype); + GOTO(out, delay = true); + } + /* any pending updates? */ - read_lock(&qsd->qsd_lock); + write_lock(&qsd->qsd_lock); + + /* check if the reintegration has already started or finished */ + if ((qqi->qqi_glb_uptodate && qqi->qqi_slv_uptodate) || + qqi->qqi_reint || qsd->qsd_stopping || qsd->qsd_updating) + GOTO(out_lock, delay = true); + + /* there could be some unfinished global or index entry updates + * (very unlikely), to avoid them messing up with the reint + * procedure, we just return and try to re-start reint later. */ list_for_each_entry(upd, &qsd->qsd_upd_list, qur_link) { if (upd->qur_qqi == qqi) { - read_unlock(&qsd->qsd_lock); CDEBUG(D_QUOTA, "%s: pending %s updates for type:%d.\n", qsd->qsd_svname, upd->qur_global ? "global" : "slave", qqi->qqi_qtype); - GOTO(out, updates = true); + GOTO(out_lock, delay = true); } } - read_unlock(&qsd->qsd_lock); + qqi->qqi_reint = 1; - /* any pending quota request? 
*/ - cfs_hash_for_each_safe(qqi->qqi_site->lqs_hash, qsd_entry_iter_cb, - &dqacq); - if (dqacq) { - CDEBUG(D_QUOTA, "%s: pending dqacq for type:%d.\n", - qsd->qsd_svname, qqi->qqi_qtype); - updates = true; - } EXIT; +out_lock: + write_unlock(&qsd->qsd_lock); out: - if (updates) + if (delay) CERROR("%s: Delaying reintegration for qtype:%d until pending " "updates are flushed.\n", qsd->qsd_svname, qqi->qqi_qtype); - return updates; + return delay; } int qsd_start_reint_thread(struct qsd_qtype_info *qqi) @@ -649,28 +660,8 @@ int qsd_start_reint_thread(struct qsd_qtype_info *qqi) /* no space accounting support, can't enable enforcement */ RETURN(0); - /* check if the reintegration has already started or finished */ - write_lock(&qsd->qsd_lock); - - if ((qqi->qqi_glb_uptodate && qqi->qqi_slv_uptodate) || - qqi->qqi_reint || qsd->qsd_stopping) { - write_unlock(&qsd->qsd_lock); + if (qqi_reint_delayed(qqi)) RETURN(0); - } - qqi->qqi_reint = 1; - - write_unlock(&qsd->qsd_lock); - - /* there could be some unfinished global or index entry updates - * (very unlikely), to avoid them messing up with the reint - * procedure, we just return and try to re-start reint later. 
*/ - if (qsd_pending_updates(qqi)) { - write_lock(&qsd->qsd_lock); - qqi->qqi_reint = 0; - write_unlock(&qsd->qsd_lock); - RETURN(0); - } - OBD_ALLOC_PTR(args); if (args == NULL) diff --git a/lustre/quota/qsd_writeback.c b/lustre/quota/qsd_writeback.c index 6584df5..e32045e 100644 --- a/lustre/quota/qsd_writeback.c +++ b/lustre/quota/qsd_writeback.c @@ -280,9 +280,18 @@ static int qsd_process_upd(const struct lu_env *env, struct qsd_upd_rec *upd) { struct lquota_entry *lqe = upd->qur_lqe; struct qsd_qtype_info *qqi = upd->qur_qqi; + struct qsd_instance *qsd = qqi->qqi_qsd; int rc; ENTRY; + if (qsd->qsd_exclusive) { /* It could be deadlock running with reint */ + read_lock(&qsd->qsd_lock); + rc = qqi->qqi_reint; + read_unlock(&qsd->qsd_lock); + if (rc) + return 1; + } + if (lqe == NULL) { lqe = lqe_locate(env, qqi->qqi_site, &upd->qur_qid); if (IS_ERR(lqe)) @@ -298,9 +307,9 @@ static int qsd_process_upd(const struct lu_env *env, struct qsd_upd_rec *upd) /* refresh usage */ qsd_refresh_usage(env, lqe); - spin_lock(&qqi->qqi_qsd->qsd_adjust_lock); + spin_lock(&qsd->qsd_adjust_lock); lqe->lqe_adjust_time = 0; - spin_unlock(&qqi->qqi_qsd->qsd_adjust_lock); + spin_unlock(&qsd->qsd_adjust_lock); /* Report usage asynchronously */ rc = qsd_adjust(env, lqe); @@ -411,6 +420,8 @@ static bool qsd_job_pending(struct qsd_instance *qsd, struct list_head *upd, list_splice_init(&qsd->qsd_upd_list, upd); job_pending = true; } + if (qsd->qsd_exclusive) + qsd->qsd_updating = job_pending; for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) { struct qsd_qtype_info *qqi = qsd->qsd_type_array[qtype]; @@ -466,15 +477,38 @@ static int qsd_upd_thread(void *_args) complete(args->qua_started); while (({set_current_state(TASK_IDLE); !kthread_should_stop(); })) { + int count = 0; if (!qsd_job_pending(qsd, &queue, &uptodate)) schedule_timeout(cfs_time_seconds(QSD_WB_INTERVAL)); __set_current_state(TASK_RUNNING); - list_for_each_entry_safe(upd, n, &queue, qur_link) { - 
list_del_init(&upd->qur_link); - qsd_process_upd(env, upd); - qsd_upd_free(upd); + while (1) { + list_for_each_entry_safe(upd, n, &queue, qur_link) { + if (qsd_process_upd(env, upd) <= 0) { + list_del_init(&upd->qur_link); + qsd_upd_free(upd); + } + } + if (list_empty(&queue)) + break; + count++; + if (count % 7 == 0) { + n = list_entry(&queue, struct qsd_upd_rec, + qur_link); + CWARN("%s: The reintegration thread [%d] " + "blocked more than %ld seconds\n", + n->qur_qqi->qqi_qsd->qsd_svname, + n->qur_qqi->qqi_qtype, count * + cfs_time_seconds(QSD_WB_INTERVAL) / 10); + } + schedule_timeout_interruptible( + cfs_time_seconds(QSD_WB_INTERVAL) / 10); + } + if (qsd->qsd_exclusive) { + write_lock(&qsd->qsd_lock); + qsd->qsd_updating = false; + write_unlock(&qsd->qsd_lock); } spin_lock(&qsd->qsd_adjust_lock); -- 1.8.3.1