From 28b0398689a3627f626d434142f0e16b987061cd Mon Sep 17 00:00:00 2001 From: James Simmons Date: Sat, 22 Feb 2025 07:56:17 -0500 Subject: [PATCH] LU-18516 quota: use wait_woken for qsd_op_begin0() Kernels with debugging enabled report for Lustre quota handling: do not call blocking ops when !TASK_RUNNING; ? __might_sleep+0x9d/0xc0 down_read_nested+0x2e/0x4b0 lquota_disk_read+0x46e/0x800 [lquota] qsd_refresh_usage+0x105/0x3d0 [lquota] qsd_acquire+0xbe/0x7c0 [lquota] qsd_op_begin0+0x5f8/0xc80 [lquota] This is due to qsd_acquire() performing operations that can sleep while the kthread is in an idle state. The Linux kernel solution for this is wait_woken(). Move the function qsd_op_begin0() from using wait_event_idle_timeout() to wait_woken(). This will resolve the potential sleeping issues. Lustre-change: https://review.whamcloud.com/58156 Lustre-commit: 2417aeddc649840179a2575a28fdedf3fe662916 Test-Parameters: trivial testlist=sanity-quota Change-Id: Id2b7a5886869bf0ed3d560e159524dcda841d8b0 Signed-off-by: James Simmons Reviewed-by: Andreas Dilger Reviewed-by: Timothy Day Reviewed-by: Sergey Cheremencev Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59141 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/quota/qsd_handler.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/lustre/quota/qsd_handler.c b/lustre/quota/qsd_handler.c index f9f8987..c186a34 100644 --- a/lustre/quota/qsd_handler.c +++ b/lustre/quota/qsd_handler.c @@ -706,10 +706,12 @@ static int qsd_op_begin0(const struct lu_env *env, struct qsd_qtype_info *qqi, enum osd_quota_local_flags *local_flags) { struct lquota_entry *lqe; + DEFINE_WAIT_FUNC(wait, woken_wake_function); enum osd_quota_local_flags qtype_flag = 0; - int rc, ret = -EINPROGRESS; - ENTRY; + int rc = 0, ret = -EINPROGRESS; + long remaining; + ENTRY; if (qid->lqi_qentry != NULL) { /* we already had to deal with this id for this transaction */ lqe = qid->lqi_qentry; @@ -747,18 +749,24 @@ static int qsd_op_begin0(const struct lu_env *env, struct qsd_qtype_info *qqi, lqe_write_unlock(lqe); /* acquire quota space for the operation, cap overall wait time to - * prevent a service thread from being stuck for too long */ - rc = wait_event_idle_timeout( - lqe->lqe_waiters, qsd_acquire(env, lqe, space, &ret), - cfs_time_seconds(qsd_wait_timeout(qqi->qqi_qsd))); + * prevent a service thread from being stuck for too long + */ + remaining = cfs_time_seconds(qsd_wait_timeout(qqi->qqi_qsd)); + add_wait_queue(&lqe->lqe_waiters, &wait); + do { + if (qsd_acquire(env, lqe, space, &ret)) + break; + + remaining = wait_woken(&wait, TASK_IDLE, remaining); + } while (remaining > 0); - if (rc > 0 && ret == 0) { + if (remaining > 0 && ret == 0) { qid->lqi_space += space; rc = 0; } else { - if (rc > 0) + if (remaining > 0) rc = ret; - else if (rc == 0) + else if (remaining == 0) rc = -ETIMEDOUT; LQUOTA_DEBUG(lqe, "acquire quota failed:%d", rc); @@ -768,25 +776,24 @@ static int qsd_op_begin0(const struct lu_env *env, struct qsd_qtype_info *qqi, if (local_flags && lqe->lqe_pending_write != 0) /* Inform OSD layer that there are pending writes. - * It might want to retry after a sync if appropriate */ + * It might want to retry after a sync if appropriate + */ *local_flags |= QUOTA_FL_SYNC; lqe_write_unlock(lqe); /* convert recoverable error into -EINPROGRESS, client will - * retry */ + * retry + */ if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ENOLCK || rc == -EAGAIN || rc == -EINTR) { rc = -EINPROGRESS; } else if (rc == -ESRCH) { rc = 0; - LQUOTA_ERROR(lqe, "ID isn't enforced on master, it " - "probably due to a legeal race, if this " - "message is showing up constantly, there " - "could be some inconsistence between " - "master & slave, and quota reintegration " - "needs be re-triggered."); + LQUOTA_ERROR(lqe, + "ID isn't enforced on master, it probably due to a legeal race, if this message is showing up constantly, there could be some inconsistence between master & slave, and quota reintegration needs be re-triggered."); } } + remove_wait_queue(&lqe->lqe_waiters, &wait); if (local_flags != NULL) { out_flags: -- 1.8.3.1