Whamcloud - gitweb
LU-18516 quota: use wait_woken for qsd_op_begin0() 56/58156/3
authorJames Simmons <jsimmons@infradead.org>
Sat, 22 Feb 2025 12:56:17 +0000 (07:56 -0500)
committerOleg Drokin <green@whamcloud.com>
Thu, 6 Mar 2025 08:07:27 +0000 (08:07 +0000)
Kernels with debugging enabled report for Lustre quota handling:

do not call blocking ops when !TASK_RUNNING;
? __might_sleep+0x9d/0xc0
  down_read_nested+0x2e/0x4b0
  lquota_disk_read+0x46e/0x800 [lquota]
  qsd_refresh_usage+0x105/0x3d0 [lquota]
  qsd_acquire+0xbe/0x7c0 [lquota]
  qsd_op_begin0+0x5f8/0xc80 [lquota]

This is due to qsd_acquire() performing operations that can sleep while
the kthread is in an idle state. The Linux kernel solution for this
is wait_woken(). Move the function qsd_op_begin0() from using
wait_event_idle_timeout() to wait_woken(). This will resolve the
potential sleeping issues.

Test-Parameters: trivial testlist=sanity-quota
Change-Id: Id2b7a5886869bf0ed3d560e159524dcda841d8b0
Signed-off-by: James Simmons <jsimmons@infradead.org>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58156
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Timothy Day <timday@amazon.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/quota/qsd_handler.c

index 3d7f82a..0567ad7 100644 (file)
@@ -707,10 +707,12 @@ static int qsd_op_begin0(const struct lu_env *env, struct qsd_qtype_info *qqi,
                         enum osd_quota_local_flags *local_flags)
 {
        struct lquota_entry *lqe;
+       DEFINE_WAIT_FUNC(wait, woken_wake_function);
        enum osd_quota_local_flags qtype_flag = 0;
-       int rc, ret = -EINPROGRESS;
-       ENTRY;
+       int rc = 0, ret = -EINPROGRESS;
+       long remaining;
 
+       ENTRY;
        if (qid->lqi_qentry != NULL) {
                /* we already had to deal with this id for this transaction */
                lqe = qid->lqi_qentry;
@@ -748,18 +750,24 @@ static int qsd_op_begin0(const struct lu_env *env, struct qsd_qtype_info *qqi,
        lqe_write_unlock(lqe);
 
        /* acquire quota space for the operation, cap overall wait time to
-        * prevent a service thread from being stuck for too long */
-       rc = wait_event_idle_timeout(
-               lqe->lqe_waiters, qsd_acquire(env, lqe, space, &ret),
-               cfs_time_seconds(qsd_wait_timeout(qqi->qqi_qsd)));
+        * prevent a service thread from being stuck for too long
+        */
+       remaining = cfs_time_seconds(qsd_wait_timeout(qqi->qqi_qsd));
+       add_wait_queue(&lqe->lqe_waiters, &wait);
+       do {
+               if (qsd_acquire(env, lqe, space, &ret))
+                       break;
 
-       if (rc > 0 && ret == 0) {
+               remaining = wait_woken(&wait, TASK_IDLE, remaining);
+       } while (remaining > 0);
+
+       if (remaining > 0 && ret == 0) {
                qid->lqi_space += space;
                rc = 0;
        } else {
-               if (rc > 0)
+               if (remaining > 0)
                        rc = ret;
-               else if (rc == 0)
+               else if (remaining == 0)
                        rc = -ETIMEDOUT;
 
                LQUOTA_DEBUG(lqe, "acquire quota failed:%d", rc);
@@ -769,25 +777,24 @@ static int qsd_op_begin0(const struct lu_env *env, struct qsd_qtype_info *qqi,
 
                if (local_flags && lqe->lqe_pending_write != 0)
                        /* Inform OSD layer that there are pending writes.
-                        * It might want to retry after a sync if appropriate */
+                        * It might want to retry after a sync if appropriate
+                        */
                         *local_flags |= QUOTA_FL_SYNC;
                lqe_write_unlock(lqe);
 
                /* convert recoverable error into -EINPROGRESS, client will
-                * retry */
+                * retry
+                */
                if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ENOLCK ||
                    rc == -EAGAIN || rc == -EINTR) {
                        rc = -EINPROGRESS;
                } else if (rc == -ESRCH) {
                        rc = 0;
-                       LQUOTA_ERROR(lqe, "ID isn't enforced on master, it "
-                                    "probably due to a legeal race, if this "
-                                    "message is showing up constantly, there "
-                                    "could be some inconsistence between "
-                                    "master & slave, and quota reintegration "
-                                    "needs be re-triggered.");
+                       LQUOTA_ERROR(lqe,
+                                    "ID isn't enforced on master, it probably due to a legeal race, if this message is showing up constantly, there could be some inconsistence between master & slave, and quota reintegration needs be re-triggered.");
                }
        }
+       remove_wait_queue(&lqe->lqe_waiters, &wait);
 
        if (local_flags != NULL) {
 out_flags: