Whamcloud - gitweb
LU-18610 obdclass: add job expired flag 16/57616/7
author Shaun Tancheff <shaun.tancheff@hpe.com>
Wed, 5 Feb 2025 16:56:47 +0000 (23:56 +0700)
committer Oleg Drokin <green@whamcloud.com>
Fri, 28 Feb 2025 08:12:37 +0000 (08:12 +0000)
In lprocfs_job_cleanup() expired jobs are de-referenced before
being removed from the LRU, to defer holding a spinlock.
This opens a race where a job can be put multiple times
when only a single put on expiry is expected. To close this double
de-reference race, use a bit flag to skip the extra de-reference
on jobs that are already in the process of being expired and removed.

HPE-bug-id: LUS-12670
Test-Parameters: testlist=sanity env=ONLY=205,ONLY_REPEAT=100
Fixes: cad59b9b72 ("LU-18351 obdclass: jobstat scaling")
Signed-off-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Change-Id: Ia7dc91cac313919827cc13db971ffb3debe318c2
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57616
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
lustre/obdclass/lprocfs_jobstats.c

index c68818b..30f7e48 100644 (file)
 
 #ifdef CONFIG_PROC_FS
 
+enum js_info_flags {
+       JS_EXPIRED,             /* job is timed out and scheduled for removal */
+};
+
 #define JOB_CLEANUP_BATCH 1024
 /*
  * JobID formats & JobID environment variable names for supported
@@ -49,6 +53,7 @@ struct job_stat {
        struct rb_node          js_idnode;      /* js_jobid sorted node */
        struct rb_node          js_posnode;     /* pos sorted node */
        struct list_head        js_lru;         /* on ojs_lru, with ojs_lock */
+       unsigned long           js_flags;       /* JS_* flags */
        struct llist_node       js_deleted;     /* on ojs_deleted w/ojs_lock */
        u64                     js_pos_id;      /* pos for job stats seq file */
        struct kref             js_refcount;    /* num users of this struct */
@@ -180,6 +185,9 @@ static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool clear)
        list_for_each_entry_rcu(job, &stats->ojs_lru, js_lru) {
                if (!ktime_before(job->js_time_latest, oldest))
                        break;
+               /* skip jobs already marked expired; put each job only once */
+               if (test_and_set_bit(JS_EXPIRED, &job->js_flags))
+                       continue;
                job_putref(job); /* drop ref to initiate removal */
        }
        rcu_read_unlock();
@@ -213,6 +221,7 @@ static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
        job->js_jobstats = jobs;
        RB_CLEAR_NODE(&job->js_idnode);
        INIT_LIST_HEAD(&job->js_lru);
+       clear_bit(JS_EXPIRED, &job->js_flags);
        /* open code init_llist_node */
        job->js_deleted.next = &job->js_deleted;
        kref_init(&job->js_refcount);
@@ -319,6 +328,8 @@ static struct job_stat *job_insert(struct obd_job_stats *stats,
                struct job_stat *existing_job;
 
                existing_job = container_of(node, struct job_stat, js_idnode);
+               if (test_bit(JS_EXPIRED, &existing_job->js_flags))
+                       return ERR_PTR(-EAGAIN);
                if (kref_get_unless_zero(&existing_job->js_refcount))
                        return existing_job;
                /* entry is being deleted */