From: Shaun Tancheff Date: Wed, 5 Feb 2025 16:56:47 +0000 (+0700) Subject: LU-18610 obdclass: add job expired flag X-Git-Tag: 2.16.53~93 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=b1a07c17b620e7ed6437d35b477f3b34c21f9200;p=fs%2Flustre-release.git LU-18610 obdclass: add job expired flag In lprocfs_job_cleanup() expired jobs are de-referenced before being removed from the lru to defer holding a spinlock. This opens a race where a job can be put multiple times when only a single put on expiry is expected. To close this double de-reference race, use a bit flag to skip the extra de-reference on jobs that are already in the process of being expired and removed. HPE-bug-id: LUS-12670 Test-Parameters: testlist=sanity env=ONLY=205,ONLY_REPEAT=100 Fixes: cad59b9b72 ("LU-18351 obdclass: jobstat scaling") Signed-off-by: Shaun Tancheff Change-Id: Ia7dc91cac313919827cc13db971ffb3debe318c2 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57616 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin Reviewed-by: James Simmons --- diff --git a/lustre/obdclass/lprocfs_jobstats.c b/lustre/obdclass/lprocfs_jobstats.c index c68818b..30f7e48 100644 --- a/lustre/obdclass/lprocfs_jobstats.c +++ b/lustre/obdclass/lprocfs_jobstats.c @@ -18,6 +18,10 @@ #ifdef CONFIG_PROC_FS +enum js_info_flags { + JS_EXPIRED, /* job is timed out and scheduled for removal */ +}; + #define JOB_CLEANUP_BATCH 1024 /* * JobID formats & JobID environment variable names for supported @@ -49,6 +53,7 @@ struct job_stat { struct rb_node js_idnode; /* js_jobid sorted node */ struct rb_node js_posnode; /* pos sorted node */ struct list_head js_lru; /* on ojs_lru, with ojs_lock */ + unsigned long js_flags; /* JS_* flags */ struct llist_node js_deleted; /* on ojs_deleted w/ojs_lock */ u64 js_pos_id; /* pos for job stats seq file */ struct kref js_refcount; /* num users of this struct */ @@ -180,6 +185,9 @@ static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool clear) 
list_for_each_entry_rcu(job, &stats->ojs_lru, js_lru) { if (!ktime_before(job->js_time_latest, oldest)) break; + /* only put jobs not already marked expired */ + if (test_and_set_bit(JS_EXPIRED, &job->js_flags)) + continue; job_putref(job); /* drop ref to initiate removal */ } rcu_read_unlock(); @@ -213,6 +221,7 @@ static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) job->js_jobstats = jobs; RB_CLEAR_NODE(&job->js_idnode); INIT_LIST_HEAD(&job->js_lru); + clear_bit(JS_EXPIRED, &job->js_flags); /* open code init_llist_node */ job->js_deleted.next = &job->js_deleted; kref_init(&job->js_refcount); @@ -319,6 +328,8 @@ static struct job_stat *job_insert(struct obd_job_stats *stats, struct job_stat *existing_job; existing_job = container_of(node, struct job_stat, js_idnode); + if (test_bit(JS_EXPIRED, &existing_job->js_flags)) + return ERR_PTR(-EAGAIN); if (kref_get_unless_zero(&existing_job->js_refcount)) return existing_job; /* entry is being deleted */