From 75a11fe9f6af21d75c75480c2838b00a122fdaca Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Fri, 12 Oct 2012 15:23:13 -0600 Subject: [PATCH] LU-2163 lprocfs: fix jobstats initialization race If two threads are racing to add the same jobid into the job stats list in lprocfs_job_stats_log(), one thread will lose the race from cfs_hash_findadd_unique() and enter the "if (job != job2)" case. It could fail LASSERT(!cfs_list_empty(&job->js_list)) depending whether the other thread in "else" added "job2" to the list first or not. Simply locking the check for cfs_list_empty(&job->js_list) is not sufficient to fix the race. There would need to be locking over the whole cfs_hash_findadd_unique() and cfs_list_add() calls, but since ojs_lock is global for the whole OST this may have performance costs. Instead, just remove the LASSERT() entirely, since it provides no value, and the "losing" thread can happily use the job_stat struct immediately since it was fully initialized in job_alloc(). Signed-off-by: Andreas Dilger Change-Id: Iecb17e2dc80621fd388295998df5708bcaabcab0 Reviewed-on: http://review.whamcloud.com/4263 Tested-by: Hudson Reviewed-by: Oleg Drokin --- lustre/obdclass/class_obd.c | 2 +- lustre/obdclass/lprocfs_jobstats.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index dd981e9..8d4b47f 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -129,7 +129,7 @@ int lustre_get_jobid(char *jobid) /* Use process name + fsuid as jobid */ if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s_%u", + snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", cfs_curproc_comm(), cfs_curproc_fsuid()); RETURN(0); } diff --git a/lustre/obdclass/lprocfs_jobstats.c b/lustre/obdclass/lprocfs_jobstats.c index f369f97..6adbe83 100644 --- a/lustre/obdclass/lprocfs_jobstats.c +++ b/lustre/obdclass/lprocfs_jobstats.c @@ -245,7 +245,11 @@ int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, if (job2 != job) { job_putref(job); job = job2; - LASSERT(!cfs_list_empty(&job->js_list)); + /* We cannot LASSERT(!cfs_list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. LU-2163 */ } else { LASSERT(cfs_list_empty(&job->js_list)); cfs_write_lock(&stats->ojs_lock); -- 1.8.3.1