From: Patrick Farrell Date: Tue, 13 Dec 2016 15:43:34 +0000 (-0600) Subject: LU-8926 llite: reduce jobstats race window X-Git-Tag: 2.9.52~12 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8eca92b365fd3efd1541a48b1bb239926838d947;hp=f1af92e7fc249fd4ce54f0c61b04388a3b0da6bb LU-8926 llite: reduce jobstats race window In the current code, lli_jobid is set to zero on every call to lustre_get_jobid. This causes problems, because it's used asynchronously to set the job id in RPCs, and some RPCs will falsely get no jobid set. (For small IO sizes, this can be up to 60% of RPCs.) It would be very expensive to put hard synchronization between this and every outbound RPC, and it's OK to very rarely get an RPC without correct job stats info. This patch only updates the lli_jobid when the job id has changed, which leaves only a very small window for reading an inconsistent job id. Signed-off-by: Patrick Farrell Change-Id: I6c3a7f8683dc5f5d467940920938db18b0c20462 Reviewed-on: https://review.whamcloud.com/24253 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Chris Horn --- diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index e542d38..f750a12 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -955,6 +955,7 @@ void ll_lli_init(struct ll_inode_info *lli) lli->lli_async_rc = 0; } mutex_init(&lli->lli_layout_mutex); + memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE); } static inline int ll_bdi_register(struct backing_dev_info *bdi) diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 293b559..cf38cde 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -126,29 +126,29 @@ char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; int lustre_get_jobid(char *jobid) { int jobid_len = LUSTRE_JOBID_SIZE; + char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 }; int rc = 0; ENTRY; - memset(jobid, 0, LUSTRE_JOBID_SIZE); /* Jobstats isn't enabled */ if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) - RETURN(0); + GOTO(out, rc = 0); /* Whole node dedicated to single job */ if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { - memcpy(jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); - RETURN(0); + memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); + GOTO(out, rc = 0); } /* Use process name + fsuid as jobid */ if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(jobid, LUSTRE_JOBID_SIZE, "%s.%u", + snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", current_comm(), from_kuid(&init_user_ns, current_fsuid())); - RETURN(0); + GOTO(out, rc = 0); } - rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len); + rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len); if (rc) { if (rc == -EOVERFLOW) { /* For the PBS_JOBID and LOADL_STEP_ID keys (which are @@ -172,7 +172,16 @@ int lustre_get_jobid(char *jobid) obd_jobid_var, rc); } } - RETURN(rc); + +out: + if (rc != 0) + RETURN(rc); + + /* Only replace the job ID if it changed. */ + if (strcmp(jobid, tmp_jobid) != 0) + memcpy(jobid, tmp_jobid, jobid_len); + + RETURN(0); } EXPORT_SYMBOL(lustre_get_jobid);