From: Ben Evans Date: Wed, 1 Feb 2017 22:06:36 +0000 (-0600) Subject: LU-9221 jobstats: Create a pid-based hash for jobid values X-Git-Tag: 2.10.54~30 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=08479b74ec3599ee91e14f3f646389bb0aca4575;p=fs%2Flustre-release.git LU-9221 jobstats: Create a pid-based hash for jobid values Use cfs_hash_table to create a pid to jobID based mapping. Change default behavior of JobIDs to default to procname_uid if a suitable value cannot be found in the environment. All entries older than RESCAN_INTERVAL seonds are refreshed on access. Items can be purged by writing to procfs_name. "" will remove all entries When purging the cache, items older than DELETE_INTERVAL are deleted. Signed-off-by: Ben Evans Change-Id: I22e9d73c4585d7c5496829bc20bce191304e0d58 Reviewed-on: https://review.whamcloud.com/25208 Reviewed-by: Andreas Dilger Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andrew Perepechko Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 21a7419..969912e 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -54,7 +54,10 @@ extern rwlock_t obd_dev_lock; extern struct obd_device *class_conn2obd(struct lustre_handle *); extern struct obd_device *class_exp2obd(struct obd_export *); extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); -extern int lustre_get_jobid(char *jobid); +int lustre_get_jobid(char *jobid); +void lustre_jobid_clear(const char *jobid); +void jobid_cache_fini(void); +int jobid_cache_init(void); struct lu_device_type; diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index 9f9a8ea..3ab3240 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -13,7 +13,7 @@ obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o obdclass-all-objs += lu_object.o dt_object.o obdclass-all-objs += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o obdclass-all-objs += linkea.o -obdclass-all-objs += kernelcomm.o +obdclass-all-objs += kernelcomm.o jobid.o @SERVER_TRUE@obdclass-all-objs += acl.o @SERVER_TRUE@obdclass-all-objs += idmap.o diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 1eda71f..bbb9096 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -98,89 +98,11 @@ EXPORT_SYMBOL(at_extra); atomic_long_t obd_dirty_transit_pages; EXPORT_SYMBOL(obd_dirty_transit_pages); -char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; - #ifdef CONFIG_PROC_FS struct lprocfs_stats *obd_memory = NULL; EXPORT_SYMBOL(obd_memory); #endif -char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; - -/* Get jobid of current process by reading the environment variable - * stored in between the "env_start" & "env_end" of task struct. - * - * TODO: - * It's better to cache the jobid for later use if there is any - * efficient way, the cl_env code probably could be reused for this - * purpose. - * - * If some job scheduler doesn't store jobid in the "env_start/end", - * then an upcall could be issued here to get the jobid by utilizing - * the userspace tools/api. Then, the jobid must be cached. - */ -int lustre_get_jobid(char *jobid) -{ - int jobid_len = LUSTRE_JOBID_SIZE; - char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 }; - int rc = 0; - ENTRY; - - /* Jobstats isn't enabled */ - if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) - GOTO(out, rc = 0); - - /* Whole node dedicated to single job */ - if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { - memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); - GOTO(out, rc = 0); - } - - /* Use process name + fsuid as jobid */ - if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", - current_comm(), - from_kuid(&init_user_ns, current_fsuid())); - GOTO(out, rc = 0); - } - - rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len); - if (rc) { - if (rc == -EOVERFLOW) { - /* For the PBS_JOBID and LOADL_STEP_ID keys (which are - * variable length strings instead of just numbers), it - * might make sense to keep the unique parts for JobID, - * instead of just returning an error. That means a - * larger temp buffer for cfs_get_environ(), then - * truncating the string at some separator to fit into - * the specified jobid_len. Fix later if needed. */ - static bool printed; - if (unlikely(!printed)) { - LCONSOLE_ERROR_MSG(0x16b, "%s value too large " - "for JobID buffer (%d)\n", - obd_jobid_var, jobid_len); - printed = true; - } - } else { - CDEBUG((rc == -ENOENT || rc == -EINVAL || - rc == -EDEADLK) ? D_INFO : D_ERROR, - "Get jobid for (%s) failed: rc = %d\n", - obd_jobid_var, rc); - } - } - -out: - if (rc != 0) - RETURN(rc); - - /* Only replace the job ID if it changed. */ - if (strcmp(jobid, tmp_jobid) != 0) - memcpy(jobid, tmp_jobid, jobid_len); - - RETURN(0); -} -EXPORT_SYMBOL(lustre_get_jobid); - static int class_resolve_dev_name(__u32 len, const char *name) { int rc; diff --git a/lustre/obdclass/jobid.c b/lustre/obdclass/jobid.c new file mode 100644 index 0000000..8fc4956 --- /dev/null +++ b/lustre/obdclass/jobid.c @@ -0,0 +1,461 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2014, Intel Corporation. + * + * Copyright 2017 Cray Inc, all rights reserved. + * Author: Ben Evans. + * + * Store PID->JobID mappings + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#ifdef HAVE_UIDGID_HEADER +#include +#endif + +#include +#include +#include + +static struct cfs_hash *jobid_hash; +static struct cfs_hash_ops jobid_hash_ops; +spinlock_t jobid_hash_lock; + +#define RESCAN_INTERVAL 30 +#define DELETE_INTERVAL 300 + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; + +/** + * Structure to store a single jobID/PID mapping + */ +struct jobid_to_pid_map { + struct hlist_node jp_hash; + time64_t jp_time; + atomic_t jp_refcount; + spinlock_t jp_lock; /* protects jp_jobid */ + char jp_jobid[LUSTRE_JOBID_SIZE + 1]; + pid_t jp_pid; +}; + +/* Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/API. Then, the jobid must be cached. + */ +int get_jobid_from_environ(char *jobid_var, char *jobid, int jobid_len) +{ + int rc; + + rc = cfs_get_environ(jobid_var, jobid, &jobid_len); + if (!rc) + goto out; + + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static bool printed; + if (unlikely(!printed)) { + LCONSOLE_ERROR_MSG(0x16b, "%s value too large " + "for JobID buffer (%d)\n", + obd_jobid_var, jobid_len); + printed = true; + } + } else { + CDEBUG((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "Get jobid for (%s) failed: rc = %d\n", + obd_jobid_var, rc); + } + +out: + return rc; +} + +/* + * jobid_should_free_item + * + * Each item is checked to see if it should be released + * Removed from hash table by caller + * Actually freed in jobid_put_locked + * + * Returns 1 if item is to be freed, 0 if it is to be kept + */ + +static int jobid_should_free_item(void *obj, void *data) +{ + char *jobid = data; + struct jobid_to_pid_map *pidmap = obj; + int rc = 0; + + if (obj == NULL) + return 0; + + spin_lock(&pidmap->jp_lock); + if (jobid == NULL) + rc = 1; + else if (jobid[0] == '\0') + rc = 1; + else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) + rc = 1; + else if (strcmp(pidmap->jp_jobid, jobid) == 0) + rc = 1; + spin_unlock(&pidmap->jp_lock); + + return rc; +} + +/* + * check_job_name + * + * Checks if the jobid is a Lustre process + * + * Returns true if jobid is valid + * Returns false if jobid looks like it's a Lustre process + */ +static bool check_job_name(char *jobid) +{ + const char *const lustre_reserved[] = {"ll_ping", "ptlrpc", + "ldlm", "ll_sa", NULL}; + int i; + + for (i = 0; lustre_reserved[i] != NULL; i++) { + if (strncmp(jobid, lustre_reserved[i], + strlen(lustre_reserved[i])) == 0) + return false; + } + return true; +} + +/* + * get_jobid + * + * Returns the jobid for the current pid. + * + * If no jobid is found in the table, the jobid is calculated based on + * the value of jobid_var, using procname_uid as the default. + * + * Return: -ENOMEM if allocating a new pidmap fails + * 0 for success + */ +int get_jobid(char *jobid) +{ + pid_t pid = current_pid(); + struct jobid_to_pid_map *pidmap = NULL; + struct jobid_to_pid_map *pidmap2; + char tmp_jobid[LUSTRE_JOBID_SIZE + 1]; + int rc = 0; + ENTRY; + + pidmap = cfs_hash_lookup(jobid_hash, &pid); + if (pidmap == NULL) { + OBD_ALLOC_PTR(pidmap); + if (pidmap == NULL) + GOTO(out, rc = -ENOMEM); + + pidmap->jp_pid = pid; + pidmap->jp_time = 0; + pidmap->jp_jobid[0] = '\0'; + spin_lock_init(&pidmap->jp_lock); + INIT_HLIST_NODE(&pidmap->jp_hash); + + /* + * Add the newly created map to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * map. The object which exists in the hash will be + * returned. + */ + pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid, + &pidmap->jp_hash); + if (unlikely(pidmap != pidmap2)) { + CDEBUG(D_INFO, "Duplicate jobid found\n"); + OBD_FREE_PTR(pidmap); + pidmap = pidmap2; + } else { + cfs_hash_get(jobid_hash, &pidmap->jp_hash); + } + } + + spin_lock(&pidmap->jp_lock); + if ((ktime_get_real_seconds() - pidmap->jp_time >= RESCAN_INTERVAL) || + pidmap->jp_jobid[0] == '\0') { + /* mark the pidmap as being up to date, if we fail to find + * a good jobid, revert to the old time and try again later + * prevent a race with deletion */ + + time64_t tmp_time = pidmap->jp_time; + pidmap->jp_time = ktime_get_real_seconds(); + + spin_unlock(&pidmap->jp_lock); + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + rc = 1; + } else { + memset(tmp_jobid, '\0', LUSTRE_JOBID_SIZE + 1); + rc = get_jobid_from_environ(obd_jobid_var, + tmp_jobid, + LUSTRE_JOBID_SIZE + 1); + } + + /* Use process name + fsuid as jobid default, or when + * specified by "jobname_uid" */ + if (rc) { + snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u", + current_comm(), + from_kuid(&init_user_ns, current_fsuid())); + rc = 0; + } + + CDEBUG(D_INFO, "Jobid to pid mapping established: %d->%s\n", + pidmap->jp_pid, tmp_jobid); + + spin_lock(&pidmap->jp_lock); + if (check_job_name(tmp_jobid)) + strncpy(pidmap->jp_jobid, tmp_jobid, + LUSTRE_JOBID_SIZE); + else + pidmap->jp_time = tmp_time; + } + + if (strlen(pidmap->jp_jobid) != 0) + strncpy(jobid, pidmap->jp_jobid, LUSTRE_JOBID_SIZE); + + spin_unlock(&pidmap->jp_lock); + + cfs_hash_put(jobid_hash, &pidmap->jp_hash); + + EXIT; +out: + return rc; +} + +/* + * Hash initialization, copied from server-side job stats bucket sizes + */ +#define HASH_JOBID_BKT_BITS 5 +#define HASH_JOBID_CUR_BITS 7 +#define HASH_JOBID_MAX_BITS 12 + +int jobid_cache_init(void) +{ + int rc = 0; + struct cfs_hash *tmp_jobid_hash; + ENTRY; + + spin_lock_init(&jobid_hash_lock); + + tmp_jobid_hash = cfs_hash_create("JOBID_HASH", + HASH_JOBID_CUR_BITS, + HASH_JOBID_MAX_BITS, + HASH_JOBID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &jobid_hash_ops, + CFS_HASH_DEFAULT); + + spin_lock(&jobid_hash_lock); + if (jobid_hash == NULL) { + jobid_hash = tmp_jobid_hash; + spin_unlock(&jobid_hash_lock); + } else { + spin_unlock(&jobid_hash_lock); + if (tmp_jobid_hash != NULL) + cfs_hash_putref(tmp_jobid_hash); + } + + if (!jobid_hash) + rc = -ENOMEM; + + RETURN(rc); +} +EXPORT_SYMBOL(jobid_cache_init); + +void jobid_cache_fini(void) +{ + struct cfs_hash *tmp_hash; + ENTRY; + + spin_lock(&jobid_hash_lock); + tmp_hash = jobid_hash; + jobid_hash = NULL; + spin_unlock(&jobid_hash_lock); + + if (tmp_hash != NULL) { + cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); + cfs_hash_putref(tmp_hash); + } + + EXIT; +} +EXPORT_SYMBOL(jobid_cache_fini); + +/* + * Hash operations for pid<->jobid + */ +static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key, + unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(pid_t), mask); +} + +static void *jobid_key(struct hlist_node *hnode) +{ + struct jobid_to_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_to_pid_map, jp_hash); + return &pidmap->jp_pid; +} + +static int jobid_keycmp(const void *key, struct hlist_node *hnode) +{ + const pid_t *pid_key1; + const pid_t *pid_key2; + + LASSERT(key != NULL); + pid_key1 = (pid_t *)key; + pid_key2 = (pid_t *)jobid_key(hnode); + + return *pid_key1 == *pid_key2; +} + +static void *jobid_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct jobid_to_pid_map, jp_hash); +} + +static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_to_pid_map *pidmap; + + pidmap = hlist_entry(hnode, struct jobid_to_pid_map, jp_hash); + + atomic_inc(&pidmap->jp_refcount); +} + +static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) +{ + struct jobid_to_pid_map *pidmap; + + if (hnode == NULL) + return; + + pidmap = hlist_entry(hnode, struct jobid_to_pid_map, jp_hash); + LASSERT(atomic_read(&pidmap->jp_refcount) > 0); + if (atomic_dec_and_test(&pidmap->jp_refcount)) { + CDEBUG(D_INFO, "Freeing: %d->%s\n", + pidmap->jp_pid, pidmap->jp_jobid); + + OBD_FREE_PTR(pidmap); + } +} + +static struct cfs_hash_ops jobid_hash_ops = { + .hs_hash = jobid_hashfn, + .hs_keycmp = jobid_keycmp, + .hs_key = jobid_key, + .hs_object = jobid_object, + .hs_get = jobid_get, + .hs_put = jobid_put_locked, + .hs_put_locked = jobid_put_locked, +}; + +/* + * Return the jobid: + * + * Based on the value of obd_jobid_var + * JOBSTATS_DISABLE: none + * JOBSTATS_NODELOCAL: Contents of obd_jobid_name + * JOBSTATS_PROCNAME_UID: Process name/UID + * anything else: Look up the value in the processes environment + * default: JOBSTATS_PROCNAME_UID + */ + +int lustre_get_jobid(char *jobid) +{ + int rc = 0; + int clear = 0; + static time64_t last_delete; + ENTRY; + + LASSERT(jobid_hash != NULL); + + spin_lock(&jobid_hash_lock); + if (last_delete + DELETE_INTERVAL <= ktime_get_real_seconds()) { + clear = 1; + last_delete = ktime_get_real_seconds(); + } + spin_unlock(&jobid_hash_lock); + + if (clear) + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, + "intentionally_bad_jobid"); + + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + /* Jobstats isn't enabled */ + memset(jobid, 0, LUSTRE_JOBID_SIZE); + else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) + /* Whole node dedicated to single job */ + memcpy(jobid, obd_jobid_node, LUSTRE_JOBID_SIZE); + else + /* Get jobid from hash table */ + rc = get_jobid(jobid); + + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +/* + * lustre_jobid_clear + * + * uses value pushed in via jobid_name + * If any entries in the hash table match the value, they are removed + */ +void lustre_jobid_clear(const char *data) +{ + char jobid[LUSTRE_JOBID_SIZE + 1]; + + if (jobid_hash == NULL) + return; + + strncpy(jobid, data, LUSTRE_JOBID_SIZE); + /* trim \n off the end of the incoming jobid */ + if (jobid[strlen(jobid) - 1] == '\n') + jobid[strlen(jobid) - 1] = '\0'; + + CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid); + cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid); + + CDEBUG(D_INFO, "%d items remain in jobID table\n", + atomic_read(&jobid_hash->hs_count)); +} diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c index d67aa05..c59c9a7 100644 --- a/lustre/obdclass/linux/linux-module.c +++ b/lustre/obdclass/linux/linux-module.c @@ -387,6 +387,11 @@ static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, if (!count || count > LUSTRE_JOBID_SIZE) return -EINVAL; + if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0) { + lustre_jobid_clear(buffer); + return count; + } + /* clear previous value */ memset(obd_jobid_node, 0, LUSTRE_JOBID_SIZE); @@ -538,6 +543,12 @@ int class_procfs_init(void) goto out; } + rc = jobid_cache_init(); + if (rc) { + kobject_put(lustre_kobj); + goto out; + } + debugfs_lustre_root = debugfs_create_dir("lustre", NULL); if (IS_ERR_OR_NULL(debugfs_lustre_root)) { rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root) @@ -575,6 +586,7 @@ int class_procfs_clean(void) debugfs_remove_recursive(debugfs_lustre_root); debugfs_lustre_root = NULL; + jobid_cache_fini(); if (proc_lustre_root) lprocfs_remove(&proc_lustre_root);