Whamcloud - gitweb
LU-9221 jobstats: Create a pid-based hash for jobid values 08/25208/26
authorBen Evans <bevans@cray.com>
Wed, 1 Feb 2017 22:06:36 +0000 (16:06 -0600)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 21 Sep 2017 06:13:14 +0000 (06:13 +0000)
Use cfs_hash_table to create a pid to jobID based mapping.
Change default behavior of JobIDs to default to procname_uid if
a suitable value cannot be found in the environment.

All entries older than RESCAN_INTERVAL seconds are refreshed
on access.
Items can be purged by writing to procfs_name.
"" will remove all entries
When purging the cache, items older than DELETE_INTERVAL are
deleted.

Signed-off-by: Ben Evans <bevans@cray.com>
Change-Id: I22e9d73c4585d7c5496829bc20bce191304e0d58
Reviewed-on: https://review.whamcloud.com/25208
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@seagate.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/obd_class.h
lustre/obdclass/Makefile.in
lustre/obdclass/class_obd.c
lustre/obdclass/jobid.c [new file with mode: 0644]
lustre/obdclass/linux/linux-module.c

index 21a7419..969912e 100644 (file)
@@ -54,7 +54,10 @@ extern rwlock_t obd_dev_lock;
 extern struct obd_device *class_conn2obd(struct lustre_handle *);
 extern struct obd_device *class_exp2obd(struct obd_export *);
 extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
-extern int lustre_get_jobid(char *jobid);
+int lustre_get_jobid(char *jobid);
+void lustre_jobid_clear(const char *jobid);
+void jobid_cache_fini(void);
+int jobid_cache_init(void);
 
 struct lu_device_type;
 
index 9f9a8ea..3ab3240 100644 (file)
@@ -13,7 +13,7 @@ obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o
 obdclass-all-objs += lu_object.o dt_object.o
 obdclass-all-objs += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o
 obdclass-all-objs += linkea.o
-obdclass-all-objs += kernelcomm.o
+obdclass-all-objs += kernelcomm.o jobid.o
 
 @SERVER_TRUE@obdclass-all-objs += acl.o
 @SERVER_TRUE@obdclass-all-objs += idmap.o
index 1eda71f..bbb9096 100644 (file)
@@ -98,89 +98,11 @@ EXPORT_SYMBOL(at_extra);
 atomic_long_t obd_dirty_transit_pages;
 EXPORT_SYMBOL(obd_dirty_transit_pages);
 
-char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
-
 #ifdef CONFIG_PROC_FS
 struct lprocfs_stats *obd_memory = NULL;
 EXPORT_SYMBOL(obd_memory);
 #endif
 
-char obd_jobid_node[LUSTRE_JOBID_SIZE + 1];
-
-/* Get jobid of current process by reading the environment variable
- * stored in between the "env_start" & "env_end" of task struct.
- *
- * TODO:
- * It's better to cache the jobid for later use if there is any
- * efficient way, the cl_env code probably could be reused for this
- * purpose.
- *
- * If some job scheduler doesn't store jobid in the "env_start/end",
- * then an upcall could be issued here to get the jobid by utilizing
- * the userspace tools/api. Then, the jobid must be cached.
- */
-int lustre_get_jobid(char *jobid)
-{
-       int jobid_len = LUSTRE_JOBID_SIZE;
-       char tmp_jobid[LUSTRE_JOBID_SIZE] = { 0 };
-       int rc = 0;
-       ENTRY;
-
-       /* Jobstats isn't enabled */
-       if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
-               GOTO(out, rc = 0);
-
-       /* Whole node dedicated to single job */
-       if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
-               memcpy(tmp_jobid, obd_jobid_node, LUSTRE_JOBID_SIZE);
-               GOTO(out, rc = 0);
-       }
-
-       /* Use process name + fsuid as jobid */
-       if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
-               snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u",
-                        current_comm(),
-                        from_kuid(&init_user_ns, current_fsuid()));
-               GOTO(out, rc = 0);
-       }
-
-       rc = cfs_get_environ(obd_jobid_var, tmp_jobid, &jobid_len);
-       if (rc) {
-               if (rc == -EOVERFLOW) {
-                       /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
-                        * variable length strings instead of just numbers), it
-                        * might make sense to keep the unique parts for JobID,
-                        * instead of just returning an error.  That means a
-                        * larger temp buffer for cfs_get_environ(), then
-                        * truncating the string at some separator to fit into
-                        * the specified jobid_len.  Fix later if needed. */
-                       static bool printed;
-                       if (unlikely(!printed)) {
-                               LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
-                                                  "for JobID buffer (%d)\n",
-                                                  obd_jobid_var, jobid_len);
-                               printed = true;
-                       }
-               } else {
-                       CDEBUG((rc == -ENOENT || rc == -EINVAL ||
-                               rc == -EDEADLK) ? D_INFO : D_ERROR,
-                              "Get jobid for (%s) failed: rc = %d\n",
-                              obd_jobid_var, rc);
-               }
-       }
-
-out:
-       if (rc != 0)
-               RETURN(rc);
-
-       /* Only replace the job ID if it changed. */
-       if (strcmp(jobid, tmp_jobid) != 0)
-               memcpy(jobid, tmp_jobid, jobid_len);
-
-       RETURN(0);
-}
-EXPORT_SYMBOL(lustre_get_jobid);
-
 static int class_resolve_dev_name(__u32 len, const char *name)
 {
         int rc;
diff --git a/lustre/obdclass/jobid.c b/lustre/obdclass/jobid.c
new file mode 100644 (file)
index 0000000..8fc4956
--- /dev/null
@@ -0,0 +1,461 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
+ *
+ * Copyright 2017 Cray Inc, all rights reserved.
+ * Author: Ben Evans.
+ *
+ * Store PID->JobID mappings
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <linux/user_namespace.h>
+#ifdef HAVE_UIDGID_HEADER
+#include <linux/uidgid.h>
+#endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+
+static struct cfs_hash *jobid_hash; /* pid->jobid table; NULL until jobid_cache_init() */
+static struct cfs_hash_ops jobid_hash_ops; /* forward declaration */
+spinlock_t jobid_hash_lock; /* taken around jobid_hash pointer swaps and last_delete updates */
+
+#define RESCAN_INTERVAL 30 /* seconds; entries at least this old are re-read on lookup */
+#define DELETE_INTERVAL 300 /* seconds; purge period and maximum age kept by a purge */
+
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; /* jobid source selector */
+char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; /* jobid returned in JOBSTATS_NODELOCAL mode */
+
+/**
+ * Structure to store a single jobID/PID mapping
+ *
+ * Entries are keyed by jp_pid in jobid_hash and are freed by
+ * jobid_put_locked() once jp_refcount drops to zero.
+ */
+struct jobid_to_pid_map {
+       struct hlist_node       jp_hash; /* linkage into jobid_hash */
+       time64_t                jp_time; /* ktime_get_real_seconds() of last refresh, 0 if never */
+       atomic_t                jp_refcount; /* changed via jobid_get()/jobid_put_locked() */
+       spinlock_t              jp_lock; /* protects jp_jobid; jp_time is also accessed under it */
+       char                    jp_jobid[LUSTRE_JOBID_SIZE + 1]; /* cached jobid string */
+       pid_t                   jp_pid; /* hash key */
+};
+
+/* Get jobid of current process by reading the environment variable
+ * stored in between the "env_start" & "env_end" of task struct.
+ *
+ * If some job scheduler doesn't store jobid in the "env_start/end",
+ * then an upcall could be issued here to get the jobid by utilizing
+ * the userspace tools/API. Then, the jobid must be cached.
+ *
+ * \param[in]  jobid_var  name of the environment variable to read
+ * \param[out] jobid      buffer handed to cfs_get_environ() for the value
+ * \param[in]  jobid_len  size of \a jobid in bytes
+ *
+ * \retval 0      the lookup succeeded
+ * \retval other  error returned by cfs_get_environ(); -EOVERFLOW is
+ *                reported on the console once, any other error is
+ *                logged through CDEBUG
+ */
+int get_jobid_from_environ(char *jobid_var, char *jobid, int jobid_len)
+{
+       int rc;
+
+       rc = cfs_get_environ(jobid_var, jobid, &jobid_len);
+       if (!rc)
+               goto out;
+
+       /* Report the variable that was actually looked up (jobid_var),
+        * not the global obd_jobid_var setting. */
+       if (rc == -EOVERFLOW) {
+               /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
+                * variable length strings instead of just numbers), it
+                * might make sense to keep the unique parts for JobID,
+                * instead of just returning an error.  That means a
+                * larger temp buffer for cfs_get_environ(), then
+                * truncating the string at some separator to fit into
+                * the specified jobid_len.  Fix later if needed. */
+               static bool printed;
+               if (unlikely(!printed)) {
+                       LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
+                                          "for JobID buffer (%d)\n",
+                                          jobid_var, jobid_len);
+                       printed = true;
+               }
+       } else {
+               CDEBUG((rc == -ENOENT || rc == -EINVAL ||
+                       rc == -EDEADLK) ? D_INFO : D_ERROR,
+                      "Get jobid for (%s) failed: rc = %d\n",
+                      jobid_var, rc);
+       }
+
+out:
+       return rc;
+}
+
+/*
+ * jobid_should_free_item
+ *
+ * Predicate passed to cfs_hash_cond_del(): decides whether one cached
+ * pid->jobid entry is dropped from the table.  An entry is dropped when
+ * no jobid filter is given (NULL or empty string), when it is older than
+ * DELETE_INTERVAL seconds, or when its jobid equals the filter.
+ * The memory itself is released in jobid_put_locked.
+ *
+ * Returns 1 if item is to be freed, 0 if it is to be kept
+ */
+
+static int jobid_should_free_item(void *obj, void *data)
+{
+       struct jobid_to_pid_map *entry = obj;
+       const char *filter = data;
+       int drop;
+
+       if (entry == NULL)
+               return 0;
+
+       spin_lock(&entry->jp_lock);
+       drop = filter == NULL ||
+              filter[0] == '\0' ||
+              ktime_get_real_seconds() - entry->jp_time > DELETE_INTERVAL ||
+              strcmp(entry->jp_jobid, filter) == 0;
+       spin_unlock(&entry->jp_lock);
+
+       return drop;
+}
+
+/*
+ * check_job_name
+ *
+ * Rejects jobids that start with one of the name prefixes reserved for
+ * Lustre's own processes.
+ *
+ * Returns true if jobid is valid
+ * Returns false if jobid looks like it's a Lustre process
+ */
+static bool check_job_name(char *jobid)
+{
+       const char *const reserved_prefixes[] = {"ll_ping", "ptlrpc",
+                                                "ldlm", "ll_sa", NULL};
+       const char *const *prefix;
+
+       /* the table is NULL-terminated; stop at the sentinel */
+       for (prefix = reserved_prefixes; *prefix != NULL; prefix++) {
+               if (strncmp(jobid, *prefix, strlen(*prefix)) == 0)
+                       return false;
+       }
+       return true;
+}
+
+/*
+ * get_jobid
+ *
+ * Returns the jobid for the current pid.
+ *
+ * If no jobid is found in the table, the jobid is calculated based on
+ * the value of jobid_var, using procname_uid as the default.
+ *
+ * A missing entry is allocated and inserted first.  The cached value is
+ * then refreshed when it is empty or at least RESCAN_INTERVAL seconds
+ * old.  The caller's buffer is only written when the cached jobid is
+ * non-empty; at most LUSTRE_JOBID_SIZE bytes are copied with strncpy().
+ *
+ * NOTE(review): the final cfs_hash_put() presumably pairs with a
+ * reference returned by cfs_hash_lookup()/cfs_hash_findadd_unique() or
+ * taken by the explicit cfs_hash_get() - confirm against libcfs.
+ *
+ * Return: -ENOMEM if allocating a new pidmap fails
+ *         0 for success
+ */
+int get_jobid(char *jobid)
+{
+       pid_t pid = current_pid();
+       struct jobid_to_pid_map *pidmap = NULL;
+       struct jobid_to_pid_map *pidmap2;
+       char tmp_jobid[LUSTRE_JOBID_SIZE + 1];
+       int rc = 0;
+       ENTRY;
+
+       pidmap = cfs_hash_lookup(jobid_hash, &pid);
+       if (pidmap == NULL) {
+               OBD_ALLOC_PTR(pidmap);
+               if (pidmap == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               pidmap->jp_pid = pid;
+               pidmap->jp_time = 0; /* never refreshed yet */
+               pidmap->jp_jobid[0] = '\0'; /* empty jobid forces a refresh below */
+               spin_lock_init(&pidmap->jp_lock);
+               INIT_HLIST_NODE(&pidmap->jp_hash);
+
+               /*
+                * Add the newly created map to the hash, on key collision we
+                * lost a racing addition and must destroy our newly allocated
+                * map.  The object which exists in the hash will be
+                * returned.
+                */
+               pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid,
+                                                 &pidmap->jp_hash);
+               if (unlikely(pidmap != pidmap2)) {
+                       CDEBUG(D_INFO, "Duplicate jobid found\n");
+                       OBD_FREE_PTR(pidmap);
+                       pidmap = pidmap2;
+               } else {
+                       cfs_hash_get(jobid_hash, &pidmap->jp_hash);
+               }
+       }
+
+       spin_lock(&pidmap->jp_lock);
+       if ((ktime_get_real_seconds() - pidmap->jp_time >= RESCAN_INTERVAL) ||
+           pidmap->jp_jobid[0] == '\0') {
+               /* Mark the pidmap as being up to date before dropping the
+                * lock.  If we fail to find a good jobid, revert to the
+                * old time and try again later.  Updating the time first
+                * is meant to prevent a race with deletion. */
+
+               time64_t tmp_time = pidmap->jp_time;
+               pidmap->jp_time = ktime_get_real_seconds();
+
+               spin_unlock(&pidmap->jp_lock); /* the lookup below runs without jp_lock */
+               if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
+                       rc = 1; /* non-zero selects the procname.uid branch below */
+               } else {
+                       memset(tmp_jobid, '\0', LUSTRE_JOBID_SIZE + 1);
+                       rc = get_jobid_from_environ(obd_jobid_var,
+                                                   tmp_jobid,
+                                                   LUSTRE_JOBID_SIZE + 1);
+               }
+
+               /* Use process name + fsuid as jobid default (environment
+                * lookup failed), or when obd_jobid_var is set to
+                * JOBSTATS_PROCNAME_UID */
+               if (rc) {
+                       snprintf(tmp_jobid, LUSTRE_JOBID_SIZE, "%s.%u",
+                                current_comm(),
+                                from_kuid(&init_user_ns, current_fsuid()));
+                       rc = 0;
+               }
+
+               CDEBUG(D_INFO, "Jobid to pid mapping established: %d->%s\n",
+                      pidmap->jp_pid, tmp_jobid);
+
+               spin_lock(&pidmap->jp_lock);
+               if (check_job_name(tmp_jobid))
+                       strncpy(pidmap->jp_jobid, tmp_jobid,
+                               LUSTRE_JOBID_SIZE);
+               else
+                       pidmap->jp_time = tmp_time; /* reserved name: keep old value, retry on a later call */
+       }
+
+       if (strlen(pidmap->jp_jobid) != 0)
+               strncpy(jobid, pidmap->jp_jobid, LUSTRE_JOBID_SIZE);
+
+       spin_unlock(&pidmap->jp_lock);
+
+       cfs_hash_put(jobid_hash, &pidmap->jp_hash);
+
+       EXIT;
+out:
+       return rc;
+}
+
+/*
+ * Sizing of the pid->jobid table, copied from server-side job stats
+ * bucket sizes
+ */
+#define HASH_JOBID_BKT_BITS 5
+#define HASH_JOBID_CUR_BITS 7
+#define HASH_JOBID_MAX_BITS 12
+
+/*
+ * jobid_cache_init
+ *
+ * Creates the pid->jobid hash table and publishes it in jobid_hash.
+ * If a table is already published, the freshly created one is released
+ * again and the existing table is kept.
+ *
+ * Return: 0 if a table is available afterwards, -ENOMEM otherwise
+ */
+int jobid_cache_init(void)
+{
+       struct cfs_hash *new_hash;
+       struct cfs_hash *surplus = NULL;
+       int rc;
+       ENTRY;
+
+       spin_lock_init(&jobid_hash_lock);
+
+       new_hash = cfs_hash_create("JOBID_HASH",
+                                  HASH_JOBID_CUR_BITS,
+                                  HASH_JOBID_MAX_BITS,
+                                  HASH_JOBID_BKT_BITS, 0,
+                                  CFS_HASH_MIN_THETA,
+                                  CFS_HASH_MAX_THETA,
+                                  &jobid_hash_ops,
+                                  CFS_HASH_DEFAULT);
+
+       /* publish under the lock; remember a table that lost the race */
+       spin_lock(&jobid_hash_lock);
+       if (jobid_hash == NULL)
+               jobid_hash = new_hash;
+       else
+               surplus = new_hash;
+       spin_unlock(&jobid_hash_lock);
+
+       if (surplus != NULL)
+               cfs_hash_putref(surplus);
+
+       rc = jobid_hash ? 0 : -ENOMEM;
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(jobid_cache_init);
+
+/*
+ * jobid_cache_fini
+ *
+ * Unpublishes the pid->jobid table, removes every entry from it and
+ * drops the table reference.  Does nothing if no table is published.
+ */
+void jobid_cache_fini(void)
+{
+       struct cfs_hash *old_hash;
+       ENTRY;
+
+       /* detach the table under the lock, tear it down outside of it */
+       spin_lock(&jobid_hash_lock);
+       old_hash = jobid_hash;
+       jobid_hash = NULL;
+       spin_unlock(&jobid_hash_lock);
+
+       if (old_hash == NULL) {
+               EXIT;
+               return;
+       }
+
+       /* a NULL filter makes jobid_should_free_item() match everything */
+       cfs_hash_cond_del(old_hash, jobid_should_free_item, NULL);
+       cfs_hash_putref(old_hash);
+
+       EXIT;
+}
+EXPORT_SYMBOL(jobid_cache_fini);
+
+/*
+ * Hash operations for pid<->jobid
+ */
+
+/* Hash callback: djb2 hash over the bytes of the pid_t that key points to. */
+static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key,
+                            unsigned mask)
+{
+       const pid_t *pid = key;
+
+       return cfs_hash_djb2_hash(pid, sizeof(*pid), mask);
+}
+
+/* Key callback: returns the address of the pid stored in the entry. */
+static void *jobid_key(struct hlist_node *hnode)
+{
+       struct jobid_to_pid_map *entry =
+               hlist_entry(hnode, struct jobid_to_pid_map, jp_hash);
+
+       return &entry->jp_pid;
+}
+
+/* Compare callback: non-zero when the pid at key equals the entry's pid. */
+static int jobid_keycmp(const void *key, struct hlist_node *hnode)
+{
+       const pid_t *wanted = key;
+       const pid_t *stored;
+
+       LASSERT(key != NULL);
+       stored = jobid_key(hnode);
+
+       return *wanted == *stored;
+}
+
+/* Object callback: maps a hash node back to its jobid_to_pid_map. */
+static void *jobid_object(struct hlist_node *hnode)
+{
+       struct jobid_to_pid_map *entry;
+
+       entry = hlist_entry(hnode, struct jobid_to_pid_map, jp_hash);
+       return entry;
+}
+
+/* Get callback: takes one reference on the entry that owns hnode. */
+static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+       struct jobid_to_pid_map *entry =
+               hlist_entry(hnode, struct jobid_to_pid_map, jp_hash);
+
+       atomic_inc(&entry->jp_refcount);
+}
+
+/*
+ * Put callback: drops one reference on the entry that owns hnode and
+ * frees the entry when that was the last reference.  A NULL hnode is
+ * ignored.
+ */
+static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+       struct jobid_to_pid_map *entry;
+
+       if (hnode == NULL)
+               return;
+
+       entry = hlist_entry(hnode, struct jobid_to_pid_map, jp_hash);
+       LASSERT(atomic_read(&entry->jp_refcount) > 0);
+
+       /* somebody else still holds a reference: nothing to free */
+       if (!atomic_dec_and_test(&entry->jp_refcount))
+               return;
+
+       CDEBUG(D_INFO, "Freeing: %d->%s\n",
+              entry->jp_pid, entry->jp_jobid);
+
+       OBD_FREE_PTR(entry);
+}
+
+static struct cfs_hash_ops jobid_hash_ops = { /* callbacks passed to cfs_hash_create() for jobid_hash */
+       .hs_hash        = jobid_hashfn,
+       .hs_keycmp      = jobid_keycmp,
+       .hs_key         = jobid_key,
+       .hs_object      = jobid_object,
+       .hs_get         = jobid_get,
+       .hs_put         = jobid_put_locked, /* same handler as hs_put_locked */
+       .hs_put_locked  = jobid_put_locked,
+};
+
+/*
+ * Return the jobid:
+ *
+ * Based on the value of obd_jobid_var
+ * JOBSTATS_DISABLE:  none (the buffer is zeroed)
+ * JOBSTATS_NODELOCAL:  Contents of obd_jobid_node
+ * JOBSTATS_PROCNAME_UID:  Process name/UID
+ * anything else:  Look up the value in the processes environment
+ * default: JOBSTATS_PROCNAME_UID
+ *
+ * At most once per DELETE_INTERVAL seconds the call also purges cache
+ * entries older than DELETE_INTERVAL before producing the jobid.
+ *
+ * Return: 0, or the error from get_jobid() in the cached modes
+ */
+
+int lustre_get_jobid(char *jobid)
+{
+       static time64_t last_delete;
+       bool purge = false;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(jobid_hash != NULL);
+
+       /* decide under the lock whether this call runs the periodic purge */
+       spin_lock(&jobid_hash_lock);
+       if (last_delete + DELETE_INTERVAL <= ktime_get_real_seconds()) {
+               purge = true;
+               last_delete = ktime_get_real_seconds();
+       }
+       spin_unlock(&jobid_hash_lock);
+
+       /* the filter is not meant to match a real jobid, so only the
+        * age check in jobid_should_free_item() selects entries */
+       if (purge)
+               cfs_hash_cond_del(jobid_hash, jobid_should_free_item,
+                                 "intentionally_bad_jobid");
+
+       if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) {
+               /* Jobstats isn't enabled */
+               memset(jobid, 0, LUSTRE_JOBID_SIZE);
+               RETURN(rc);
+       }
+
+       if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
+               /* Whole node dedicated to single job */
+               memcpy(jobid, obd_jobid_node, LUSTRE_JOBID_SIZE);
+               RETURN(rc);
+       }
+
+       /* Get jobid from hash table */
+       rc = get_jobid(jobid);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_get_jobid);
+
+/*
+ * lustre_jobid_clear
+ *
+ * uses value pushed in via jobid_name
+ * If any entries in the hash table match the value, they are removed
+ *
+ * \param[in] data  jobid to purge, optionally ending in '\n'; only the
+ *                  first LUSTRE_JOBID_SIZE bytes are considered.  An
+ *                  empty value (after trimming) removes every entry,
+ *                  because jobid_should_free_item() treats an empty
+ *                  filter as "match all".
+ *
+ * Does nothing when the cache is not initialized or \a data is NULL.
+ */
+void lustre_jobid_clear(const char *data)
+{
+       /* zero-filled so the copy below is always NUL-terminated:
+        * strncpy() does not terminate when data fills all
+        * LUSTRE_JOBID_SIZE bytes, but the extra last byte stays 0 */
+       char jobid[LUSTRE_JOBID_SIZE + 1] = { 0 };
+       size_t len;
+
+       if (jobid_hash == NULL || data == NULL)
+               return;
+
+       strncpy(jobid, data, LUSTRE_JOBID_SIZE);
+       /* trim \n off the end of the incoming jobid; check the length
+        * first so an empty string never indexes jobid[-1] */
+       len = strlen(jobid);
+       if (len > 0 && jobid[len - 1] == '\n')
+               jobid[len - 1] = '\0';
+
+       CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid);
+       cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid);
+
+       CDEBUG(D_INFO, "%d items remain in jobID table\n",
+              atomic_read(&jobid_hash->hs_count));
+}
index d67aa05..c59c9a7 100644 (file)
@@ -387,6 +387,11 @@ static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr,
        if (!count || count > LUSTRE_JOBID_SIZE)
                return -EINVAL;
 
+       if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) != 0) {
+               lustre_jobid_clear(buffer);
+               return count;
+       }
+
        /* clear previous value */
        memset(obd_jobid_node, 0, LUSTRE_JOBID_SIZE);
 
@@ -538,6 +543,12 @@ int class_procfs_init(void)
                goto out;
        }
 
+       rc = jobid_cache_init();
+       if (rc) {
+               kobject_put(lustre_kobj);
+               goto out;
+       }
+
        debugfs_lustre_root = debugfs_create_dir("lustre", NULL);
        if (IS_ERR_OR_NULL(debugfs_lustre_root)) {
                rc = debugfs_lustre_root ? PTR_ERR(debugfs_lustre_root)
@@ -575,6 +586,7 @@ int class_procfs_clean(void)
        debugfs_remove_recursive(debugfs_lustre_root);
 
        debugfs_lustre_root = NULL;
+       jobid_cache_fini();
 
        if (proc_lustre_root)
                lprocfs_remove(&proc_lustre_root);