4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2014, Intel Corporation.
28 * Copyright 2017 Cray Inc, all rights reserved.
31 * Store PID->JobID mappings
34 #define DEBUG_SUBSYSTEM S_RPC
35 #include <linux/user_namespace.h>
36 #ifdef HAVE_UIDGID_HEADER
37 #include <linux/uidgid.h>
39 #include <linux/utsname.h>
41 #include <libcfs/libcfs.h>
42 #include <obd_support.h>
43 #include <obd_class.h>
44 #include <lustre_net.h>
/* PID->JobID cache table and its callback table (filled in near the end of this file). */
46 static struct cfs_hash *jobid_hash;
47 static struct cfs_hash_ops jobid_hash_ops;
/* Taken around the expiry decision in jobid_get_from_cache() and the table hand-off in jobid_cache_fini(). */
48 spinlock_t jobid_hash_lock;
/* Age in seconds after which a cached entry is re-read from the environment. */
50 #define RESCAN_INTERVAL 30
/* Age in seconds after which a cached entry may be dropped; also paces the expiry scan. */
51 #define DELETE_INTERVAL 300
/* Jobid mode: one of the JOBSTATS_* keywords tested in lustre_get_jobid(), otherwise an environment variable name. */
53 char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
/* Format string expanded by jobid_interpret_string(); the default is "<executable>.<uid>". */
54 char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u";
57 * Structure to store a single PID->JobID mapping
/* One cached PID->JobID entry, linked into jobid_hash through jp_hash. */
59 struct jobid_pid_map {
/* hash linkage; the cfs_hash callbacks recover the entry from it with hlist_entry() */
60 struct hlist_node jp_hash;
62 spinlock_t jp_lock; /* protects jp_jobid */
/* cached jobid text */
63 char jp_jobid[LUSTRE_JOBID_SIZE];
/* cached length reported to callers; 0 marks an entry with no jobid */
64 unsigned int jp_joblen;
70 * Jobid can be set for a session (see setsid(2)) by writing to
71 * a sysfs file from any process in that session.
72 * The jobids are stored in a hash table indexed by the relevant
73 * struct pid. We periodically look for entries where the pid has
74 * no PIDTYPE_SID tasks any more, and prune them. This happens within
75 * 5 seconds of a jobid being added, and every 5 minutes when jobids exist,
/* Prune delays in seconds: the expedited run requested after a jobid is set, and the background period. */
78 #define JOBID_EXPEDITED_CLEAN (5)
79 #define JOBID_BACKGROUND_CLEAN (5 * 60)
/* One per-session jobid record stored in the session_jobids rhashtable. */
81 struct session_jobid {
/* hash key; a reference is taken with get_pid() when the record is created */
82 struct pid *sj_session;
/* rhashtable linkage */
83 struct rhash_head sj_linkage;
/* lets the record be released with kfree_rcu() */
84 struct rcu_head sj_rcu;
/* rhashtable layout: entries are keyed by the value of the sj_session pointer. */
88 static const struct rhashtable_params jobid_params = {
89 .key_len = sizeof(struct pid *),
90 .key_offset = offsetof(struct session_jobid, sj_session),
91 .head_offset = offsetof(struct session_jobid, sj_linkage),
/* Session -> jobid table; set up in jobid_cache_init(), destroyed in jobid_cache_fini(). */
94 static struct rhashtable session_jobids;
97 * jobid_current must be called with rcu_read_lock held.
98 * if it returns non-NULL, the string can only be used
99 * until rcu_read_unlock is called.
/*
 * Look up the jobid record registered for the calling task's session.
 * The session's struct pid pointer is the key into session_jobids.
 */
101 char *jobid_current(void)
103 struct pid *sid = task_session(current);
104 struct session_jobid *sj;
/* the key is the pointer value itself, hence the address of sid is passed */
106 sj = rhashtable_lookup_fast(&session_jobids, &sid, jobid_params);
/* Forward declaration: jobid_set_current() calls this before its definition appears. */
112 static void jobid_prune_expedite(void);
114 * jobid_set_current will try to add a new entry
115 * to the table. If one exists with the same key, the
116 * jobid will be replaced
/*
 * Register @jobid for the calling task's session.
 *
 * A new session_jobid is allocated and inserted into session_jobids.  When
 * the session already has a record, the new one is swapped in with
 * rhashtable_replace_fast() and the old one is released through RCU.  An
 * expedited prune is requested after a successful insert or replace.
 */
118 int jobid_set_current(char *jobid)
121 struct session_jobid *sj, *origsj;
123 int len = strlen(jobid);
/* allocation is sized as the record plus the jobid length */
125 sj = kmalloc(sizeof(*sj) + len, GFP_KERNEL);
129 sid = task_session(current);
/* the record holds its own reference on the session pid */
130 sj->sj_session = get_pid(sid);
/* len + 1 so the terminating NUL is copied too */
131 strncpy(sj->sj_jobid, jobid, len+1);
132 origsj = rhashtable_lookup_get_insert_fast(&session_jobids,
135 if (origsj == NULL) {
136 /* successful insert */
138 jobid_prune_expedite();
/* the insert itself failed: give back the pid reference taken above */
142 if (IS_ERR(origsj)) {
143 put_pid(sj->sj_session);
146 return PTR_ERR(origsj);
/* a record for this session already exists: replace it with the new one */
148 ret = rhashtable_replace_fast(&session_jobids,
153 put_pid(sj->sj_session);
/* release the old record's pid reference and free the record after a grace period */
158 put_pid(origsj->sj_session);
160 kfree_rcu(origsj, sj_rcu);
161 jobid_prune_expedite();
/*
 * Per-entry callback handed to rhashtable_free_and_destroy() in
 * jobid_cache_fini(): drops the entry's reference on its session pid.
 */
166 static void jobid_free(void *vsj, void *arg)
168 struct session_jobid *sj = vsj;
170 put_pid(sj->sj_session);
/* Pruning of session_jobids runs as delayed work. */
174 static void jobid_prune(struct work_struct *work);
175 static DECLARE_DELAYED_WORK(jobid_prune_work, jobid_prune);
/* Non-zero while an expedited prune has been requested and has not run yet. */
176 static int jobid_prune_expedited;
177 static void jobid_prune(struct work_struct *work)
180 struct rhashtable_iter iter;
181 struct session_jobid *sj;
183 jobid_prune_expedited = 0;
184 rhashtable_walk_enter(&session_jobids, &iter);
185 rhashtable_walk_start(&iter);
186 while ((sj = rhashtable_walk_next(&iter)) != NULL) {
187 if (!hlist_empty(&sj->sj_session->tasks[PIDTYPE_SID])) {
191 if (rhashtable_remove_fast(&session_jobids,
193 jobid_params) == 0) {
194 put_pid(sj->sj_session);
195 kfree_rcu(sj, sj_rcu);
198 rhashtable_walk_stop(&iter);
199 rhashtable_walk_exit(&iter);
201 schedule_delayed_work(&jobid_prune_work,
202 cfs_time_seconds(JOBID_BACKGROUND_CLEAN));
/*
 * Pull the next prune forward to JOBID_EXPEDITED_CLEAN seconds from now,
 * unless an expedited run has already been requested since the last prune
 * (jobid_prune() clears the flag when it runs).
 */
205 static void jobid_prune_expedite(void)
207 if (!jobid_prune_expedited) {
208 jobid_prune_expedited = 1;
209 mod_delayed_work(system_wq, &jobid_prune_work,
210 cfs_time_seconds(JOBID_EXPEDITED_CLEAN));
215 * Get jobid of current process by reading the environment variable
216 * stored in between the "env_start" & "env_end" of task struct.
218 * If some job scheduler doesn't store jobid in the "env_start/end",
219 * then an upcall could be issued here to get the jobid by utilizing
220 * the userspace tools/API. Then, the jobid must be cached.
/*
 * Read the environment variable @jobid_var of the calling process into
 * @jobid through cfs_get_environ(); @jobid_len is handed to that helper
 * unchanged.
 *
 * An over-long value (-EOVERFLOW) produces a console warning at most once
 * per 24 hours.  The CDEBUG_LIMIT call logs a failure at D_INFO for -ENOENT,
 * -EINVAL and -EDEADLK, and at D_ERROR for any other code.
 *
 * NOTE(review): the overflow warning prints the global obd_jobid_var rather
 * than the @jobid_var argument; the caller in this file passes
 * obd_jobid_var, so the two agree there.
 */
222 int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len)
226 rc = cfs_get_environ(jobid_var, jobid, jobid_len);
230 if (rc == -EOVERFLOW) {
231 /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
232 * variable length strings instead of just numbers), it
233 * might make sense to keep the unique parts for JobID,
234 * instead of just returning an error. That means a
235 * larger temp buffer for cfs_get_environ(), then
236 * truncating the string at some separator to fit into
237 * the specified jobid_len. Fix later if needed. */
/* time of the last warning; zero until the first one has been printed */
238 static ktime_t printed;
/* warn on first occurrence, then again only after 24 hours have passed */
240 if (unlikely(ktime_to_ns(printed) == 0 ||
241 ktime_after(ktime_get(),
242 ktime_add_ns(printed,
243 3600*24*NSEC_PER_SEC)))) {
244 LCONSOLE_WARN("jobid: '%s' value too large (%d)\n",
245 obd_jobid_var, *jobid_len);
246 printed = ktime_get();
/* expected lookup failures are logged quietly, anything else as an error */
251 CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL ||
252 rc == -EDEADLK) ? D_INFO : D_ERROR,
253 "jobid: get '%s' failed: rc = %d\n",
262 * jobid_should_free_item
264 * Each item is checked to see if it should be released
265 * Removed from hash table by caller
266 * Actually freed in jobid_put_locked
268 * Returns 1 if item is to be freed, 0 if it is to be kept
/*
 * Predicate passed to cfs_hash_cond_del(): decide whether the cached entry
 * @obj should be dropped.  Callers in this file pass a jobid string or NULL
 * as @data.
 *
 * Under jp_lock the tests are, in order: an empty match string with the
 * entry referenced exactly once, an entry older than DELETE_INTERVAL
 * seconds, or an entry whose cached jobid equals the match string.
 */
271 static int jobid_should_free_item(void *obj, void *data)
274 struct jobid_pid_map *pidmap = obj;
/* complain once if the entry's reference count is not exactly 1 here */
281 WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1);
285 spin_lock(&pidmap->jp_lock);
286 /* prevent newly inserted items from being deleted */
287 if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1)
289 else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL)
291 else if (strcmp(pidmap->jp_jobid, jobid) == 0)
293 spin_unlock(&pidmap->jp_lock);
299 * jobid_name_is_valid
301 * Checks whether the name looks like it belongs to a Lustre-internal process
303 * Returns true if jobid is valid
304 * Returns false if jobid looks like it's a Lustre process
/*
 * Screen @jobid against a fixed list of reserved Lustre name prefixes.
 * The comparison is a prefix match (strncmp over the reserved word's own
 * length), so any name that merely starts with e.g. "ptlrpc" matches.
 * An empty name is tested separately before the loop.
 */
306 static bool jobid_name_is_valid(char *jobid)
/* NULL-terminated list of reserved prefixes */
308 const char *const lustre_reserved[] = { "ll_ping", "ptlrpc",
309 "ldlm", "ll_sa", NULL };
312 if (jobid[0] == '\0')
315 for (i = 0; lustre_reserved[i] != NULL; i++) {
316 if (strncmp(jobid, lustre_reserved[i],
317 strlen(lustre_reserved[i])) == 0)
324 * jobid_get_from_cache()
326 * Returns contents of jobid_var from process environment for current PID,
327 * or from the per-session jobid table.
328 * Values fetched from the process environment will be cached for some time to avoid
329 * the overhead of scanning the environment.
331 * Return: -ENOMEM if allocating a new pidmap fails
332 * -ENOENT if no entry could be found
333 * +ve string length for success (something was returned in jobid)
/*
 * Fetch the jobid of the calling process into @jobid (at most @joblen bytes).
 *
 * In JOBSTATS_SESSION mode the value comes from jobid_current().  Otherwise
 * the PID->JobID hash is consulted: a missing entry is created, an entry
 * older than RESCAN_INTERVAL seconds is refreshed from the environment
 * variable named by obd_jobid_var, and the cached text is copied out.
 * About every DELETE_INTERVAL seconds the hash is also scanned for entries
 * to drop.
 *
 * Return: a negative rc if one was recorded, otherwise the jobid length.
 */
335 static int jobid_get_from_cache(char *jobid, size_t joblen)
/* wall-clock time of the last expiry scan, shared by all callers */
337 static time64_t last_expire;
338 bool expire_cache = false;
339 pid_t pid = current_pid();
340 struct jobid_pid_map *pidmap = NULL;
341 time64_t now = ktime_get_real_seconds();
/* per-session mode: answer from the session table, not the PID cache */
345 if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) {
349 jid = jobid_current();
351 strlcpy(jobid, jid, joblen);
352 joblen = strlen(jobid);
360 LASSERT(jobid_hash != NULL);
362 /* scan hash periodically to remove old PID entries from cache */
363 spin_lock(&jobid_hash_lock);
364 if (unlikely(last_expire + DELETE_INTERVAL <= now)) {
368 spin_unlock(&jobid_hash_lock);
/* the match string is deliberately one no real entry should carry */
371 cfs_hash_cond_del(jobid_hash, jobid_should_free_item,
372 "intentionally_bad_jobid");
374 /* first try to find PID in the hash and use that value */
375 pidmap = cfs_hash_lookup(jobid_hash, &pid);
376 if (pidmap == NULL) {
377 struct jobid_pid_map *pidmap2;
379 OBD_ALLOC_PTR(pidmap);
381 GOTO(out, rc = -ENOMEM);
383 pidmap->jp_pid = pid;
385 pidmap->jp_jobid[0] = '\0';
386 spin_lock_init(&pidmap->jp_lock);
387 INIT_HLIST_NODE(&pidmap->jp_hash);
389 * @pidmap might be reclaimed just after it is added into
390 * hash list, init @jp_refcount as 1 to make sure memory
391 * could be not freed during access.
393 atomic_set(&pidmap->jp_refcount, 1);
396 * Add the newly created map to the hash, on key collision we
397 * lost a racing addition and must destroy our newly allocated
398 * map. The object which exists in the hash will be returned.
400 pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid,
402 if (unlikely(pidmap != pidmap2)) {
403 CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n",
405 OBD_FREE_PTR(pidmap);
411 * If pidmap is old (this is always true for new entries) refresh it.
412 * If obd_jobid_var is not found, cache empty entry and try again
413 * later, to avoid repeat lookups for PID if obd_jobid_var missing.
415 spin_lock(&pidmap->jp_lock);
416 if (pidmap->jp_time + RESCAN_INTERVAL <= now) {
417 char env_jobid[LUSTRE_JOBID_SIZE] = "";
418 int env_len = sizeof(env_jobid);
/* stamp the entry before the lookup */
420 pidmap->jp_time = now;
/* jp_lock is released around the environment lookup and re-taken afterwards */
422 spin_unlock(&pidmap->jp_lock);
423 rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len);
425 CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n",
426 pidmap->jp_pid, env_jobid);
427 spin_lock(&pidmap->jp_lock);
429 pidmap->jp_joblen = env_len;
430 strlcpy(pidmap->jp_jobid, env_jobid,
431 sizeof(pidmap->jp_jobid));
433 } else if (rc == -ENOENT) {
434 /* It might have been deleted, clear out old entry */
435 pidmap->jp_joblen = 0;
436 pidmap->jp_jobid[0] = '\0';
441 * Regardless of how pidmap was found, if it contains a valid entry
442 * use that for now. If there was a technical error (e.g. -ENOMEM)
443 * use the old cached value until it can be looked up again properly.
444 * If a cached missing entry was found, return -ENOENT.
446 if (pidmap->jp_joblen) {
447 strlcpy(jobid, pidmap->jp_jobid, joblen);
/* from here on joblen carries the cached length, which becomes the return value */
448 joblen = pidmap->jp_joblen;
453 spin_unlock(&pidmap->jp_lock);
/* drop the reference on the entry now that its text has been copied out */
455 cfs_hash_put(jobid_hash, &pidmap->jp_hash);
459 return rc < 0 ? rc : joblen;
463 * jobid_interpret_string()
465 * Interpret the jobfmt string to expand specified fields, like coredumps do:
469 * %j = jobid from environment
473 * Unknown escape strings are dropped. Other characters are copied through,
474 * excluding whitespace (to avoid making jobid parsing difficult).
476 * Return: -EOVERFLOW if the expanded string does not fit within @joblen
/*
 * Expand the format string @jobfmt into @jobid.
 *
 * Whitespace in the format is skipped.  A '%' introduces one of the escapes
 * handled by the switch below: %e executable name, %g group ID, %h hostname,
 * %j jobid from jobid_get_from_cache(), %p process ID, %u user ID.
 * Expansion stops when the format ends or fewer than two bytes of room
 * remain; the result is then NUL-terminated.
 *
 * Return: -EOVERFLOW when the remaining length has gone negative, else 0.
 */
479 static int jobid_interpret_string(const char *jobfmt, char *jobid,
484 while ((c = *jobfmt++) && joblen > 1) {
488 if (isspace(c)) /* Don't allow embedded spaces */
/* consume the character after '%' and dispatch on it */
498 switch ((f = *jobfmt++)) {
499 case 'e': /* executable name */
500 l = snprintf(jobid, joblen, "%s", current_comm());
502 case 'g': /* group ID */
503 l = snprintf(jobid, joblen, "%u",
504 from_kgid(&init_user_ns, current_fsgid()));
506 case 'h': /* hostname */
507 l = snprintf(jobid, joblen, "%s",
508 init_utsname()->nodename);
510 case 'j': /* jobid stored in process environment */
511 l = jobid_get_from_cache(jobid, joblen);
515 case 'p': /* process ID */
516 l = snprintf(jobid, joblen, "%u", current_pid());
518 case 'u': /* user ID */
519 l = snprintf(jobid, joblen, "%u",
520 from_kuid(&init_user_ns, current_fsuid()));
522 case '\0': /* '%' at end of format string */
525 default: /* drop unknown %x format strings */
533 * This points at the end of the buffer, so long as jobid is always
534 * incremented the same amount as joblen is decremented.
/* always terminate the output */
537 jobid[joblen - 1] = '\0';
539 return joblen < 0 ? -EOVERFLOW : 0;
543 * Hash initialization, copied from server-side job stats bucket sizes
/*
 * Sizing arguments for the cfs_hash_create() call in jobid_cache_init();
 * presumably bucket, initial and maximum bit counts, going by the names.
 */
545 #define HASH_JOBID_BKT_BITS 5
546 #define HASH_JOBID_CUR_BITS 7
547 #define HASH_JOBID_MAX_BITS 12
/*
 * Set up both jobid stores: the PID->JobID cfs_hash (sized by the
 * HASH_JOBID_* constants, using jobid_hash_ops) and the per-session
 * rhashtable session_jobids.
 */
549 int jobid_cache_init(void)
557 spin_lock_init(&jobid_hash_lock);
558 jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS,
559 HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS,
560 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
561 &jobid_hash_ops, CFS_HASH_DEFAULT);
565 rc = rhashtable_init(&session_jobids, &jobid_params);
/* presumably the rhashtable_init() failure path: give the cfs_hash back */
567 cfs_hash_putref(jobid_hash);
574 EXPORT_SYMBOL(jobid_cache_init);
/*
 * Tear down the jobid stores.  The hash pointer is copied out under
 * jobid_hash_lock, pending prune work is cancelled synchronously, all cached
 * PID entries are deleted (NULL match data), and the session table is
 * destroyed with jobid_free() called for each entry.
 */
576 void jobid_cache_fini(void)
578 struct cfs_hash *tmp_hash;
581 spin_lock(&jobid_hash_lock);
582 tmp_hash = jobid_hash;
584 spin_unlock(&jobid_hash_lock);
/* wait for a running prune to finish and stop it re-arming before the table goes away */
586 cancel_delayed_work_sync(&jobid_prune_work);
588 if (tmp_hash != NULL) {
589 cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL);
590 cfs_hash_putref(tmp_hash);
592 rhashtable_free_and_destroy(&session_jobids, jobid_free, NULL);
598 EXPORT_SYMBOL(jobid_cache_fini);
601 * Hash operations for pid<->jobid
/* cfs_hash hash callback: djb2 hash over the sizeof(pid_t) bytes at @key, combined with the mask. */
603 static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key,
606 return cfs_hash_djb2_hash(key, sizeof(pid_t), mask);
/* Return a pointer to the PID key stored in the entry that embeds @hnode. */
609 static void *jobid_key(struct hlist_node *hnode)
611 struct jobid_pid_map *pidmap;
613 pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
614 return &pidmap->jp_pid;
/*
 * cfs_hash key comparison: non-zero when the pid_t at @key equals the PID
 * stored in the entry that embeds @hnode.  @key must not be NULL.
 */
617 static int jobid_keycmp(const void *key, struct hlist_node *hnode)
619 const pid_t *pid_key1;
620 const pid_t *pid_key2;
622 LASSERT(key != NULL);
623 pid_key1 = (pid_t *)key;
624 pid_key2 = (pid_t *)jobid_key(hnode);
626 return *pid_key1 == *pid_key2;
/* Map a hash node back to the jobid_pid_map that embeds it. */
629 static void *jobid_object(struct hlist_node *hnode)
631 return hlist_entry(hnode, struct jobid_pid_map, jp_hash);
/* Take one reference on the entry that embeds @hnode. */
634 static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode)
636 struct jobid_pid_map *pidmap;
638 pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
640 atomic_inc(&pidmap->jp_refcount);
/*
 * Drop one reference on the entry that embeds @hnode and free the entry when
 * the count reaches zero.  Installed as both hs_put and hs_put_locked in
 * jobid_hash_ops.
 */
643 static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
645 struct jobid_pid_map *pidmap;
650 pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
/* a put without a matching reference would be a bug */
651 LASSERT(atomic_read(&pidmap->jp_refcount) > 0);
652 if (atomic_dec_and_test(&pidmap->jp_refcount)) {
653 CDEBUG(D_INFO, "Freeing: %d->%s\n",
654 pidmap->jp_pid, pidmap->jp_jobid);
656 OBD_FREE_PTR(pidmap);
/* Callback table that wires the PID-keyed helpers of this file into jobid_hash. */
660 static struct cfs_hash_ops jobid_hash_ops = {
661 .hs_hash = jobid_hashfn,
662 .hs_keycmp = jobid_keycmp,
664 .hs_object = jobid_object,
/* the same routine serves the locked and unlocked put paths */
666 .hs_put = jobid_put_locked,
667 .hs_put_locked = jobid_put_locked,
671 * Generate the job identifier string for this process for tracking purposes.
673 * Fill in @jobid string based on the value of obd_jobid_var:
674 * JOBSTATS_DISABLE: none
675 * JOBSTATS_NODELOCAL: content of obd_jobid_name (jobid_interpret_string())
676 * JOBSTATS_PROCNAME_UID: process name/UID
677 * JOBSTATS_SESSION per-session value set by
678 * /sys/fs/lustre/jobid_this_session
679 * anything else: look up obd_jobid_var in the process's environment
681 * Return -ve error number, 0 on success.
683 int lustre_get_jobid(char *jobid, size_t joblen)
688 if (unlikely(joblen < 2)) {
694 if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) {
695 /* Jobstats isn't enabled */
696 memset(jobid, 0, joblen);
697 } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
698 /* Whole node dedicated to single job */
699 rc = jobid_interpret_string(obd_jobid_name, jobid, joblen);
700 } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
701 rc = jobid_interpret_string("%e.%u", jobid, joblen);
702 } else if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) {
706 jid = jobid_current();
708 strlcpy(jobid, jid, sizeof(jobid));
710 } else if (jobid_name_is_valid(current_comm())) {
712 * obd_jobid_var holds the jobid environment variable name.
713 * Skip initial check if obd_jobid_name already uses "%j",
714 * otherwise try just "%j" first, then fall back to whatever
715 * is in obd_jobid_name if obd_jobid_var is not found.
718 if (!strnstr(obd_jobid_name, "%j", joblen))
719 rc = jobid_get_from_cache(jobid, joblen);
721 /* fall back to jobid_node if jobid_var not in environment */
723 int rc2 = jobid_interpret_string(obd_jobid_name,
732 EXPORT_SYMBOL(lustre_get_jobid);
737 * Search cache for JobID given by @find_jobid.
738 * If any entries in the hash table match the value, they are removed
/*
 * Remove from the PID->JobID cache the entries selected by
 * jobid_should_free_item() for @find_jobid.
 *
 * The name is first copied into a local LUSTRE_JOBID_SIZE buffer so it can
 * be edited.  Nothing is done when the cache has not been created.
 */
740 void lustre_jobid_clear(const char *find_jobid)
742 char jobid[LUSTRE_JOBID_SIZE];
745 if (jobid_hash == NULL)
748 strlcpy(jobid, find_jobid, sizeof(jobid));
749 /* trim \n off the end of the incoming jobid */
/* strchr() locates the first newline in the copy */
750 end = strchr(jobid, '\n');
751 if (end && *end == '\n')
754 CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid);
755 cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid);
757 CDEBUG(D_INFO, "%d items remain in jobID table\n",
758 atomic_read(&jobid_hash->hs_count));