4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2014, Intel Corporation.
28 * Copyright 2017 Cray Inc, all rights reserved.
31 * Store PID->JobID mappings
34 #define DEBUG_SUBSYSTEM S_RPC
35 #include <linux/user_namespace.h>
36 #include <linux/uidgid.h>
37 #include <linux/utsname.h>
39 #include <libcfs/libcfs.h>
40 #include <obd_support.h>
41 #include <obd_class.h>
42 #include <lustre_net.h>
/* hash table caching PID->jobid lookups, its (forward-declared) ops,
 * and the spinlock serializing cache-wide maintenance */
44 static struct cfs_hash *jobid_hash;
45 static struct cfs_hash_ops jobid_hash_ops;
46 spinlock_t jobid_hash_lock;
/* seconds before a cached PID entry is re-read from the environment */
48 #define RESCAN_INTERVAL 30
/* seconds before an unused PID entry is eligible for deletion */
49 #define DELETE_INTERVAL 300
/* name of the environment variable to read jobids from;
 * JOBSTATS_DISABLE means jobstats is turned off */
51 char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
/* jobid template expanded by jobid_interpret_string() (%e=comm, %u=uid) */
52 char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u";
55 * Structure to store a single PID->JobID mapping
/* NOTE(review): later code also references jp_pid, jp_time and
 * jp_refcount members of this struct; they are declared on lines not
 * visible in this view. */
57 struct jobid_pid_map {
	/* linkage into jobid_hash (keyed by PID) */
58 struct hlist_node jp_hash;
60 spinlock_t jp_lock; /* protects jp_jobid */
	/* cached jobid string and its length; empty string caches "not found" */
61 char jp_jobid[LUSTRE_JOBID_SIZE];
62 unsigned int jp_joblen;
68 * Jobid can be set for a session (see setsid(2)) by writing to
69 * a sysfs file from any process in that session.
70 * The jobids are stored in a hash table indexed by the relevant
71 * struct pid. We periodically look for entries where the pid has
72 * no PIDTYPE_SID tasks any more, and prune them. This happens within
73 * 5 seconds of a jobid being added, and every 5 minutes when jobids exist,
/* delay (seconds) for the expedited prune pass after a jobid is added */
76 #define JOBID_EXPEDITED_CLEAN (5)
/* period (seconds) of the background prune while entries may exist */
77 #define JOBID_BACKGROUND_CLEAN (5 * 60)
/* one session->jobid mapping; the jobid string storage itself is
 * declared on a line not visible in this view (presumably a flexible
 * array member sj_jobid — see the sizeof(*sj) + len allocation in
 * jobid_set_current) */
79 struct session_jobid {
	/* referenced session pid, used as the hash key */
80 struct pid *sj_session;
81 struct rhash_head sj_linkage;
	/* for kfree_rcu() deferred freeing */
82 struct rcu_head sj_rcu;
/* rhashtable keyed on the struct pid pointer value itself */
86 static const struct rhashtable_params jobid_params = {
87 .key_len = sizeof(struct pid *),
88 .key_offset = offsetof(struct session_jobid, sj_session),
89 .head_offset = offsetof(struct session_jobid, sj_linkage),
92 static struct rhashtable session_jobids;
95 * jobid_current must be called with rcu_read_lock held.
96 * if it returns non-NULL, the string can only be used
97 * until rcu_read_unlock is called.
/* Look up the jobid registered for the current task's session, if any. */
99 char *jobid_current(void)
101 struct pid *sid = task_session(current);
102 struct session_jobid *sj;
	/* lockless lookup; result remains valid only under caller's RCU lock */
104 sj = rhashtable_lookup_fast(&session_jobids, &sid, jobid_params);
110 static void jobid_prune_expedite(void);
112 * jobid_set_current will try to add a new entry
113 * to the table. If one exists with the same key, the
114 * jobid will be replaced
/*
 * Returns 0 on success or a negative errno (allocation failure, or an
 * error from the rhashtable insert/replace).
 */
116 int jobid_set_current(char *jobid)
119 struct session_jobid *sj, *origsj;
121 int len = strlen(jobid);
	/* +len: extra space for the jobid string stored inline in the entry */
123 sj = kmalloc(sizeof(*sj) + len, GFP_KERNEL);
127 sid = task_session(current);
	/* hold a reference on the session pid for the lifetime of the entry */
128 sj->sj_session = get_pid(sid);
129 strncpy(sj->sj_jobid, jobid, len+1);
	/* insert, or get the existing entry back if the key is already present */
130 origsj = rhashtable_lookup_get_insert_fast(&session_jobids,
133 if (origsj == NULL) {
134 /* successful insert */
	/* schedule an early prune so dead sessions are cleaned up soon */
136 jobid_prune_expedite();
140 if (IS_ERR(origsj)) {
	/* insert failed outright: undo the pid reference and bail out */
141 put_pid(sj->sj_session);
144 return PTR_ERR(origsj);
	/* key exists: atomically swap the old entry for the new one */
146 ret = rhashtable_replace_fast(&session_jobids,
151 put_pid(sj->sj_session);
	/* drop the replaced entry's pid ref; free it after an RCU grace period */
156 put_pid(origsj->sj_session);
158 kfree_rcu(origsj, sj_rcu);
159 jobid_prune_expedite();
/* rhashtable_free_and_destroy() callback: release the entry's pid
 * reference before the table frees the entry itself */
164 static void jobid_free(void *vsj, void *arg)
166 struct session_jobid *sj = vsj;
168 put_pid(sj->sj_session);
172 static void jobid_prune(struct work_struct *work);
173 static DECLARE_DELAYED_WORK(jobid_prune_work, jobid_prune);
/* non-zero while an expedited prune pass is already pending */
174 static int jobid_prune_expedited;
/*
 * Delayed-work handler: walk session_jobids and remove entries whose
 * session no longer has any PIDTYPE_SID tasks, then re-arm the slow
 * background scan.
 */
175 static void jobid_prune(struct work_struct *work)
178 struct rhashtable_iter iter;
179 struct session_jobid *sj;
181 jobid_prune_expedited = 0;
182 rhashtable_walk_enter(&session_jobids, &iter);
183 rhashtable_walk_start(&iter);
184 while ((sj = rhashtable_walk_next(&iter)) != NULL) {
	/* -EAGAIN means the table was resized during the walk; keep going */
186 if (PTR_ERR(sj) == -EAGAIN)
	/* session still has live tasks: keep this entry */
190 if (!hlist_empty(&sj->sj_session->tasks[PIDTYPE_SID])) {
	/* dead session: remove; only free if we won the removal race */
194 if (rhashtable_remove_fast(&session_jobids,
196 jobid_params) == 0) {
197 put_pid(sj->sj_session);
198 kfree_rcu(sj, sj_rcu);
201 rhashtable_walk_stop(&iter);
202 rhashtable_walk_exit(&iter);
	/* re-schedule the periodic background cleaning pass */
204 schedule_delayed_work(&jobid_prune_work,
205 cfs_time_seconds(JOBID_BACKGROUND_CLEAN));
/* Pull the next prune pass forward so it runs within a few seconds,
 * unless an expedited pass is already scheduled. */
208 static void jobid_prune_expedite(void)
210 if (!jobid_prune_expedited) {
211 jobid_prune_expedited = 1;
212 mod_delayed_work(system_wq, &jobid_prune_work,
213 cfs_time_seconds(JOBID_EXPEDITED_CLEAN));
/*
 * Copy up to @len bytes between @buf and the address space of @mm
 * (direction chosen by @write), one pinned page at a time.  Returns
 * the number of bytes actually transferred.  The address parameter is
 * declared on a line not visible in this view.
 */
217 static int cfs_access_process_vm(struct task_struct *tsk,
218 struct mm_struct *mm,
220 void *buf, int len, int write)
222 /* Just copied from kernel for the kernels which doesn't
223 * have access_process_vm() exported
225 struct vm_area_struct *vma;
229 /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
230 * which is already holding mmap_sem for writes. If some other
231 * thread gets the write lock in the meantime, this thread will
232 * block, but at least it won't deadlock on itself. LU-1735
234 if (!mmap_read_trylock(mm))
237 /* ignore errors, just check how much was successfully transferred */
239 int bytes, rc, offset;
	/* pin one target page; the GUP API signature varies by kernel version */
242 #if defined(HAVE_GET_USER_PAGES_GUP_FLAGS)
243 rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page,
245 #elif defined(HAVE_GET_USER_PAGES_6ARG)
246 rc = get_user_pages(addr, 1, write, 1, &page, &vma);
248 rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma);
	/* clamp the chunk so it never crosses the pinned page boundary */
254 offset = addr & (PAGE_SIZE-1);
255 if (bytes > PAGE_SIZE-offset)
256 bytes = PAGE_SIZE-offset;
260 copy_to_user_page(vma, page, addr,
261 maddr + offset, buf, bytes);
262 set_page_dirty_lock(page);
264 copy_from_user_page(vma, page, addr,
265 buf, maddr + offset, bytes);
273 mmap_read_unlock(mm);
	/* bytes copied equals how far buf advanced from its starting value */
275 return buf - old_buf;
278 /* Read the environment variable of current process specified by @key. */
/*
 * @key:     NUL-terminated variable name to search for
 * @value:   output buffer for the value (without the "key=" prefix)
 * @val_len: in: capacity of @value; out: length of the value found
 *
 * Scans the env_start..env_end region of the current process one
 * buffer at a time, parsing "key=value\0" entries.  Returns 0 on
 * success, -ENOENT if the key is absent, -EOVERFLOW if @value is too
 * small (a truncated copy is still made), or another negative errno.
 */
279 static int cfs_get_environ(const char *key, char *value, int *val_len)
281 struct mm_struct *mm;
283 int buf_len = PAGE_SIZE;
284 int key_len = strlen(key);
290 buffer = kmalloc(buf_len, GFP_USER);
294 mm = get_task_mm(current);
300 addr = mm->env_start;
301 while (addr < mm->env_end) {
302 int this_len, retval, scan_len;
303 char *env_start, *env_end;
305 memset(buffer, 0, buf_len);
	/* read the next chunk of the environment area into our buffer */
307 this_len = min_t(int, mm->env_end - addr, buf_len);
308 retval = cfs_access_process_vm(current, mm, addr, buffer,
311 GOTO(out, rc = retval);
312 else if (retval != this_len)
317 /* Parse the buffer to find out the specified key/value pair.
318 * The "key=value" entries are separated by '\0'.
	/* locate the terminator of the current entry within the chunk */
326 env_end = memscan(env_start, '\0', scan_len);
327 LASSERT(env_end >= env_start &&
328 env_end <= env_start + scan_len);
330 /* The last entry of this buffer cross the buffer
331 * boundary, reread it in next cycle.
333 if (unlikely(env_end - env_start == scan_len)) {
334 /* Just skip the entry larger than page size,
335 * it can't be jobID env variable.
337 if (unlikely(scan_len == this_len))
342 } else if (unlikely(skip)) {
347 entry_len = env_end - env_start;
348 CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry);
350 /* Key length + length of '=' */
351 if (entry_len > key_len + 1 &&
352 entry[key_len] == '=' &&
353 !memcmp(entry, key, key_len)) {
	/* matched: step past "key=" to the value proper */
354 entry += key_len + 1;
355 entry_len -= key_len + 1;
357 /* The 'value' buffer passed in is too small.
358 * Copy what fits, but return -EOVERFLOW.
360 if (entry_len >= *val_len) {
361 memcpy(value, entry, *val_len);
362 value[*val_len - 1] = 0;
363 GOTO(out, rc = -EOVERFLOW);
366 memcpy(value, entry, entry_len);
367 *val_len = entry_len;
	/* advance to the next "key=value" entry in this chunk */
371 scan_len -= (env_end - env_start + 1);
372 env_start = env_end + 1;
375 GOTO(out, rc = -ENOENT);
379 kfree((void *)buffer);
384 * Get jobid of current process by reading the environment variable
385 * stored in between the "env_start" & "env_end" of task struct.
387 * If some job scheduler doesn't store jobid in the "env_start/end",
388 * then an upcall could be issued here to get the jobid by utilizing
389 * the userspace tools/API. Then, the jobid must be cached.
/*
 * @jobid_var: environment variable name to look up
 * @jobid:     output buffer for the jobid value
 * @jobid_len: in/out buffer capacity / value length (see cfs_get_environ)
 *
 * Thin wrapper around cfs_get_environ() that rate-limits the
 * "value too large" warning and demotes expected failures to D_INFO.
 */
391 int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len)
395 rc = cfs_get_environ(jobid_var, jobid, jobid_len);
399 if (rc == -EOVERFLOW) {
400 /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
401 * variable length strings instead of just numbers), it
402 * might make sense to keep the unique parts for JobID,
403 * instead of just returning an error. That means a
404 * larger temp buffer for cfs_get_environ(), then
405 * truncating the string at some separator to fit into
406 * the specified jobid_len. Fix later if needed. */
407 static ktime_t printed;
	/* warn at most once per 24 hours about oversized jobid values */
409 if (unlikely(ktime_to_ns(printed) == 0 ||
410 ktime_after(ktime_get(),
411 ktime_add_ns(printed,
412 3600ULL * 24 * NSEC_PER_SEC)))) {
413 LCONSOLE_WARN("jobid: '%s' value too large (%d)\n",
414 obd_jobid_var, *jobid_len);
415 printed = ktime_get();
	/* expected lookup failures are D_INFO; anything else is D_ERROR */
420 CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL ||
421 rc == -EDEADLK) ? D_INFO : D_ERROR,
422 "jobid: get '%s' failed: rc = %d\n",
431 * jobid_should_free_item
433 * Each item is checked to see if it should be released
434 * Removed from hash table by caller
435 * Actually freed in jobid_put_locked
437 * Returns 1 if item is to be freed, 0 if it is to be kept
/*
 * cfs_hash_cond_del() predicate.  @data is the jobid string to match
 * ("" appears to mean "free all unreferenced entries"); entries older
 * than DELETE_INTERVAL are freed regardless of their jobid.
 */
440 static int jobid_should_free_item(void *obj, void *data)
443 struct jobid_pid_map *pidmap = obj;
450 WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1);
454 spin_lock(&pidmap->jp_lock);
455 /* prevent newly inserted items from deleting */
456 if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1)
458 else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL)
460 else if (strcmp(pidmap->jp_jobid, jobid) == 0)
462 spin_unlock(&pidmap->jp_lock);
468 * jobid_name_is_valid
470 * Checks if the jobid is a Lustre process
472 * Returns true if jobid is valid
473 * Returns false if jobid looks like it's a Lustre process
/* i.e. reject names matching well-known Lustre kernel thread prefixes,
 * so internal service threads never get a user-style jobid */
475 static bool jobid_name_is_valid(char *jobid)
477 const char *const lustre_reserved[] = { "ll_ping", "ptlrpc",
478 "ldlm", "ll_sa", NULL };
	/* empty names are not valid jobids */
481 if (jobid[0] == '\0')
	/* prefix-match against each reserved Lustre thread name */
484 for (i = 0; lustre_reserved[i] != NULL; i++) {
485 if (strncmp(jobid, lustre_reserved[i],
486 strlen(lustre_reserved[i])) == 0)
493 * jobid_get_from_cache()
495 * Returns contents of jobid_var from process environment for current PID,
496 * or from the per-session jobid table.
497 * Values fetch from process environment will be cached for some time to avoid
498 * the overhead of scanning the environment.
500 * Return: -ENOMEM if allocating a new pidmap fails
501 * -ENOENT if no entry could be found
502 * +ve string length for success (something was returned in jobid)
504 static int jobid_get_from_cache(char *jobid, size_t joblen)
506 static time64_t last_expire;
507 bool expire_cache = false;
508 pid_t pid = current->pid;
509 struct jobid_pid_map *pidmap = NULL;
510 time64_t now = ktime_get_real_seconds();
	/* per-session mode bypasses the PID cache entirely */
514 if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) {
518 jid = jobid_current();
520 strlcpy(jobid, jid, joblen);
521 joblen = strlen(jobid);
529 LASSERT(jobid_hash != NULL);
531 /* scan hash periodically to remove old PID entries from cache */
532 spin_lock(&jobid_hash_lock);
533 if (unlikely(last_expire + DELETE_INTERVAL <= now)) {
537 spin_unlock(&jobid_hash_lock);
	/* the deliberately-unmatchable jobid makes only the age check apply */
540 cfs_hash_cond_del(jobid_hash, jobid_should_free_item,
541 "intentionally_bad_jobid");
543 /* first try to find PID in the hash and use that value */
544 pidmap = cfs_hash_lookup(jobid_hash, &pid);
545 if (pidmap == NULL) {
546 struct jobid_pid_map *pidmap2;
548 OBD_ALLOC_PTR(pidmap);
550 GOTO(out, rc = -ENOMEM);
552 pidmap->jp_pid = pid;
554 pidmap->jp_jobid[0] = '\0';
555 spin_lock_init(&pidmap->jp_lock);
556 INIT_HLIST_NODE(&pidmap->jp_hash);
558 * @pidmap might be reclaimed just after it is added into
559 * hash list, init @jp_refcount as 1 to make sure memory
560 * could be not freed during access.
562 atomic_set(&pidmap->jp_refcount, 1);
565 * Add the newly created map to the hash, on key collision we
566 * lost a racing addition and must destroy our newly allocated
567 * map. The object which exists in the hash will be returned.
569 pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid,
571 if (unlikely(pidmap != pidmap2)) {
572 CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n",
574 OBD_FREE_PTR(pidmap);
580 * If pidmap is old (this is always true for new entries) refresh it.
581 * If obd_jobid_var is not found, cache empty entry and try again
582 * later, to avoid repeat lookups for PID if obd_jobid_var missing.
584 spin_lock(&pidmap->jp_lock);
585 if (pidmap->jp_time + RESCAN_INTERVAL <= now) {
586 char env_jobid[LUSTRE_JOBID_SIZE] = "";
587 int env_len = sizeof(env_jobid);
589 pidmap->jp_time = now;
	/* drop the lock across the environment scan, which may sleep */
591 spin_unlock(&pidmap->jp_lock);
592 rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len);
594 CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n",
595 pidmap->jp_pid, env_jobid);
596 spin_lock(&pidmap->jp_lock);
598 pidmap->jp_joblen = env_len;
599 strlcpy(pidmap->jp_jobid, env_jobid,
600 sizeof(pidmap->jp_jobid));
602 } else if (rc == -ENOENT) {
603 /* It might have been deleted, clear out old entry */
604 pidmap->jp_joblen = 0;
605 pidmap->jp_jobid[0] = '\0';
610 * Regardless of how pidmap was found, if it contains a valid entry
611 * use that for now. If there was a technical error (e.g. -ENOMEM)
612 * use the old cached value until it can be looked up again properly.
613 * If a cached missing entry was found, return -ENOENT.
615 if (pidmap->jp_joblen) {
616 strlcpy(jobid, pidmap->jp_jobid, joblen);
617 joblen = pidmap->jp_joblen;
622 spin_unlock(&pidmap->jp_lock);
	/* release the lookup/findadd reference taken above */
624 cfs_hash_put(jobid_hash, &pidmap->jp_hash);
628 return rc < 0 ? rc : joblen;
632 * jobid_interpret_string()
634 * Interpret the jobfmt string to expand specified fields, like coredumps do:
638 * %H = short hostname
639 * %j = jobid from environment
643 * Unknown escape strings are dropped. Other characters are copied through,
644 * excluding whitespace (to avoid making jobid parsing difficult).
646 * Return: -EOVERFLOW if the expanded string does not fit within @joblen
/*
 * @jobfmt: format template (e.g. obd_jobid_name); supported escapes
 *          visible here: %e %g %h %H %j %p %u
 * @jobid:  output buffer, advanced as each expansion is written
 */
649 static int jobid_interpret_string(const char *jobfmt, char *jobid,
	/* consume the format one character at a time while space remains */
654 while ((c = *jobfmt++) && joblen > 1) {
658 if (isspace(c)) /* Don't allow embedded spaces */
669 switch ((f = *jobfmt++)) {
670 case 'e': /* executable name */
671 l = snprintf(jobid, joblen, "%s", current->comm);
673 case 'g': /* group ID */
674 l = snprintf(jobid, joblen, "%u",
675 from_kgid(&init_user_ns, current_fsgid()));
677 case 'h': /* hostname */
678 l = snprintf(jobid, joblen, "%s",
679 init_utsname()->nodename);
681 case 'H': /* short hostname. Cut at first dot */
682 l = snprintf(jobid, joblen, "%s",
683 init_utsname()->nodename);
684 p = strnchr(jobid, joblen, '.');
690 case 'j': /* jobid stored in process environment */
691 l = jobid_get_from_cache(jobid, joblen);
695 case 'p': /* process ID */
696 l = snprintf(jobid, joblen, "%u", current->pid);
698 case 'u': /* user ID */
699 l = snprintf(jobid, joblen, "%u",
700 from_kuid(&init_user_ns, current_fsuid()));
702 case '\0': /* '%' at end of format string */
705 default: /* drop unknown %x format strings */
713 * This points at the end of the buffer, so long as jobid is always
714 * incremented the same amount as joblen is decremented.
717 jobid[joblen - 1] = '\0';
719 return joblen < 0 ? -EOVERFLOW : 0;
723 * Hash initialization, copied from server-side job stats bucket sizes
725 #define HASH_JOBID_BKT_BITS 5
726 #define HASH_JOBID_CUR_BITS 7
727 #define HASH_JOBID_MAX_BITS 12
/*
 * Create the PID->jobid cfs_hash and the per-session rhashtable.
 * Returns 0 on success or a negative errno; on rhashtable_init()
 * failure the cfs_hash reference is dropped again.
 */
729 int jobid_cache_init(void)
737 spin_lock_init(&jobid_hash_lock);
738 jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS,
739 HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS,
740 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
741 &jobid_hash_ops, CFS_HASH_DEFAULT);
745 rc = rhashtable_init(&session_jobids, &jobid_params);
	/* undo the cfs_hash creation if the session table failed */
747 cfs_hash_putref(jobid_hash);
754 EXPORT_SYMBOL(jobid_cache_init);
/*
 * Tear down both jobid tables: detach jobid_hash under the lock,
 * stop the prune work, drain remaining PID entries, then destroy the
 * per-session rhashtable (jobid_free drops each pid reference).
 */
756 void jobid_cache_fini(void)
758 struct cfs_hash *tmp_hash;
	/* detach the global pointer so no new lookups can start */
761 spin_lock(&jobid_hash_lock);
762 tmp_hash = jobid_hash;
764 spin_unlock(&jobid_hash_lock);
	/* ensure no prune pass runs concurrently with destruction */
766 cancel_delayed_work_sync(&jobid_prune_work);
768 if (tmp_hash != NULL) {
	/* NULL jobid matches the "free all unreferenced" predicate branch */
769 cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL);
770 cfs_hash_putref(tmp_hash);
772 rhashtable_free_and_destroy(&session_jobids, jobid_free, NULL);
778 EXPORT_SYMBOL(jobid_cache_fini);
781 * Hash operations for pid<->jobid
/* hash the pid_t key into a bucket index (mask parameter declared on a
 * continuation line not visible here) */
783 static unsigned jobid_hashfn(struct cfs_hash *hs, const void *key,
786 return cfs_hash_djb2_hash(key, sizeof(pid_t), mask);
/* return a pointer to the entry's pid key for hash comparisons */
789 static void *jobid_key(struct hlist_node *hnode)
791 struct jobid_pid_map *pidmap;
793 pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
794 return &pidmap->jp_pid;
/* compare a lookup key against a hashed entry's pid; non-zero on match */
797 static int jobid_keycmp(const void *key, struct hlist_node *hnode)
799 const pid_t *pid_key1;
800 const pid_t *pid_key2;
802 LASSERT(key != NULL);
803 pid_key1 = (pid_t *)key;
804 pid_key2 = (pid_t *)jobid_key(hnode);
806 return *pid_key1 == *pid_key2;
/* map a hash-list node back to its containing jobid_pid_map */
809 static void *jobid_object(struct hlist_node *hnode)
811 return hlist_entry(hnode, struct jobid_pid_map, jp_hash);
/* take a reference on the entry when the hash hands it out */
814 static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode)
816 struct jobid_pid_map *pidmap;
818 pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
820 atomic_inc(&pidmap->jp_refcount);
/* drop a reference; free the entry when the last reference goes away */
823 static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
825 struct jobid_pid_map *pidmap;
830 pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
831 LASSERT(atomic_read(&pidmap->jp_refcount) > 0);
832 if (atomic_dec_and_test(&pidmap->jp_refcount)) {
833 CDEBUG(D_INFO, "Freeing: %d->%s\n",
834 pidmap->jp_pid, pidmap->jp_jobid);
836 OBD_FREE_PTR(pidmap);
/* cfs_hash operations table for the PID->jobid cache; put and
 * put_locked share the same handler since freeing needs no extra lock */
840 static struct cfs_hash_ops jobid_hash_ops = {
841 .hs_hash = jobid_hashfn,
842 .hs_keycmp = jobid_keycmp,
844 .hs_object = jobid_object,
846 .hs_put = jobid_put_locked,
847 .hs_put_locked = jobid_put_locked,
851 * Generate the job identifier string for this process for tracking purposes.
853 * Fill in @jobid string based on the value of obd_jobid_var:
854 * JOBSTATS_DISABLE: none
855 * JOBSTATS_NODELOCAL: content of obd_jobid_name (jobid_interpret_string())
856 * JOBSTATS_PROCNAME_UID: process name/UID
857 * JOBSTATS_SESSION per-session value set by
858 * /sys/fs/lustre/jobid_this_session
859 * anything else: look up obd_jobid_var in the processes environment
861 * Return -ve error number, 0 on success.
863 int lustre_get_jobid(char *jobid, size_t joblen)
	/* need room for at least one character plus the NUL terminator */
868 if (unlikely(joblen < 2)) {
874 if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) {
875 /* Jobstats isn't enabled */
876 memset(jobid, 0, joblen);
877 } else if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
878 /* Whole node dedicated to single job */
879 rc = jobid_interpret_string(obd_jobid_name, jobid, joblen);
880 } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
881 rc = jobid_interpret_string("%e.%u", jobid, joblen);
882 } else if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0 ||
	/* skip cache lookup for Lustre's own service threads */
883 jobid_name_is_valid(current->comm)) {
885 * per-process jobid wanted, either from environment or from
886 * per-session setting.
887 * If obd_jobid_name contains "%j" or if getting the per-process
888 * jobid directly fails, fall back to using obd_jobid_name.
	/* without "%j" in the template, ask the cache/session directly */
891 if (!strnstr(obd_jobid_name, "%j", joblen))
892 rc = jobid_get_from_cache(jobid, joblen);
894 /* fall back to jobid_name if jobid_var not available */
896 int rc2 = jobid_interpret_string(obd_jobid_name,
905 EXPORT_SYMBOL(lustre_get_jobid);
910 * Search cache for JobID given by @find_jobid.
911 * If any entries in the hash table match the value, they are removed
913 void lustre_jobid_clear(const char *find_jobid)
915 char jobid[LUSTRE_JOBID_SIZE];
	/* nothing to clear if the cache was never initialized */
918 if (jobid_hash == NULL)
	/* work on a bounded local copy of the caller's string */
921 strlcpy(jobid, find_jobid, sizeof(jobid));
922 /* trim \n off the end of the incoming jobid */
923 end = strchr(jobid, '\n');
924 if (end && *end == '\n')
927 CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid);
	/* predicate frees entries whose jp_jobid matches (or are stale) */
928 cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid);
930 CDEBUG(D_INFO, "%d items remain in jobID table\n",
931 atomic_read(&jobid_hash->hs_count));