Whamcloud - gitweb
LU-17662 osd-zfs: Support for ZFS 2.2.3
[fs/lustre-release.git] / lustre / obdclass / jobid.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2014, Intel Corporation.
27  *
28  * Copyright 2017 Cray Inc, all rights reserved.
29  * Author: Ben Evans.
30  *
31  * Store PID->JobID mappings
32  */
33
34 #define DEBUG_SUBSYSTEM S_RPC
35 #include <linux/user_namespace.h>
36 #include <linux/uidgid.h>
37 #include <linux/utsname.h>
38
39 #include <libcfs/libcfs.h>
40 #include <obd_support.h>
41 #include <obd_class.h>
42 #include <lustre_net.h>
43
/* PID -> struct jobid_pid_map cache; created in jobid_cache_init() */
static struct cfs_hash *jobid_hash;
/* Callbacks used by jobid_hash (hash, key compare, get/put) */
static struct cfs_hash_ops jobid_hash_ops;
/* Held while updating the cache expiry timestamp and while clearing
 * jobid_hash in jobid_cache_fini()
 */
spinlock_t jobid_hash_lock;

/* Seconds a cached PID entry is trusted before the environment is re-read */
#define RESCAN_INTERVAL 30
/* Seconds between expiry scans of the cache, and the age after which a
 * cached entry is dropped by jobid_should_free_item()
 */
#define DELETE_INTERVAL 300

/* Selects the jobid source: compared against JOBSTATS_SESSION, otherwise
 * used as the name of the environment variable to look up
 */
char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
/* Format string with %-escapes (see jobid_interpret_string()) */
char obd_jobid_name[LUSTRE_JOBID_SIZE] = "%e.%u";
53
54 /**
55  * Structure to store a single PID->JobID mapping
56  */
struct jobid_pid_map {
        struct hlist_node       jp_hash;     /* linkage in jobid_hash */
        time64_t                jp_time;     /* wall-clock seconds of the last
                                              * environment scan; 0 when new
                                              */
        spinlock_t              jp_lock; /* protects jp_jobid */
        char                    jp_jobid[LUSTRE_JOBID_SIZE]; /* cached value */
        unsigned int            jp_joblen;   /* length of jp_jobid; 0 means
                                              * "no jobid found" is cached
                                              */
        struct kref             jp_refcount; /* freed when this drops to 0 */
        pid_t                   jp_pid;      /* hash key */
};
66
67 /*
68  * Jobid can be set for a session (see setsid(2)) by writing to
69  * a sysfs file from any process in that session.
70  * The jobids are stored in a hash table indexed by the relevant
71  * struct pid.  We periodically look for entries where the pid has
72  * no PIDTYPE_SID tasks any more, and prune them.  This happens within
73  * 5 seconds of a jobid being added, and every 5 minutes when jobids exist,
74  * but none are added.
75  */
76 #define JOBID_EXPEDITED_CLEAN (5)
77 #define JOBID_BACKGROUND_CLEAN (5 * 60)
78
struct session_jobid {
        struct pid              *sj_session; /* hash key; a reference is held
                                              * (get_pid) while the entry lives
                                              */
        struct rhash_head       sj_linkage;  /* linkage in session_jobids */
        struct rcu_head         sj_rcu;      /* used by kfree_rcu() */
        /* NUL-terminated jobid.  Entries are allocated as
         * sizeof(struct session_jobid) + strlen(jobid), so the single
         * declared byte accounts for the terminating NUL.
         */
        char                    sj_jobid[1];
};
85
/*
 * rhashtable layout for session_jobids: entries are keyed by the value of
 * the sj_session pointer itself (key_len is the size of a pointer), not by
 * the contents of the struct pid it points to.
 */
static const struct rhashtable_params jobid_params = {
        .key_len        = sizeof(struct pid *),
        .key_offset     = offsetof(struct session_jobid, sj_session),
        .head_offset    = offsetof(struct session_jobid, sj_linkage),
};

/* Per-session jobids; initialised in jobid_cache_init() */
static struct rhashtable session_jobids;
93
94 /*
95  * jobid_current must be called with rcu_read_lock held.
96  * if it returns non-NULL, the string can only be used
97  * until rcu_read_unlock is called.
98  */
99 char *jobid_current(void)
100 {
101         struct pid *sid = task_session(current);
102         struct session_jobid *sj;
103
104         sj = rhashtable_lookup_fast(&session_jobids, &sid, jobid_params);
105         if (sj)
106                 return sj->sj_jobid;
107         return NULL;
108 }
109
110 static void jobid_prune_expedite(void);
111 /*
112  * jobid_set_current will try to add a new entry
113  * to the table.  If one exists with the same key, the
114  * jobid will be replaced
115  */
116 int jobid_set_current(char *jobid)
117 {
118         struct pid *sid;
119         struct session_jobid *sj, *origsj;
120         int ret;
121         int len = strlen(jobid);
122
123         sj = kmalloc(sizeof(*sj) + len, GFP_KERNEL);
124         if (!sj)
125                 return -ENOMEM;
126         rcu_read_lock();
127         sid = task_session(current);
128         sj->sj_session = get_pid(sid);
129         strncpy(sj->sj_jobid, jobid, len+1);
130         origsj = rhashtable_lookup_get_insert_fast(&session_jobids,
131                                                    &sj->sj_linkage,
132                                                    jobid_params);
133         if (origsj == NULL) {
134                 /* successful insert */
135                 rcu_read_unlock();
136                 jobid_prune_expedite();
137                 return 0;
138         }
139
140         if (IS_ERR(origsj)) {
141                 put_pid(sj->sj_session);
142                 kfree(sj);
143                 rcu_read_unlock();
144                 return PTR_ERR(origsj);
145         }
146         ret = rhashtable_replace_fast(&session_jobids,
147                                       &origsj->sj_linkage,
148                                       &sj->sj_linkage,
149                                       jobid_params);
150         if (ret) {
151                 put_pid(sj->sj_session);
152                 kfree(sj);
153                 rcu_read_unlock();
154                 return ret;
155         }
156         put_pid(origsj->sj_session);
157         rcu_read_unlock();
158         kfree_rcu(origsj, sj_rcu);
159         jobid_prune_expedite();
160
161         return 0;
162 }
163
164 static void jobid_free(void *vsj, void *arg)
165 {
166         struct session_jobid *sj = vsj;
167
168         put_pid(sj->sj_session);
169         kfree(sj);
170 }
171
172 static void jobid_prune(struct work_struct *work);
173 static DECLARE_DELAYED_WORK(jobid_prune_work, jobid_prune);
174 static int jobid_prune_expedited;
175 static void jobid_prune(struct work_struct *work)
176 {
177         int remaining = 0;
178         struct rhashtable_iter iter;
179         struct session_jobid *sj;
180
181         jobid_prune_expedited = 0;
182         rhashtable_walk_enter(&session_jobids, &iter);
183         rhashtable_walk_start(&iter);
184         while ((sj = rhashtable_walk_next(&iter)) != NULL) {
185                 if (IS_ERR(sj)) {
186                         if (PTR_ERR(sj) == -EAGAIN)
187                                 continue;
188                         break;
189                 }
190                 if (!hlist_empty(&sj->sj_session->tasks[PIDTYPE_SID])) {
191                         remaining++;
192                         continue;
193                 }
194                 if (rhashtable_remove_fast(&session_jobids,
195                                            &sj->sj_linkage,
196                                            jobid_params) == 0) {
197                         put_pid(sj->sj_session);
198                         kfree_rcu(sj, sj_rcu);
199                 }
200         }
201         rhashtable_walk_stop(&iter);
202         rhashtable_walk_exit(&iter);
203         if (remaining)
204                 schedule_delayed_work(&jobid_prune_work,
205                                       cfs_time_seconds(JOBID_BACKGROUND_CLEAN));
206 }
207
208 static void jobid_prune_expedite(void)
209 {
210         if (!jobid_prune_expedited) {
211                 jobid_prune_expedited = 1;
212                 mod_delayed_work(system_wq, &jobid_prune_work,
213                                  cfs_time_seconds(JOBID_EXPEDITED_CLEAN));
214         }
215 }
216
/*
 * Copy @len bytes between the kernel buffer @buf and the user address @addr
 * in @mm, one page at a time.  @write selects the direction: non-zero copies
 * from @buf into the process, zero copies from the process into @buf.
 * @tsk is only used by the oldest get_user_pages() calling convention.
 *
 * Return: number of bytes actually transferred (may be short if a page
 *         could not be pinned), or -EDEADLK if the mmap lock could not be
 *         taken without blocking.
 */
static int cfs_access_process_vm(struct task_struct *tsk,
                                 struct mm_struct *mm,
                                 unsigned long addr,
                                 void *buf, int len, int write)
{
        /* Just copied from the kernel, for kernels which don't
         * have access_process_vm() exported
         */
        struct vm_area_struct *vma = NULL;
        struct page *page;
        void *old_buf = buf;

        /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
         * which is already holding mmap_sem for writes.  A trylock is
         * used, so if the lock is unavailable this returns -EDEADLK
         * rather than waiting.  LU-1735
         */
        if (!mmap_read_trylock(mm))
                return -EDEADLK;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, rc, offset;
                void *maddr;

                /* Pin exactly one page; the signature of get_user_pages()
                 * differs between kernel versions, hence the variants.
                 */
#if defined(HAVE_GET_USER_PAGES_WITHOUT_VMA)
                rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page);
                /* NOTE(review): if the page was pinned but vma_lookup()
                 * returns NULL, the loop breaks below without put_page() -
                 * confirm whether that case can occur.
                 */
                if (rc > 0)
                        vma = vma_lookup(mm, addr);
#elif defined(HAVE_GET_USER_PAGES_GUP_FLAGS)
                rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page,
                                    &vma);
#elif defined(HAVE_GET_USER_PAGES_6ARG)
                rc = get_user_pages(addr, 1, write, 1, &page, &vma);
#else
                rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma);
#endif
                if (rc <= 0 || !vma)
                        break;

                /* Clamp this round to the end of the current page */
                bytes = len;
                offset = addr & (PAGE_SIZE-1);
                if (bytes > PAGE_SIZE-offset)
                        bytes = PAGE_SIZE-offset;

                maddr = kmap(page);
                if (write) {
                        copy_to_user_page(vma, page, addr,
                                          maddr + offset, buf, bytes);
                        set_page_dirty_lock(page);
                } else {
                        copy_from_user_page(vma, page, addr,
                                            buf, maddr + offset, bytes);
                }
                kunmap(page);
                put_page(page);
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        /* @buf only advances on successful copies */
        return buf - old_buf;
}
281
282 /* Read the environment variable of current process specified by @key. */
283 static int cfs_get_environ(const char *key, char *value, int *val_len)
284 {
285         struct mm_struct *mm;
286         char *buffer;
287         int buf_len = PAGE_SIZE;
288         int key_len = strlen(key);
289         unsigned long addr;
290         int rc;
291         bool skip = false;
292
293         ENTRY;
294         buffer = kmalloc(buf_len, GFP_USER);
295         if (!buffer)
296                 RETURN(-ENOMEM);
297
298         mm = get_task_mm(current);
299         if (!mm) {
300                 kfree(buffer);
301                 RETURN(-EINVAL);
302         }
303
304         addr = mm->env_start;
305         while (addr < mm->env_end) {
306                 int this_len, retval, scan_len;
307                 char *env_start, *env_end;
308
309                 memset(buffer, 0, buf_len);
310
311                 this_len = min_t(int, mm->env_end - addr, buf_len);
312                 retval = cfs_access_process_vm(current, mm, addr, buffer,
313                                                this_len, 0);
314                 if (retval < 0)
315                         GOTO(out, rc = retval);
316                 else if (retval != this_len)
317                         break;
318
319                 addr += retval;
320
321                 /* Parse the buffer to find out the specified key/value pair.
322                  * The "key=value" entries are separated by '\0'.
323                  */
324                 env_start = buffer;
325                 scan_len = this_len;
326                 while (scan_len) {
327                         char *entry;
328                         int entry_len;
329
330                         env_end = memscan(env_start, '\0', scan_len);
331                         LASSERT(env_end >= env_start &&
332                                 env_end <= env_start + scan_len);
333
334                         /* The last entry of this buffer cross the buffer
335                          * boundary, reread it in next cycle.
336                          */
337                         if (unlikely(env_end - env_start == scan_len)) {
338                                 /* Just skip the entry larger than page size,
339                                  * it can't be jobID env variable.
340                                  */
341                                 if (unlikely(scan_len == this_len))
342                                         skip = true;
343                                 else
344                                         addr -= scan_len;
345                                 break;
346                         } else if (unlikely(skip)) {
347                                 skip = false;
348                                 goto skip;
349                         }
350                         entry = env_start;
351                         entry_len = env_end - env_start;
352                         CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry);
353
354                         /* Key length + length of '=' */
355                         if (entry_len > key_len + 1 &&
356                             entry[key_len] == '='  &&
357                             !memcmp(entry, key, key_len)) {
358                                 entry += key_len + 1;
359                                 entry_len -= key_len + 1;
360
361                                 /* The 'value' buffer passed in is too small.
362                                  * Copy what fits, but return -EOVERFLOW.
363                                  */
364                                 if (entry_len >= *val_len) {
365                                         memcpy(value, entry, *val_len);
366                                         value[*val_len - 1] = 0;
367                                         GOTO(out, rc = -EOVERFLOW);
368                                 }
369
370                                 memcpy(value, entry, entry_len);
371                                 *val_len = entry_len;
372                                 GOTO(out, rc = 0);
373                         }
374 skip:
375                         scan_len -= (env_end - env_start + 1);
376                         env_start = env_end + 1;
377                 }
378         }
379         GOTO(out, rc = -ENOENT);
380
381 out:
382         mmput(mm);
383         kfree((void *)buffer);
384         return rc;
385 }
386
387 /*
388  * Get jobid of current process by reading the environment variable
389  * stored in between the "env_start" & "env_end" of task struct.
390  *
391  * If some job scheduler doesn't store jobid in the "env_start/end",
392  * then an upcall could be issued here to get the jobid by utilizing
393  * the userspace tools/API. Then, the jobid must be cached.
394  */
395 static int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len)
396 {
397         int rc;
398
399         rc = cfs_get_environ(jobid_var, jobid, jobid_len);
400         if (!rc)
401                 goto out;
402
403         if (rc == -EOVERFLOW) {
404                 /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
405                  * variable length strings instead of just numbers), it
406                  * might make sense to keep the unique parts for JobID,
407                  * instead of just returning an error.  That means a
408                  * larger temp buffer for cfs_get_environ(), then
409                  * truncating the string at some separator to fit into
410                  * the specified jobid_len.  Fix later if needed. */
411                 static ktime_t printed;
412
413                 if (unlikely(ktime_to_ns(printed) == 0 ||
414                              ktime_after(ktime_get(),
415                                          ktime_add_ns(printed,
416                                              3600ULL * 24 * NSEC_PER_SEC)))) {
417                         LCONSOLE_WARN("jobid: '%s' value too large (%d)\n",
418                                       obd_jobid_var, *jobid_len);
419                         printed = ktime_get();
420                 }
421
422                 rc = 0;
423         } else {
424                 CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL ||
425                               rc == -EDEADLK) ? D_INFO : D_ERROR,
426                              "jobid: get '%s' failed: rc = %d\n",
427                              obd_jobid_var, rc);
428         }
429
430 out:
431         return rc;
432 }
433
434 /*
435  * jobid_should_free_item
436  *
437  * Each item is checked to see if it should be released
438  * Removed from hash table by caller
439  * Actually freed in jobid_put_locked
440  *
441  * Returns 1 if item is to be freed, 0 if it is to be kept
442  */
443
444 static int jobid_should_free_item(void *obj, void *data)
445 {
446         char *jobid = data;
447         struct jobid_pid_map *pidmap = obj;
448         int rc = 0;
449
450         if (obj == NULL)
451                 return 0;
452
453         if (jobid == NULL) {
454                 WARN_ON_ONCE(kref_read(&pidmap->jp_refcount) != 1);
455                 return 1;
456         }
457
458         spin_lock(&pidmap->jp_lock);
459         /* prevent newly inserted items from deleting */
460         if (jobid[0] == '\0' && kref_read(&pidmap->jp_refcount) == 1)
461                 rc = 1;
462         else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL)
463                 rc = 1;
464         else if (strcmp(pidmap->jp_jobid, jobid) == 0)
465                 rc = 1;
466         spin_unlock(&pidmap->jp_lock);
467
468         return rc;
469 }
470
/*
 * jobid_name_is_valid
 *
 * Rejects jobids that are empty or that begin with one of the reserved
 * thread-name prefixes below (Lustre service threads and kernel workers).
 * The comparison is a prefix match, so e.g. "kworker/0:0" is rejected.
 *
 * Returns true if jobid is valid
 * Returns false if jobid is empty or looks like it's a Lustre process
 */
static bool jobid_name_is_valid(char *jobid)
{
        const char *const reserved_prefixes[] = { "ll_ping", "ptlrpc",
                                                  "ldlm", "ll_sa", "kworker",
                                                  "kswapd", "writeback", "irq",
                                                  "ksoftirq", NULL };
        const char *const *prefix;

        if (*jobid == '\0')
                return false;

        for (prefix = reserved_prefixes; *prefix != NULL; prefix++) {
                if (!strncmp(jobid, *prefix, strlen(*prefix)))
                        return false;
        }

        return true;
}
497
/*
 * jobid_get_from_cache()
 *
 * Returns contents of jobid_var from process environment for current PID,
 * or from the per-session jobid table when obd_jobid_var is
 * JOBSTATS_SESSION.
 * Values fetched from the process environment are cached per PID for
 * RESCAN_INTERVAL seconds to avoid the overhead of scanning the
 * environment on every call.
 *
 * @jobid:  output buffer
 * @joblen: size of @jobid
 *
 * Return: -ENOMEM if allocating a new pidmap fails
 *         -ENOENT if no entry could be found
 *         other negative errno if the environment could not be read and
 *                 nothing is cached
 *         +ve string length for success (something was returned in jobid)
 */
static int jobid_get_from_cache(char *jobid, size_t joblen)
{
        static time64_t last_expire;
        bool expire_cache = false;
        pid_t pid = current->pid;
        struct jobid_pid_map *pidmap = NULL;
        time64_t now = ktime_get_real_seconds();
        int rc = 0;
        ENTRY;

        /* Per-session mode: the PID cache is not used at all */
        if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) {
                char *jid;

                /* jid is only valid until rcu_read_unlock(), so copy it */
                rcu_read_lock();
                jid = jobid_current();
                if (jid) {
                        strscpy(jobid, jid, joblen);
                        joblen = strlen(jobid);
                } else {
                        rc = -ENOENT;
                }
                rcu_read_unlock();
                GOTO(out, rc);
        }

        LASSERT(jobid_hash != NULL);

        /* scan hash periodically to remove old PID entries from cache;
         * the lock ensures only one caller wins each expiry period
         */
        spin_lock(&jobid_hash_lock);
        if (unlikely(last_expire + DELETE_INTERVAL <= now)) {
                expire_cache = true;
                last_expire = now;
        }
        spin_unlock(&jobid_hash_lock);

        /* The jobid passed here is non-empty and presumably matches no real
         * entry, so jobid_should_free_item() effectively applies only its
         * age test.
         */
        if (expire_cache)
                cfs_hash_cond_del(jobid_hash, jobid_should_free_item,
                                  "intentionally_bad_jobid");

        /* first try to find PID in the hash and use that value */
        pidmap = cfs_hash_lookup(jobid_hash, &pid);
        if (pidmap == NULL) {
                struct jobid_pid_map *pidmap2;

                OBD_ALLOC_PTR(pidmap);
                if (pidmap == NULL)
                        GOTO(out, rc = -ENOMEM);

                pidmap->jp_pid = pid;
                /* jp_time of 0 forces a refresh below */
                pidmap->jp_time = 0;
                pidmap->jp_jobid[0] = '\0';
                spin_lock_init(&pidmap->jp_lock);
                INIT_HLIST_NODE(&pidmap->jp_hash);
                /*
                 * @pidmap might be reclaimed just after it is added into
                 * hash list, init @jp_refcount as 1 to make sure memory
                 * cannot be freed during access.
                 */
                kref_init(&pidmap->jp_refcount);

                /*
                 * Add the newly created map to the hash, on key collision we
                 * lost a racing addition and must destroy our newly allocated
                 * map.  The object which exists in the hash will be returned.
                 */
                pidmap2 = cfs_hash_findadd_unique(jobid_hash, &pid,
                                                  &pidmap->jp_hash);
                if (unlikely(pidmap != pidmap2)) {
                        CDEBUG(D_INFO, "jobid: duplicate found for PID=%u\n",
                               pid);
                        OBD_FREE_PTR(pidmap);
                        pidmap = pidmap2;
                }
        }

        /*
         * If pidmap is old (this is always true for new entries) refresh it.
         * If obd_jobid_var is not found, cache empty entry and try again
         * later, to avoid repeat lookups for PID if obd_jobid_var missing.
         */
        spin_lock(&pidmap->jp_lock);
        if (pidmap->jp_time + RESCAN_INTERVAL <= now) {
                char env_jobid[LUSTRE_JOBID_SIZE] = "";
                int env_len = sizeof(env_jobid);

                /* Stamp the entry before dropping the lock so concurrent
                 * callers see it as fresh and do not rescan as well.
                 */
                pidmap->jp_time = now;

                /* The environment scan allocates memory and takes the mmap
                 * lock, so it is done without jp_lock held.
                 */
                spin_unlock(&pidmap->jp_lock);
                rc = jobid_get_from_environ(obd_jobid_var, env_jobid, &env_len);

                CDEBUG(D_INFO, "jobid: PID mapping established: %d->%s\n",
                       pidmap->jp_pid, env_jobid);
                spin_lock(&pidmap->jp_lock);
                if (!rc) {
                        pidmap->jp_joblen = env_len;
                        strscpy(pidmap->jp_jobid, env_jobid,
                                sizeof(pidmap->jp_jobid));
                        rc = 0;
                } else if (rc == -ENOENT) {
                        /* It might have been deleted, clear out old entry */
                        pidmap->jp_joblen = 0;
                        pidmap->jp_jobid[0] = '\0';
                }
        }

        /*
         * Regardless of how pidmap was found, if it contains a valid entry
         * use that for now.  If there was a technical error (e.g. -ENOMEM)
         * use the old cached value until it can be looked up again properly.
         * If a cached missing entry was found, return -ENOENT.
         */
        if (pidmap->jp_joblen) {
                strscpy(jobid, pidmap->jp_jobid, joblen);
                /* Report the cached length, even if @jobid was too small
                 * and the copy above was truncated.
                 */
                joblen = pidmap->jp_joblen;
                rc = 0;
        } else if (!rc) {
                rc = -ENOENT;
        }
        spin_unlock(&pidmap->jp_lock);

        /* Drop the reference taken by the lookup/insert above */
        cfs_hash_put(jobid_hash, &pidmap->jp_hash);

        EXIT;
out:
        return rc < 0 ? rc : joblen;
}
636
637 /*
638  * jobid_print_current_comm()
639  *
640  * Print current comm name into the provided jobid buffer, and trim names of
641  * kernel threads like "kworker/0:0" to "kworker" or "ll_sa_12345" to "ll_sa"
642  *
643  * Return: number of chars printed to jobid
644  */
645 static int jobid_print_current_comm(char *jobid, ssize_t joblen)
646 {
647         const char *const names[] = {"kworker", "kswapd", "ll_sa", "ll_agl",
648                                      "ldlm_bl", NULL};
649         int i;
650
651         if (current->flags & PF_KTHREAD) {
652                 for (i = 0; names[i] != NULL; i++) {
653                         if (strncmp(current->comm, names[i],
654                                     strlen(names[i])) == 0)
655                                 return snprintf(jobid, joblen, "%s", names[i]);
656                 }
657         }
658
659         return snprintf(jobid, joblen, "%s", current->comm);
660 }
661
/*
 * jobid_interpret_string()
 *
 * Interpret the jobfmt string to expand specified fields, like coredumps do:
 *   %e = executable
 *   %g = gid
 *   %h = hostname
 *   %H = short hostname
 *   %j = jobid from environment
 *   %p = pid
 *   %u = uid
 *
 * Unknown escape strings are dropped.  Other characters are copied through,
 * excluding whitespace (to avoid making jobid parsing difficult).
 *
 * @jobfmt: format string to expand
 * @jobid:  output buffer
 * @joblen: size of @jobid; the code assumes it is at least 1
 *
 * Return: -EOVERFLOW if the expanded string does not fit within @joblen
 *         0 for success
 */
static int jobid_interpret_string(const char *jobfmt, char *jobid,
                                  ssize_t joblen)
{
        char c;

        /* Stop once only the byte reserved for the terminator remains */
        while ((c = *jobfmt++) && joblen > 1) {
                char f, *p;
                int l;

                if (isspace(c)) /* Don't allow embedded spaces */
                        continue;

                if (c != '%') {
                        /* Literal character: copy it and keep the output
                         * terminated as we go.
                         */
                        *jobid = c;
                        joblen--;
                        jobid++;
                        *jobid = '\0';
                        continue;
                }

                /* In each case below 'l' is the length the expansion wanted,
                 * which may exceed the space left; joblen then goes negative
                 * and -EOVERFLOW is returned at the end.
                 */
                switch ((f = *jobfmt++)) {
                case 'e': /* executable name */
                        l = jobid_print_current_comm(jobid, joblen);
                        break;
                case 'g': /* group ID */
                        l = snprintf(jobid, joblen, "%u",
                                     from_kgid(&init_user_ns, current_fsgid()));
                        break;
                case 'h': /* hostname */
                        l = snprintf(jobid, joblen, "%s",
                                     init_utsname()->nodename);
                        break;
                case 'H': /* short hostname. Cut at first dot */
                        l = snprintf(jobid, joblen, "%s",
                                     init_utsname()->nodename);
                        p = strnchr(jobid, joblen, '.');
                        if (p) {
                                *p = '\0';
                                l = p - jobid;
                        }
                        break;
                case 'j': /* jobid stored in process environment */
                        l = jobid_get_from_cache(jobid, joblen);
                        /* A lookup failure expands to nothing */
                        if (l < 0)
                                l = 0;
                        break;
                case 'p': /* process ID */
                        l = snprintf(jobid, joblen, "%u", current->pid);
                        break;
                case 'u': /* user ID */
                        l = snprintf(jobid, joblen, "%u",
                                     from_kuid(&init_user_ns, current_fsuid()));
                        break;
                case '\0': /* '%' at end of format string */
                        l = 0;
                        goto out;
                default: /* drop unknown %x format strings */
                        l = 0;
                        break;
                }
                jobid += l;
                joblen -= l;
        }
        /*
         * This points at the end of the buffer, so long as jobid is always
         * incremented the same amount as joblen is decremented.
         */
out:
        jobid[joblen - 1] = '\0';

        return joblen < 0 ? -EOVERFLOW : 0;
}
752
753 /*
754  * Hash initialization, copied from server-side job stats bucket sizes
755  */
756 #define HASH_JOBID_BKT_BITS 5
757 #define HASH_JOBID_CUR_BITS 7
758 #define HASH_JOBID_MAX_BITS 12
759
760 int jobid_cache_init(void)
761 {
762         int rc = 0;
763         ENTRY;
764
765         if (jobid_hash)
766                 return 0;
767
768         spin_lock_init(&jobid_hash_lock);
769         jobid_hash = cfs_hash_create("JOBID_HASH", HASH_JOBID_CUR_BITS,
770                                      HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS,
771                                      0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
772                                      &jobid_hash_ops, CFS_HASH_DEFAULT);
773         if (!jobid_hash) {
774                 rc = -ENOMEM;
775         } else {
776                 rc = rhashtable_init(&session_jobids, &jobid_params);
777                 if (rc) {
778                         cfs_hash_putref(jobid_hash);
779                         jobid_hash = NULL;
780                 }
781         }
782
783         RETURN(rc);
784 }
785 EXPORT_SYMBOL(jobid_cache_init);
786
787 void jobid_cache_fini(void)
788 {
789         struct cfs_hash *tmp_hash;
790         ENTRY;
791
792         spin_lock(&jobid_hash_lock);
793         tmp_hash = jobid_hash;
794         jobid_hash = NULL;
795         spin_unlock(&jobid_hash_lock);
796
797         cancel_delayed_work_sync(&jobid_prune_work);
798
799         if (tmp_hash != NULL) {
800                 cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL);
801                 cfs_hash_putref(tmp_hash);
802
803                 rhashtable_free_and_destroy(&session_jobids, jobid_free, NULL);
804         }
805
806
807         EXIT;
808 }
809 EXPORT_SYMBOL(jobid_cache_fini);
810
811 /*
812  * Hash operations for pid<->jobid
813  */
814 static unsigned int
815 jobid_hashfn(struct cfs_hash *hs, const void *key, const unsigned int bits)
816 {
817         return cfs_hash_djb2_hash(key, sizeof(pid_t), bits);
818 }
819
820 static void *jobid_key(struct hlist_node *hnode)
821 {
822         struct jobid_pid_map *pidmap;
823
824         pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
825         return &pidmap->jp_pid;
826 }
827
828 static int jobid_keycmp(const void *key, struct hlist_node *hnode)
829 {
830         const pid_t *pid_key1;
831         const pid_t *pid_key2;
832
833         LASSERT(key != NULL);
834         pid_key1 = (pid_t *)key;
835         pid_key2 = (pid_t *)jobid_key(hnode);
836
837         return *pid_key1 == *pid_key2;
838 }
839
840 static void *jobid_object(struct hlist_node *hnode)
841 {
842         return hlist_entry(hnode, struct jobid_pid_map, jp_hash);
843 }
844
845 static void jobid_get(struct cfs_hash *hs, struct hlist_node *hnode)
846 {
847         struct jobid_pid_map *pidmap;
848
849         pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
850
851         kref_get(&pidmap->jp_refcount);
852 }
853
854 static void jobid_put_locked_free(struct kref *kref)
855 {
856         struct jobid_pid_map *pidmap = container_of(kref, struct jobid_pid_map,
857                                                     jp_refcount);
858
859         CDEBUG(D_INFO, "Freeing: %d->%s\n", pidmap->jp_pid, pidmap->jp_jobid);
860         OBD_FREE_PTR(pidmap);
861 }
862
863 static void jobid_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
864 {
865         struct jobid_pid_map *pidmap;
866
867         if (hnode == NULL)
868                 return;
869
870         pidmap = hlist_entry(hnode, struct jobid_pid_map, jp_hash);
871         LASSERT(kref_read(&pidmap->jp_refcount) > 0);
872         kref_put(&pidmap->jp_refcount, jobid_put_locked_free);
873 }
874
875 static struct cfs_hash_ops jobid_hash_ops = {
876         .hs_hash        = jobid_hashfn,
877         .hs_keycmp      = jobid_keycmp,
878         .hs_key         = jobid_key,
879         .hs_object      = jobid_object,
880         .hs_get         = jobid_get,
881         .hs_put         = jobid_put_locked,
882         .hs_put_locked  = jobid_put_locked,
883 };
884
885 /**
886  * Generate the job identifier string for this process for tracking purposes.
887  *
888  * Fill in @jobid string based on the value of obd_jobid_var:
889  * JOBSTATS_DISABLE:      none
890  * JOBSTATS_NODELOCAL:    content of obd_jobid_name (jobid_interpret_string())
891  * JOBSTATS_PROCNAME_UID: process name/UID
892  * JOBSTATS_SESSION       per-session value set by
893  *                            /sys/fs/lustre/jobid_this_session
894  * anything else:         look up obd_jobid_var in the processes environment
895  *
896  * Return -ve error number, 0 on success.
897  */
898 int lustre_get_jobid(char *jobid, size_t joblen)
899 {
900         char id[LUSTRE_JOBID_SIZE] = "";
901         int len = min_t(int, joblen, LUSTRE_JOBID_SIZE);
902         int rc = 0;
903         ENTRY;
904
905         if (unlikely(joblen < 2)) {
906                 if (joblen == 1)
907                         jobid[0] = '\0';
908                 RETURN(-EINVAL);
909         }
910
911         if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) {
912                 /* Jobstats isn't enabled */
913                 memset(jobid, 0, joblen);
914                 RETURN(0);
915         }
916
917         if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
918                 /* Whole node dedicated to single job */
919                 rc = jobid_interpret_string(obd_jobid_name, id, len);
920         } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
921                 rc = jobid_interpret_string("%e.%u", id, len);
922         } else if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0 ||
923                    jobid_name_is_valid(current->comm)) {
924                 /*
925                  * per-process jobid wanted, either from environment or from
926                  * per-session setting.
927                  * If obd_jobid_name contains "%j" or if getting the per-process
928                  * jobid directly fails, fall back to using obd_jobid_name.
929                  */
930                 rc = -EAGAIN;
931                 if (!strnstr(obd_jobid_name, "%j", joblen))
932                         rc = jobid_get_from_cache(id, len);
933
934                 /* fall back to jobid_name if jobid_var not available */
935                 if (rc < 0) {
936                         int rc2 = jobid_interpret_string(obd_jobid_name,
937                                                          id, len);
938                         if (!rc2)
939                                 rc = 0;
940                 }
941         }
942
943         memcpy(jobid, id, len);
944         RETURN(rc);
945 }
946 EXPORT_SYMBOL(lustre_get_jobid);
947
948 /*
949  * lustre_jobid_clear
950  *
951  * Search cache for JobID given by @find_jobid.
952  * If any entries in the hash table match the value, they are removed
953  */
954 void lustre_jobid_clear(const char *find_jobid)
955 {
956         char jobid[LUSTRE_JOBID_SIZE];
957         char *end;
958
959         if (jobid_hash == NULL)
960                 return;
961
962         strscpy(jobid, find_jobid, sizeof(jobid));
963         /* trim \n off the end of the incoming jobid */
964         end = strchr(jobid, '\n');
965         if (end && *end == '\n')
966                 *end = '\0';
967
968         CDEBUG(D_INFO, "Clearing Jobid: %s\n", jobid);
969         cfs_hash_cond_del(jobid_hash, jobid_should_free_item, jobid);
970
971         CDEBUG(D_INFO, "%d items remain in jobID table\n",
972                atomic_read(&jobid_hash->hs_count));
973 }