X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fobdclass%2Fjobid.c;h=1093f7829a96238809d6b9ca01b1d753c2e187ce;hb=06588e4a22b0ff037eafa1eee5e22521b1626904;hp=633388bebc92eedd73393a2ab6fd97e2db315995;hpb=8ad9453ee66ec1beaff9c8711cf732a861176a6f;p=fs%2Flustre-release.git diff --git a/lustre/obdclass/jobid.c b/lustre/obdclass/jobid.c index 633388b..1093f78 100644 --- a/lustre/obdclass/jobid.c +++ b/lustre/obdclass/jobid.c @@ -33,9 +33,7 @@ #define DEBUG_SUBSYSTEM S_RPC #include -#ifdef HAVE_UIDGID_HEADER #include -#endif #include #include @@ -67,6 +65,317 @@ struct jobid_pid_map { }; /* + * Jobid can be set for a session (see setsid(2)) by writing to + * a sysfs file from any process in that session. + * The jobids are stored in a hash table indexed by the relevant + * struct pid. We periodically look for entries where the pid has + * no PIDTYPE_SID tasks any more, and prune them. This happens within + * 5 seconds of a jobid being added, and every 5 minutes when jobids exist, + * but none are added. + */ +#define JOBID_EXPEDITED_CLEAN (5) +#define JOBID_BACKGROUND_CLEAN (5 * 60) + +struct session_jobid { + struct pid *sj_session; + struct rhash_head sj_linkage; + struct rcu_head sj_rcu; + char sj_jobid[1]; +}; + +static const struct rhashtable_params jobid_params = { + .key_len = sizeof(struct pid *), + .key_offset = offsetof(struct session_jobid, sj_session), + .head_offset = offsetof(struct session_jobid, sj_linkage), +}; + +static struct rhashtable session_jobids; + +/* + * jobid_current must be called with rcu_read_lock held. + * if it returns non-NULL, the string can only be used + * until rcu_read_unlock is called. + */ +char *jobid_current(void) +{ + struct pid *sid = task_session(current); + struct session_jobid *sj; + + sj = rhashtable_lookup_fast(&session_jobids, &sid, jobid_params); + if (sj) + return sj->sj_jobid; + return NULL; +} + +static void jobid_prune_expedite(void); +/* + * jobid_set_current will try to add a new entry + * to the table. If one exists with the same key, the + * jobid will be replaced + */ +int jobid_set_current(char *jobid) +{ + struct pid *sid; + struct session_jobid *sj, *origsj; + int ret; + int len = strlen(jobid); + + sj = kmalloc(sizeof(*sj) + len, GFP_KERNEL); + if (!sj) + return -ENOMEM; + rcu_read_lock(); + sid = task_session(current); + sj->sj_session = get_pid(sid); + strncpy(sj->sj_jobid, jobid, len+1); + origsj = rhashtable_lookup_get_insert_fast(&session_jobids, + &sj->sj_linkage, + jobid_params); + if (origsj == NULL) { + /* successful insert */ + rcu_read_unlock(); + jobid_prune_expedite(); + return 0; + } + + if (IS_ERR(origsj)) { + put_pid(sj->sj_session); + kfree(sj); + rcu_read_unlock(); + return PTR_ERR(origsj); + } + ret = rhashtable_replace_fast(&session_jobids, + &origsj->sj_linkage, + &sj->sj_linkage, + jobid_params); + if (ret) { + put_pid(sj->sj_session); + kfree(sj); + rcu_read_unlock(); + return ret; + } + put_pid(origsj->sj_session); + rcu_read_unlock(); + kfree_rcu(origsj, sj_rcu); + jobid_prune_expedite(); + + return 0; +} + +static void jobid_free(void *vsj, void *arg) +{ + struct session_jobid *sj = vsj; + + put_pid(sj->sj_session); + kfree(sj); +} + +static void jobid_prune(struct work_struct *work); +static DECLARE_DELAYED_WORK(jobid_prune_work, jobid_prune); +static int jobid_prune_expedited; +static void jobid_prune(struct work_struct *work) +{ + int remaining = 0; + struct rhashtable_iter iter; + struct session_jobid *sj; + + jobid_prune_expedited = 0; + rhashtable_walk_enter(&session_jobids, &iter); + rhashtable_walk_start(&iter); + while ((sj = rhashtable_walk_next(&iter)) != NULL) { + if (!hlist_empty(&sj->sj_session->tasks[PIDTYPE_SID])) { + remaining++; + continue; + } + if (rhashtable_remove_fast(&session_jobids, + &sj->sj_linkage, + jobid_params) == 0) { + put_pid(sj->sj_session); + kfree_rcu(sj, sj_rcu); + } + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + if (remaining) + schedule_delayed_work(&jobid_prune_work, + cfs_time_seconds(JOBID_BACKGROUND_CLEAN)); +} + +static void jobid_prune_expedite(void) +{ + if (!jobid_prune_expedited) { + jobid_prune_expedited = 1; + mod_delayed_work(system_wq, &jobid_prune_work, + cfs_time_seconds(JOBID_EXPEDITED_CLEAN)); + } +} + +static int cfs_access_process_vm(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long addr, + void *buf, int len, int write) +{ + /* Just copied from kernel for the kernels which doesn't + * have access_process_vm() exported + */ + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(), + * which is already holding mmap_sem for writes. If some other + * thread gets the write lock in the meantime, this thread will + * block, but at least it won't deadlock on itself. LU-1735 + */ + if (!mmap_read_trylock(mm)) + return -EDEADLK; + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, rc, offset; + void *maddr; + +#if defined(HAVE_GET_USER_PAGES_GUP_FLAGS) + rc = get_user_pages(addr, 1, write ? FOLL_WRITE : 0, &page, + &vma); +#elif defined(HAVE_GET_USER_PAGES_6ARG) + rc = get_user_pages(addr, 1, write, 1, &page, &vma); +#else + rc = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); +#endif + if (rc <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + put_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + mmap_read_unlock(mm); + + return buf - old_buf; +} + +/* Read the environment variable of current process specified by @key. */ +static int cfs_get_environ(const char *key, char *value, int *val_len) +{ + struct mm_struct *mm; + char *buffer; + int buf_len = PAGE_SIZE; + int key_len = strlen(key); + unsigned long addr; + int rc; + bool skip = false; + + ENTRY; + buffer = kmalloc(buf_len, GFP_USER); + if (!buffer) + RETURN(-ENOMEM); + + mm = get_task_mm(current); + if (!mm) { + kfree(buffer); + RETURN(-EINVAL); + } + + addr = mm->env_start; + while (addr < mm->env_end) { + int this_len, retval, scan_len; + char *env_start, *env_end; + + memset(buffer, 0, buf_len); + + this_len = min_t(int, mm->env_end - addr, buf_len); + retval = cfs_access_process_vm(current, mm, addr, buffer, + this_len, 0); + if (retval < 0) + GOTO(out, rc = retval); + else if (retval != this_len) + break; + + addr += retval; + + /* Parse the buffer to find out the specified key/value pair. + * The "key=value" entries are separated by '\0'. + */ + env_start = buffer; + scan_len = this_len; + while (scan_len) { + char *entry; + int entry_len; + + env_end = memscan(env_start, '\0', scan_len); + LASSERT(env_end >= env_start && + env_end <= env_start + scan_len); + + /* The last entry of this buffer cross the buffer + * boundary, reread it in next cycle. + */ + if (unlikely(env_end - env_start == scan_len)) { + /* Just skip the entry larger than page size, + * it can't be jobID env variable. + */ + if (unlikely(scan_len == this_len)) + skip = true; + else + addr -= scan_len; + break; + } else if (unlikely(skip)) { + skip = false; + goto skip; + } + entry = env_start; + entry_len = env_end - env_start; + CDEBUG(D_INFO, "key: %s, entry: %s\n", key, entry); + + /* Key length + length of '=' */ + if (entry_len > key_len + 1 && + entry[key_len] == '=' && + !memcmp(entry, key, key_len)) { + entry += key_len + 1; + entry_len -= key_len + 1; + + /* The 'value' buffer passed in is too small. + * Copy what fits, but return -EOVERFLOW. + */ + if (entry_len >= *val_len) { + memcpy(value, entry, *val_len); + value[*val_len - 1] = 0; + GOTO(out, rc = -EOVERFLOW); + } + + memcpy(value, entry, entry_len); + *val_len = entry_len; + GOTO(out, rc = 0); + } +skip: + scan_len -= (env_end - env_start + 1); + env_start = env_end + 1; + } + } + GOTO(out, rc = -ENOENT); + +out: + mmput(mm); + kfree((void *)buffer); + return rc; +} + +/* * Get jobid of current process by reading the environment variable * stored in between the "env_start" & "env_end" of task struct. * @@ -76,14 +385,13 @@ struct jobid_pid_map { */ int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) { - static bool printed; int rc; rc = cfs_get_environ(jobid_var, jobid, jobid_len); if (!rc) goto out; - if (unlikely(rc == -EOVERFLOW && !printed)) { + if (rc == -EOVERFLOW) { /* For the PBS_JOBID and LOADL_STEP_ID keys (which are * variable length strings instead of just numbers), it * might make sense to keep the unique parts for JobID, @@ -91,16 +399,23 @@ int jobid_get_from_environ(char *jobid_var, char *jobid, int *jobid_len) * larger temp buffer for cfs_get_environ(), then * truncating the string at some separator to fit into * the specified jobid_len. Fix later if needed. */ - LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", - obd_jobid_var, *jobid_len); - printed = true; + static ktime_t printed; + + if (unlikely(ktime_to_ns(printed) == 0 || + ktime_after(ktime_get(), + ktime_add_ns(printed, + 3600ULL * 24 * NSEC_PER_SEC)))) { + LCONSOLE_WARN("jobid: '%s' value too large (%d)\n", + obd_jobid_var, *jobid_len); + printed = ktime_get(); + } + rc = 0; - } - if (rc) { - CDEBUG((rc == -ENOENT || rc == -EINVAL || - rc == -EDEADLK) ? D_INFO : D_ERROR, - "jobid: get '%s' failed: rc = %d\n", - obd_jobid_var, rc); + } else { + CDEBUG_LIMIT((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "jobid: get '%s' failed: rc = %d\n", + obd_jobid_var, rc); } out: @@ -126,10 +441,14 @@ static int jobid_should_free_item(void *obj, void *data) if (obj == NULL) return 0; + if (jobid == NULL) { + WARN_ON_ONCE(atomic_read(&pidmap->jp_refcount) != 1); + return 1; + } + spin_lock(&pidmap->jp_lock); - if (jobid == NULL) - rc = 1; - else if (jobid[0] == '\0') + /* prevent newly inserted items from deleting */ + if (jobid[0] == '\0' && atomic_read(&pidmap->jp_refcount) == 1) rc = 1; else if (ktime_get_real_seconds() - pidmap->jp_time > DELETE_INTERVAL) rc = 1; @@ -168,8 +487,10 @@ static bool jobid_name_is_valid(char *jobid) /* * jobid_get_from_cache() * - * Returns contents of jobid_var from process environment for current PID. - * This will be cached for some time to avoid overhead scanning environment. + * Returns contents of jobid_var from process environment for current PID, + * or from the per-session jobid table. + * Values fetch from process environment will be cached for some time to avoid + * the overhead of scanning the environment. * * Return: -ENOMEM if allocating a new pidmap fails * -ENOENT if no entry could be found @@ -179,12 +500,27 @@ static int jobid_get_from_cache(char *jobid, size_t joblen) { static time64_t last_expire; bool expire_cache = false; - pid_t pid = current_pid(); + pid_t pid = current->pid; struct jobid_pid_map *pidmap = NULL; time64_t now = ktime_get_real_seconds(); int rc = 0; ENTRY; + if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0) { + char *jid; + + rcu_read_lock(); + jid = jobid_current(); + if (jid) { + strlcpy(jobid, jid, joblen); + joblen = strlen(jobid); + } else { + rc = -ENOENT; + } + rcu_read_unlock(); + GOTO(out, rc); + } + LASSERT(jobid_hash != NULL); /* scan hash periodically to remove old PID entries from cache */ @@ -213,6 +549,12 @@ static int jobid_get_from_cache(char *jobid, size_t joblen) pidmap->jp_jobid[0] = '\0'; spin_lock_init(&pidmap->jp_lock); INIT_HLIST_NODE(&pidmap->jp_hash); + /* + * @pidmap might be reclaimed just after it is added into + * hash list, init @jp_refcount as 1 to make sure memory + * could be not freed during access. + */ + atomic_set(&pidmap->jp_refcount, 1); /* * Add the newly created map to the hash, on key collision we @@ -226,8 +568,6 @@ static int jobid_get_from_cache(char *jobid, size_t joblen) pid); OBD_FREE_PTR(pidmap); pidmap = pidmap2; - } else { - cfs_hash_get(jobid_hash, &pidmap->jp_hash); } } @@ -290,6 +630,7 @@ out: * %e = executable * %g = gid * %h = hostname + * %H = short hostname * %j = jobid from environment * %p = pid * %u = uid @@ -306,7 +647,7 @@ static int jobid_interpret_string(const char *jobfmt, char *jobid, char c; while ((c = *jobfmt++) && joblen > 1) { - char f; + char f, *p; int l; if (isspace(c)) /* Don't allow embedded spaces */ @@ -316,12 +657,13 @@ static int jobid_interpret_string(const char *jobfmt, char *jobid, *jobid = c; joblen--; jobid++; + *jobid = '\0'; continue; } switch ((f = *jobfmt++)) { case 'e': /* executable name */ - l = snprintf(jobid, joblen, "%s", current_comm()); + l = snprintf(jobid, joblen, "%s", current->comm); break; case 'g': /* group ID */ l = snprintf(jobid, joblen, "%u", @@ -331,13 +673,22 @@ static int jobid_interpret_string(const char *jobfmt, char *jobid, l = snprintf(jobid, joblen, "%s", init_utsname()->nodename); break; + case 'H': /* short hostname. Cut at first dot */ + l = snprintf(jobid, joblen, "%s", + init_utsname()->nodename); + p = strnchr(jobid, joblen, '.'); + if (p) { + *p = '\0'; + l = p - jobid; + } + break; case 'j': /* jobid stored in process environment */ l = jobid_get_from_cache(jobid, joblen); if (l < 0) l = 0; break; case 'p': /* process ID */ - l = snprintf(jobid, joblen, "%u", current_pid()); + l = snprintf(jobid, joblen, "%u", current->pid); break; case 'u': /* user ID */ l = snprintf(jobid, joblen, "%u", @@ -383,8 +734,15 @@ int jobid_cache_init(void) HASH_JOBID_MAX_BITS, HASH_JOBID_BKT_BITS, 0, CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, &jobid_hash_ops, CFS_HASH_DEFAULT); - if (!jobid_hash) + if (!jobid_hash) { rc = -ENOMEM; + } else { + rc = rhashtable_init(&session_jobids, &jobid_params); + if (rc) { + cfs_hash_putref(jobid_hash); + jobid_hash = NULL; + } + } RETURN(rc); } @@ -400,11 +758,16 @@ void jobid_cache_fini(void) jobid_hash = NULL; spin_unlock(&jobid_hash_lock); + cancel_delayed_work_sync(&jobid_prune_work); + if (tmp_hash != NULL) { cfs_hash_cond_del(tmp_hash, jobid_should_free_item, NULL); cfs_hash_putref(tmp_hash); + + rhashtable_free_and_destroy(&session_jobids, jobid_free, NULL); } + EXIT; } EXPORT_SYMBOL(jobid_cache_fini); @@ -484,8 +847,10 @@ static struct cfs_hash_ops jobid_hash_ops = { * * Fill in @jobid string based on the value of obd_jobid_var: * JOBSTATS_DISABLE: none - * JOBSTATS_NODELOCAL: content of obd_jobid_node (jobid_interpret_string()) + * JOBSTATS_NODELOCAL: content of obd_jobid_name (jobid_interpret_string()) * JOBSTATS_PROCNAME_UID: process name/UID + * JOBSTATS_SESSION per-session value set by + * /sys/fs/lustre/jobid_this_session * anything else: look up obd_jobid_var in the processes environment * * Return -ve error number, 0 on success. @@ -509,18 +874,19 @@ int lustre_get_jobid(char *jobid, size_t joblen) rc = jobid_interpret_string(obd_jobid_name, jobid, joblen); } else if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { rc = jobid_interpret_string("%e.%u", jobid, joblen); - } else if (jobid_name_is_valid(current_comm())) { + } else if (strcmp(obd_jobid_var, JOBSTATS_SESSION) == 0 || + jobid_name_is_valid(current->comm)) { /* - * obd_jobid_var holds the jobid environment variable name. - * Skip initial check if obd_jobid_name already uses "%j", - * otherwise try just "%j" first, then fall back to whatever - * is in obd_jobid_name if obd_jobid_var is not found. + * per-process jobid wanted, either from environment or from + * per-session setting. + * If obd_jobid_name contains "%j" or if getting the per-process + * jobid directly fails, fall back to using obd_jobid_name. */ rc = -EAGAIN; if (!strnstr(obd_jobid_name, "%j", joblen)) rc = jobid_get_from_cache(jobid, joblen); - /* fall back to jobid_node if jobid_var not in environment */ + /* fall back to jobid_name if jobid_var not available */ if (rc < 0) { int rc2 = jobid_interpret_string(obd_jobid_name, jobid, joblen);