1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2004 Cluster File Systems, Inc.
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #define DEBUG_SUBSYSTEM S_MDS
24 #include <linux/config.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
28 #include <linux/kmod.h>
29 #include <linux/string.h>
30 #include <linux/stat.h>
31 #include <linux/errno.h>
32 #include <linux/version.h>
33 #include <linux/unistd.h>
35 #include <asm/system.h>
36 #include <asm/uaccess.h>
39 #include <linux/stat.h>
40 #include <asm/uaccess.h>
41 #include <linux/slab.h>
42 #include <asm/segment.h>
44 #include <libcfs/list.h>
45 #include <linux/obd_support.h>
46 #include <linux/lustre_lib.h>
47 #include <linux/lustre_mds.h>
48 #include "mds_internal.h"
/*
 * State bits stored in the ge_flags field of a group hash entry.
 *
 *   NEW        - tested by mds_get_group_entry(), which turns it into
 *                ACQUIRING before invoking the upcall
 *   ACQUIRING  - an upcall for this entry is outstanding
 *   INVALID    - the entry is in error and must not be used
 *   EXPIRED    - the entry has timed out or was flushed
 *
 * An entry with no bit set (ge_flags == 0) is VALID.
 */
#define GRP_HASH_NEW            0x1
#define GRP_HASH_ACQUIRING      0x2
#define GRP_HASH_INVALID        0x4
#define GRP_HASH_EXPIRED        0x8

/* Predicates: nonzero when the named bit is set; VALID means no bit is set. */
#define GRP_IS_NEW(i)           ((i)->ge_flags & GRP_HASH_NEW)
#define GRP_IS_INVALID(i)       ((i)->ge_flags & GRP_HASH_INVALID)
#define GRP_IS_ACQUIRING(i)     ((i)->ge_flags & GRP_HASH_ACQUIRING)
#define GRP_IS_EXPIRED(i)       ((i)->ge_flags & GRP_HASH_EXPIRED)
#define GRP_IS_VALID(i)         ((i)->ge_flags == 0)

/*
 * Setters. The whole expansion is parenthesized so that the assignment cannot
 * be re-associated with operators surrounding the macro call; each macro
 * evaluates to the updated ge_flags value.
 */
#define GRP_SET_NEW(i)          ((i)->ge_flags |= GRP_HASH_NEW)
#define GRP_SET_INVALID(i)      ((i)->ge_flags |= GRP_HASH_INVALID)
#define GRP_SET_ACQUIRING(i)    ((i)->ge_flags |= GRP_HASH_ACQUIRING)
#define GRP_SET_EXPIRED(i)      ((i)->ge_flags |= GRP_HASH_EXPIRED)
/* SET_VALID wipes every state bit, not just INVALID/EXPIRED. */
#define GRP_SET_VALID(i)        ((i)->ge_flags = 0)

/* Clearers, parenthesized for the same reason as the setters. */
#define GRP_CLEAR_NEW(i)        ((i)->ge_flags &= ~GRP_HASH_NEW)
#define GRP_CLEAR_ACQUIRING(i)  ((i)->ge_flags &= ~GRP_HASH_ACQUIRING)
#define GRP_CLEAR_INVALID(i)    ((i)->ge_flags &= ~GRP_HASH_INVALID)
#define GRP_CLEAR_EXPIRED(i)    ((i)->ge_flags &= ~GRP_HASH_EXPIRED)
73 * We need to share the hash table among the group of MDSs that serve the same
74 * Lustre file system. It might belong in the MDT, but putting it there causes
75 * lprocfs problems, so we make it global to the module. This brings the
76 * limitation that one node cannot run multiple MDSs that serve different
77 * Lustre file systems, but that is probably not a meaningful configuration.
/*
 * The single, module-wide group hash instance. Functions in this file reach
 * it through &_group_hash.
 */
79 static struct mds_grp_hash _group_hash;
/*
 * Accessor for the module-wide group hash.
 * NOTE(review): the function body is not visible in this view; presumably it
 * returns &_group_hash -- confirm.
 */
81 struct mds_grp_hash *__mds_get_global_group_hash()
/*
 * Allocate a group hash entry for @uid.
 *
 * The entry is obtained with OBD_ALLOC(); its hash-list linkage is
 * initialised as an empty list, its reference count is set to 0 and its wait
 * queue head is initialised.
 *
 * NOTE(review): the allocation-failure check, the use of @uid and the return
 * statement are not visible in this view -- confirm against the full file.
 */
86 static struct mds_grp_hash_entry *alloc_entry(uid_t uid)
88 struct mds_grp_hash_entry *entry;
90 OBD_ALLOC(entry, sizeof(*entry));
95 INIT_LIST_HEAD(&entry->ge_hash);
/* a fresh entry starts with no references held */
97 atomic_set(&entry->ge_refcount, 0);
98 init_waitqueue_head(&entry->ge_waitq);
/*
 * Destroy a group hash entry: release its group_info if one is attached,
 * unlink it from its hash list, log the destruction and free the entry
 * itself. The entry must not be dereferenced after this call.
 */
102 /* protected by hash lock */
103 static void free_entry(struct mds_grp_hash_entry *entry)
105 if (entry->ge_group_info)
106 groups_free(entry->ge_group_info);
107 list_del(&entry->ge_hash);
/* log before the memory is handed back, while entry->ge_uid is readable */
108 CDEBUG(D_OTHER, "destroy mds_grp_entry %p for uid %d\n",
109 entry, entry->ge_uid);
110 OBD_FREE(entry, sizeof(*entry));
/* Take one reference on @entry (atomic increment of ge_refcount). */
113 static inline void get_entry(struct mds_grp_hash_entry *entry)
115 atomic_inc(&entry->ge_refcount);
/*
 * Drop one reference on @entry.
 *
 * The conditional below is entered only when this was the last reference
 * AND the entry is flagged INVALID or EXPIRED.
 * NOTE(review): the body of that branch is not visible in this view;
 * presumably it destroys the entry -- confirm.
 */
117 static inline void put_entry(struct mds_grp_hash_entry *entry)
119 if (atomic_dec_and_test(&entry->ge_refcount) &&
120 (GRP_IS_INVALID(entry) || GRP_IS_EXPIRED(entry))) {
/*
 * Decide whether @entry is still usable and, if not, unlink it from its hash
 * list.
 *
 * Visible logic:
 *  - a VALID entry whose ge_expire time is still in the future passes the
 *    first test (the statement taken in that case is not visible here);
 *  - an ACQUIRING entry whose acquire deadline has passed is flagged EXPIRED
 *    and every waiter on ge_waitq is woken;
 *  - otherwise, any entry not already INVALID is flagged EXPIRED;
 *  - the entry is then removed from the hash list with list_del_init(), and
 *    a further step is taken when no references remain (not visible here).
 *
 * Callers test the return value as a boolean.
 * NOTE(review): the return statements are not visible in this view;
 * presumably nonzero means the entry was unlinked -- confirm.
 * NOTE(review): as written, an ACQUIRING entry whose acquire deadline has not
 * yet passed also reaches the else-if branch and is flagged EXPIRED and
 * unlinked -- confirm this is intended.
 */
124 static int check_unlink_entry(struct mds_grp_hash_entry *entry)
126 if (GRP_IS_VALID(entry) &&
127 time_before(jiffies, entry->ge_expire))
130 if (GRP_IS_ACQUIRING(entry) &&
131 time_after(jiffies, entry->ge_acquire_expire)) {
132 GRP_SET_EXPIRED(entry);
133 wake_up_all(&entry->ge_waitq);
134 } else if (!GRP_IS_INVALID(entry)) {
135 GRP_SET_EXPIRED(entry);
/* list_del_init leaves ge_hash as a valid empty list after unlinking */
138 list_del_init(&entry->ge_hash);
139 if (!atomic_read(&entry->ge_refcount))
/*
 * Launch the user-space "getgroups" upcall for entry->ge_uid.
 *
 * The uid is formatted as a decimal string (at most 16 bytes), the helper
 * path is taken from hash->gh_upcall, and the helper is started through
 * USERMODEHELPER() with PATH=/sbin:/usr/sbin in its environment. Afterwards
 * one of two messages is logged: an error that includes rc and points at
 * /proc/fs/lustre/mds/grp_hash_upcall, or a CWARN noting the invocation.
 *
 * NOTE(review): the local declarations, the remaining argv/envp slots, the
 * condition selecting between the two messages and the return statement are
 * not visible in this view.
 */
144 static int refresh_entry(struct mds_grp_hash *hash,
145 struct mds_grp_hash_entry *entry)
153 snprintf(uidstr, 16, "%d", entry->ge_uid);
155 argv[0] = hash->gh_upcall;
160 envp[1] = "PATH=/sbin:/usr/sbin";
163 rc = USERMODEHELPER(argv[0], argv, envp);
165 CERROR("Error invoking getgroups upcall %s %s: %d; check "
166 "/proc/fs/lustre/mds/grp_hash_upcall\n",
167 argv[0], argv[1], rc);
169 CWARN("Invoked upcall %s %s\n",
/*
 * Look up, and if necessary create and populate, the group hash entry for
 * @uid in the module-wide hash.
 *
 * Visible flow:
 *  1. Select the bucket with MDSGRP_HASH_INDEX(uid) and, under gh_lock, walk
 *     it with the deletion-safe iterator. check_unlink_entry() is applied to
 *     every entry visited, and an entry whose ge_uid equals @uid is a match.
 *  2. On a miss the lock is dropped, a new entry is allocated with
 *     alloc_entry() (an error is logged if that fails) and it is added at
 *     the head of the bucket. On a hit the entry is moved to the head of the
 *     bucket.
 *  3. If the entry is NEW, it is switched to ACQUIRING, its acquire deadline
 *     is set to jiffies + gh_acquire_expire * HZ, and refresh_entry() is
 *     called with the lock dropped. The entry can then be switched from
 *     ACQUIRING to INVALID (the guarding condition is not visible here).
 *  4. If the entry is ACQUIRING, the caller sleeps interruptibly on ge_waitq
 *     for up to gh_acquire_expire * HZ with the lock dropped. If the entry is
 *     still ACQUIRING afterwards, the lock is released on a failure path.
 *  5. An INVALID entry is treated as an error and the lock is released.
 *  6. check_unlink_entry() is applied once more; per the existing comments
 *     an expired entry leads to a retry, except that an entry created by
 *     this caller is given a chance to be used once.
 *  7. Otherwise the entry is considered good and the lock is released.
 *
 * NOTE(review): several statements (reference counting, gotos/labels, the
 * return statements and some local declarations) are not visible in this
 * view; @mds is not used by any visible line. Confirm against the full file.
 */
175 struct mds_grp_hash_entry *mds_get_group_entry(struct mds_obd *mds, uid_t uid)
177 struct mds_grp_hash_entry *entry = NULL, *new = NULL, *next;
178 struct mds_grp_hash *hash = &_group_hash;
179 struct list_head *head;
184 head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];
188 spin_lock(&hash->gh_lock);
189 list_for_each_entry_safe(entry, next, head, ge_hash) {
190 /* check invalid & expired items */
191 if (check_unlink_entry(entry))
193 if (entry->ge_uid == uid) {
199 if (!found) { /* didn't found */
201 spin_unlock(&hash->gh_lock);
202 new = alloc_entry(uid);
204 CERROR("fail to alloc entry\n");
209 list_add(&new->ge_hash, head);
217 list_move(&entry->ge_hash, head);
221 /* acquire for new one */
222 if (GRP_IS_NEW(entry)) {
223 GRP_SET_ACQUIRING(entry);
224 GRP_CLEAR_NEW(entry);
225 entry->ge_acquire_expire = jiffies +
226 hash->gh_acquire_expire * HZ;
227 spin_unlock(&hash->gh_lock);
229 rc = refresh_entry(hash, entry);
231 spin_lock(&hash->gh_lock);
233 GRP_CLEAR_ACQUIRING(entry);
234 GRP_SET_INVALID(entry);
240 * someone (and only one) is doing upcall upon this item, just wait it
243 if (GRP_IS_ACQUIRING(entry)) {
244 init_waitqueue_entry(&wait, current);
245 add_wait_queue(&entry->ge_waitq, &wait);
246 set_current_state(TASK_INTERRUPTIBLE);
247 spin_unlock(&hash->gh_lock);
249 schedule_timeout(hash->gh_acquire_expire * HZ);
251 spin_lock(&hash->gh_lock);
252 remove_wait_queue(&entry->ge_waitq, &wait);
253 if (GRP_IS_ACQUIRING(entry)) {
254 /* we're interrupted or upcall failed
258 spin_unlock(&hash->gh_lock);
264 /* invalid means error, don't need to try again */
265 if (GRP_IS_INVALID(entry)) {
267 spin_unlock(&hash->gh_lock);
272 * check expired. We can't refresh the existed one because some memory
273 * might be shared by multiple processes.
275 if (check_unlink_entry(entry)) {
277 * if expired, try again. but if this entry is created by me but
278 * too quickly turn to expired without any error, should at
279 * least give a chance to use it once.
283 spin_unlock(&hash->gh_lock);
289 /* Now we know it's good */
290 spin_unlock(&hash->gh_lock);
/*
 * Release a reference previously obtained on a group hash entry.
 *
 * Under gh_lock of the module-wide hash, asserts that the entry still has at
 * least one reference outstanding.
 * NOTE(review): the statement between the assertion and the unlock is not
 * visible in this view (presumably the reference drop), nor is any handling
 * of a NULL @entry; @mds is not used by any visible line -- confirm.
 */
294 void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry)
296 struct mds_grp_hash *hash = &_group_hash;
304 spin_lock(&hash->gh_lock);
305 LASSERT(atomic_read(&entry->ge_refcount) > 0);
307 spin_unlock(&hash->gh_lock);
/*
 * Build a group_info from a caller-supplied gid array and attach it to
 * @entry.
 *
 * @ngroups is compared against NGROUPS_MAX, and an error is logged for an
 * excessive count. On kernels older than 2.6.4 it is additionally compared
 * against NGROUPS (the action taken is not visible here). A group_info sized
 * for @ngroups is then allocated with groups_alloc(), filled from @groups
 * with groups_from_buffer() and stored in entry->ge_group_info.
 *
 * NOTE(review): the error returns, the allocation-failure test and the final
 * return statement are not visible in this view -- confirm the exact codes.
 */
311 static int entry_set_group_info(struct mds_grp_hash_entry *entry,
312 __u32 ngroups, gid_t *groups)
314 struct group_info *ginfo;
317 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
318 if (ngroups > NGROUPS)
322 if (ngroups > NGROUPS_MAX) {
323 CERROR("too many (%d) supp groups\n", ngroups);
327 ginfo = groups_alloc(ngroups);
329 CERROR("can't alloc group_info for %d groups\n", ngroups);
332 groups_from_buffer(ginfo, groups);
334 entry->ge_group_info = ginfo;
/*
 * Handle the reply ("downcall") for a group lookup of @uid.
 *
 * Visible flow:
 *  1. Under gh_lock, search the bucket chosen by MDSGRP_HASH_INDEX(uid) for
 *     an entry whose ge_uid equals @uid. A miss is tolerated (see the
 *     existing comment) and the lock is released.
 *  2. One path flags the entry INVALID and leaves with rc = -EINVAL.
 *     NOTE(review): its guarding condition is not visible here; presumably
 *     it tests @err -- confirm.
 *  3. An entry that is not ACQUIRING, or that is INVALID or EXPIRED, is
 *     reported as stale and rejected with -EINVAL.
 *  4. Otherwise a reference is taken so that the lock can be dropped around
 *     entry_set_group_info(); the reference is dropped again once the lock
 *     has been retaken.
 *  5. One path flags the entry INVALID and unlinks it from the hash list.
 *     NOTE(review): presumably this is the entry_set_group_info() failure
 *     path -- confirm.
 *  6. On success the acquisition time is recorded, ge_expire is set to
 *     jiffies + gh_entry_expire * HZ, and the entry is marked VALID, which
 *     clears every state bit including ACQUIRING.
 *  7. All waiters on ge_waitq are woken and the lock is released.
 *
 * NOTE(review): the labels, the return statement and some conditions are not
 * visible in this view -- confirm against the full file.
 */
338 int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups)
340 struct mds_grp_hash *hash = &_group_hash;
341 struct mds_grp_hash_entry *entry = NULL;
342 struct list_head *head;
343 int found = 0, rc = 0;
348 head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];
350 spin_lock(&hash->gh_lock);
351 list_for_each_entry(entry, head, ge_hash) {
352 if (entry->ge_uid == uid) {
358 /* haven't found, it's possible */
359 spin_unlock(&hash->gh_lock);
363 GRP_SET_INVALID(entry);
364 GOTO(out, rc = -EINVAL);
367 if (!GRP_IS_ACQUIRING(entry) ||
368 GRP_IS_INVALID(entry) ||
369 GRP_IS_EXPIRED(entry)) {
370 CERROR("found a stale entry %p(uid %d) in ioctl\n",
371 entry, entry->ge_uid);
372 GOTO(out, rc = -EINVAL);
375 atomic_inc(&entry->ge_refcount);
376 spin_unlock(&hash->gh_lock);
377 rc = entry_set_group_info(entry, ngroups, groups);
378 spin_lock(&hash->gh_lock);
379 atomic_dec(&entry->ge_refcount);
381 GRP_SET_INVALID(entry);
382 list_del_init(&entry->ge_hash);
385 entry->ge_acquisition_time = LTIME_S(CURRENT_TIME);
386 entry->ge_expire = jiffies + hash->gh_entry_expire * HZ;
387 GRP_SET_VALID(entry);
388 CDEBUG(D_OTHER, "created mds_grp_entry %p for uid %d\n",
389 entry, entry->ge_uid);
391 wake_up_all(&entry->ge_waitq);
392 spin_unlock(&hash->gh_lock);
/*
 * Walk every bucket of @hash under gh_lock and flush its entries.
 *
 * When @force is zero, an entry that still has references is only flagged
 * EXPIRED. For the remaining entries the reference count is asserted to be
 * zero; with a nonzero @force this assertion therefore also applies to
 * entries that are still referenced.
 *
 * NOTE(review): the statements following the EXPIRED flagging and the
 * assertion are not visible in this view; presumably the former skips to the
 * next entry and the latter destroys the entry -- confirm.
 */
396 static void mds_flush_group_hash(struct mds_grp_hash *hash, int force)
398 struct mds_grp_hash_entry *entry, *next;
402 spin_lock(&hash->gh_lock);
403 for (i = 0; i < MDSGRP_HASH_SIZE; i++) {
/* the deletion-safe iterator keeps the walk valid if entry is unlinked */
404 list_for_each_entry_safe(entry, next,
405 &hash->gh_table[i], ge_hash) {
406 if (!force && atomic_read(&entry->ge_refcount)) {
407 GRP_SET_EXPIRED(entry);
410 LASSERT(!atomic_read(&entry->ge_refcount));
414 spin_unlock(&hash->gh_lock);
/*
 * Non-forced flush of the module-wide group hash (force == 0): entries that
 * still hold references are only flagged EXPIRED by mds_flush_group_hash().
 */
418 void mds_group_hash_flush_idle()
420 mds_flush_group_hash(&_group_hash, 0);
/* Return the gh_allow_setgroups setting of the module-wide group hash. */
423 int mds_allow_setgroups(void)
425 return _group_hash.gh_allow_setgroups;
/*
 * Initialise a group hash: its spinlock, every bucket list head, and the
 * default tunables.
 *
 * Defaults: upcall helper "/sbin/l_getgroups", entry lifetime 5 * 60,
 * acquire timeout 5, setgroups disallowed (0). Both time values are
 * multiplied by HZ where they are consumed elsewhere in this file, i.e. they
 * are expressed in seconds.
 *
 * NOTE(review): the assignment of `hash`, the declaration of `i` and the
 * return statement are not visible in this view; presumably `hash` points at
 * the module-wide _group_hash -- confirm.
 */
428 int mds_group_hash_init()
430 struct mds_grp_hash *hash;
436 spin_lock_init(&hash->gh_lock);
437 for (i = 0; i < MDSGRP_HASH_SIZE; i++)
438 INIT_LIST_HEAD(&hash->gh_table[i]);
439 /* set default value, proc tunable */
440 sprintf(hash->gh_upcall, "%s", "/sbin/l_getgroups");
441 hash->gh_entry_expire = 5 * 60;
442 hash->gh_acquire_expire = 5;
443 hash->gh_allow_setgroups = 0;
/*
 * Forced flush of the module-wide group hash (force == 1): every entry is
 * processed by mds_flush_group_hash(), which asserts that none of them is
 * still referenced.
 */
448 void mds_group_hash_cleanup()
450 mds_flush_group_hash(&_group_hash, 1);