Whamcloud - gitweb
* updates to HEAD lustre since landing b_port_step on portals
[fs/lustre-release.git] / lustre / mds / mds_groups.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (c) 2004 Cluster File Systems, Inc.
5  *
6  *   This file is part of Lustre, http://www.lustre.org.
7  *
8  *   Lustre is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Lustre is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Lustre; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21
22 #define DEBUG_SUBSYSTEM S_MDS
23
24 #include <linux/config.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/mm.h>
28 #include <linux/kmod.h>
29 #include <linux/string.h>
30 #include <linux/stat.h>
31 #include <linux/errno.h>
32 #include <linux/version.h>
33 #include <linux/unistd.h>
34
35 #include <asm/system.h>
36 #include <asm/uaccess.h>
37
38 #include <linux/fs.h>
39 #include <linux/stat.h>
40 #include <asm/uaccess.h>
41 #include <linux/slab.h>
42 #include <asm/segment.h>
43
44 #include <libcfs/list.h>
45 #include <linux/obd_support.h>
46 #include <linux/lustre_lib.h>
47 #include <linux/lustre_mds.h>
48 #include "mds_internal.h"
49
/* ge_flags bits tracking a hash entry's lifecycle state */
#define GRP_HASH_NEW              0x1   /* just allocated, upcall not yet fired */
#define GRP_HASH_ACQUIRING        0x2   /* upcall in flight, waiters may sleep */
#define GRP_HASH_INVALID          0x4   /* upcall/downcall failed; don't retry */
#define GRP_HASH_EXPIRED          0x8   /* past expiry or forcibly retired */

/* flag tests */
#define GRP_IS_NEW(i)          ((i)->ge_flags & GRP_HASH_NEW)
#define GRP_IS_INVALID(i)      ((i)->ge_flags & GRP_HASH_INVALID)
#define GRP_IS_ACQUIRING(i)    ((i)->ge_flags & GRP_HASH_ACQUIRING)
#define GRP_IS_EXPIRED(i)      ((i)->ge_flags & GRP_HASH_EXPIRED)
/* a valid entry has no flag bits set at all */
#define GRP_IS_VALID(i)        ((i)->ge_flags == 0)

/* flag setters (SET_VALID clears every bit, including ACQUIRING) */
#define GRP_SET_NEW(i)         (i)->ge_flags |= GRP_HASH_NEW
#define GRP_SET_INVALID(i)     (i)->ge_flags |= GRP_HASH_INVALID
#define GRP_SET_ACQUIRING(i)   (i)->ge_flags |= GRP_HASH_ACQUIRING
#define GRP_SET_EXPIRED(i)     (i)->ge_flags |= GRP_HASH_EXPIRED
#define GRP_SET_VALID(i)       (i)->ge_flags = 0

/* flag clearers */
#define GRP_CLEAR_NEW(i)       (i)->ge_flags &= ~GRP_HASH_NEW
#define GRP_CLEAR_ACQUIRING(i) (i)->ge_flags &= ~GRP_HASH_ACQUIRING
#define GRP_CLEAR_INVALID(i)   (i)->ge_flags &= ~GRP_HASH_INVALID
#define GRP_CLEAR_EXPIRED(i)   (i)->ge_flags &= ~GRP_HASH_EXPIRED
71
/*
 * The hash table must be shared among all MDSs serving the same Lustre
 * file system.  It would ideally live in the MDT, but lprocfs makes
 * placing it there awkward, so it is global to the module.  The
 * resulting limitation is that one node cannot run multiple MDSs
 * serving different Lustre file systems, which is unlikely to matter
 * in practice.
 */
static struct mds_grp_hash _group_hash;
79 static struct mds_grp_hash _group_hash;
80
81 struct mds_grp_hash *__mds_get_global_group_hash()
82 {
83         return &_group_hash;
84 }
85
86 static struct mds_grp_hash_entry *alloc_entry(uid_t uid)
87 {
88         struct mds_grp_hash_entry *entry;
89
90         OBD_ALLOC(entry, sizeof(*entry));
91         if (!entry)
92                 return NULL;
93
94         GRP_SET_NEW(entry);
95         INIT_LIST_HEAD(&entry->ge_hash);
96         entry->ge_uid = uid;
97         atomic_set(&entry->ge_refcount, 0);
98         init_waitqueue_head(&entry->ge_waitq);
99         return entry;
100 }
101
102 /* protected by hash lock */
103 static void free_entry(struct mds_grp_hash_entry *entry)
104 {
105         if (entry->ge_group_info)
106                 groups_free(entry->ge_group_info);
107         list_del(&entry->ge_hash);
108         CDEBUG(D_OTHER, "destroy mds_grp_entry %p for uid %d\n",
109                entry, entry->ge_uid);
110         OBD_FREE(entry, sizeof(*entry));
111 }
112
/* take a reference on @entry; callers hold the hash lock */
static inline void get_entry(struct mds_grp_hash_entry *entry)
{
        atomic_inc(&entry->ge_refcount);
}
117 static inline void put_entry(struct mds_grp_hash_entry *entry)
118 {
119         if (atomic_dec_and_test(&entry->ge_refcount) &&
120             (GRP_IS_INVALID(entry) || GRP_IS_EXPIRED(entry))) {
121                 free_entry(entry);
122         }
123 }
124 static int check_unlink_entry(struct mds_grp_hash_entry *entry)
125 {
126         if (GRP_IS_VALID(entry) &&
127             time_before(jiffies, entry->ge_expire))
128                 return 0;
129
130         if (GRP_IS_ACQUIRING(entry) &&
131             time_after(jiffies, entry->ge_acquire_expire)) {
132                 GRP_SET_EXPIRED(entry);
133                 wake_up_all(&entry->ge_waitq);
134         } else if (!GRP_IS_INVALID(entry)) {
135                 GRP_SET_EXPIRED(entry);
136         }
137
138         list_del_init(&entry->ge_hash);
139         if (!atomic_read(&entry->ge_refcount))
140                 free_entry(entry);
141         return 1;
142 }
143
144 static int refresh_entry(struct mds_grp_hash *hash,
145                          struct mds_grp_hash_entry *entry)
146 {
147         char *argv[4];
148         char *envp[3];
149         char uidstr[16];
150         int rc;
151         ENTRY;
152
153         snprintf(uidstr, 16, "%d", entry->ge_uid);
154
155         argv[0] = hash->gh_upcall;
156         argv[1] = uidstr;
157         argv[2] = NULL;
158                                                                                                                         
159         envp[0] = "HOME=/";
160         envp[1] = "PATH=/sbin:/usr/sbin";
161         envp[2] = NULL;
162
163         rc = USERMODEHELPER(argv[0], argv, envp);
164         if (rc < 0) {
165                 CERROR("Error invoking getgroups upcall %s %s: %d; check "
166                        "/proc/fs/lustre/mds/grp_hash_upcall\n",
167                        argv[0], argv[1], rc);
168         } else {
169                 CWARN("Invoked upcall %s %s\n",
170                         argv[0], argv[1]);
171         }
172         RETURN(rc);
173 }
174
/*
 * Look up (or create) the cached supplementary-group entry for @uid
 * and return it with a reference held; release with
 * mds_put_group_entry().
 *
 * A freshly created entry triggers the userspace upcall; an entry in
 * ACQUIRING state is slept on until the downcall completes or the
 * acquire window elapses.  Returns NULL on allocation failure, upcall
 * failure, interrupted/timed-out wait, or an INVALID entry.
 */
struct mds_grp_hash_entry *mds_get_group_entry(struct mds_obd *mds, uid_t uid)
{
        struct mds_grp_hash_entry *entry = NULL, *new = NULL, *next;
        struct mds_grp_hash *hash = &_group_hash;
        struct list_head *head;
        wait_queue_t wait;
        int rc, found;
        ENTRY;

        head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];

find_again:
        found = 0;
        spin_lock(&hash->gh_lock);
        list_for_each_entry_safe(entry, next, head, ge_hash) {
                /* retire invalid & expired items while scanning */
                if (check_unlink_entry(entry))
                        continue;
                if (entry->ge_uid == uid) {
                        found = 1;
                        break;
                }
        }

        if (!found) { /* no entry for this uid yet */
                if (!new) {
                        /* allocate outside the spinlock, then rescan:
                         * another thread may have inserted meanwhile */
                        spin_unlock(&hash->gh_lock);
                        new = alloc_entry(uid);
                        if (!new) {
                                CERROR("fail to alloc entry\n");
                                RETURN(NULL);
                        }
                        goto find_again;
                } else {
                        list_add(&new->ge_hash, head);
                        entry = new;
                }
        } else {
                if (new) {
                        /* lost the insert race; discard our spare */
                        free_entry(new);
                        new = NULL;
                }
                /* move the hit to the chain head (MRU ordering) */
                list_move(&entry->ge_hash, head);
        }
        get_entry(entry);

        /* freshly created entry: fire the upcall to fetch the groups */
        if (GRP_IS_NEW(entry)) {
                GRP_SET_ACQUIRING(entry);
                GRP_CLEAR_NEW(entry);
                entry->ge_acquire_expire = jiffies +
                        hash->gh_acquire_expire * HZ;
                /* drop the lock across the usermode helper launch */
                spin_unlock(&hash->gh_lock);

                rc = refresh_entry(hash, entry);

                spin_lock(&hash->gh_lock);
                if (rc) {
                        GRP_CLEAR_ACQUIRING(entry);
                        GRP_SET_INVALID(entry);
                }
                /* fall through */
        }

        /*
         * someone (and only one) is doing upcall upon this item, just wait it
         * complete
         */
        if (GRP_IS_ACQUIRING(entry)) {
                init_waitqueue_entry(&wait, current);
                add_wait_queue(&entry->ge_waitq, &wait);
                set_current_state(TASK_INTERRUPTIBLE);
                spin_unlock(&hash->gh_lock);

                /* sleep until downcall wakes us or the window passes */
                schedule_timeout(hash->gh_acquire_expire * HZ);

                spin_lock(&hash->gh_lock);
                remove_wait_queue(&entry->ge_waitq, &wait);
                if (GRP_IS_ACQUIRING(entry)) {
                        /* we're interrupted or upcall failed
                         * in the middle
                         */
                        put_entry(entry);
                        spin_unlock(&hash->gh_lock);
                        RETURN(NULL);
                }
                /* fall through */
        }

        /* invalid means error, don't need to try again */
        if (GRP_IS_INVALID(entry)) {
                put_entry(entry);
                spin_unlock(&hash->gh_lock);
                RETURN(NULL);
        }

        /*
         * check expired. We can't refresh the existed one because some memory
         * might be shared by multiple processes.
         */
        if (check_unlink_entry(entry)) {
                /*
                 * if expired, try again. but if this entry is created by me but
                 * too quickly turn to expired without any error, should at
                 * least give a chance to use it once.
                 */
                if (entry != new) {
                        put_entry(entry);
                        spin_unlock(&hash->gh_lock);
                        new = NULL;
                        goto find_again;
                }
        }

        /* Now we know it's good */
        spin_unlock(&hash->gh_lock);
        RETURN(entry);
}
293
294 void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry)
295 {
296         struct mds_grp_hash *hash = &_group_hash;
297         ENTRY;
298
299         if (!entry) {
300                 EXIT;
301                 return;
302         }
303
304         spin_lock(&hash->gh_lock);
305         LASSERT(atomic_read(&entry->ge_refcount) > 0);
306         put_entry(entry);
307         spin_unlock(&hash->gh_lock);
308         EXIT;
309 }
310
/*
 * Allocate a group_info holding @ngroups gids copied from @groups and
 * attach it to @entry.  Returns 0, -EINVAL if @ngroups exceeds the
 * kernel limit, or -ENOMEM on allocation failure.
 */
static int entry_set_group_info(struct mds_grp_hash_entry *entry,
                                __u32 ngroups, gid_t *groups)
{
        struct group_info *ginfo;
        ENTRY;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
        /* pre-2.6.4 kernels cap supplementary groups at NGROUPS */
        if (ngroups > NGROUPS)
                ngroups = NGROUPS;
#endif

        if (ngroups > NGROUPS_MAX) {
                CERROR("too many (%d) supp groups\n", ngroups); 
                RETURN(-EINVAL);
        }

        ginfo = groups_alloc(ngroups);
        if (!ginfo) {
                CERROR("can't alloc group_info for %d groups\n", ngroups);
                RETURN(-ENOMEM);
        }
        groups_from_buffer(ginfo, groups);

        /* NOTE(review): a previous ge_group_info would be overwritten
         * here without groups_free(); looks like this runs at most once
         * per entry (only on the ACQUIRING path) — confirm */
        entry->ge_group_info = ginfo;
        RETURN(0);
}
337
/*
 * Downcall from the userspace upcall helper delivering the
 * supplementary groups for @uid (or @err if the lookup failed).
 * Finds the entry left in ACQUIRING state by mds_get_group_entry(),
 * installs the group list, marks the entry valid with a fresh expiry,
 * and wakes all waiters.  Returns 0 on success or -EINVAL when the
 * entry is missing/stale or the helper reported an error.
 */
int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups)
{
        struct mds_grp_hash *hash = &_group_hash;
        struct mds_grp_hash_entry *entry = NULL;
        struct list_head *head;
        int found = 0, rc = 0;
        ENTRY;

        LASSERT(hash);

        head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];

        spin_lock(&hash->gh_lock);
        list_for_each_entry(entry, head, ge_hash) {
                if (entry->ge_uid == uid) {
                        found = 1;
                        break;
                }
        }
        if (!found) {
                /* entry may have expired or been flushed meanwhile */
                spin_unlock(&hash->gh_lock);
                RETURN(-EINVAL);
        }
        if (err) {
                /* helper reported failure: poison the entry */
                GRP_SET_INVALID(entry);
                GOTO(out, rc = -EINVAL);
        }

        if (!GRP_IS_ACQUIRING(entry) ||
            GRP_IS_INVALID(entry) ||
            GRP_IS_EXPIRED(entry)) {
                CERROR("found a stale entry %p(uid %d) in ioctl\n",
                        entry, entry->ge_uid);
                GOTO(out, rc = -EINVAL);
        }

        /* pin the entry while the lock is dropped for allocation */
        atomic_inc(&entry->ge_refcount);
        spin_unlock(&hash->gh_lock);
        rc = entry_set_group_info(entry, ngroups, groups);
        spin_lock(&hash->gh_lock);
        atomic_dec(&entry->ge_refcount);
        if (rc) {
                GRP_SET_INVALID(entry);
                list_del_init(&entry->ge_hash);
                GOTO(out, rc);
        }
        entry->ge_acquisition_time = LTIME_S(CURRENT_TIME);
        entry->ge_expire = jiffies + hash->gh_entry_expire * HZ;
        /* clears every flag bit, including ACQUIRING */
        GRP_SET_VALID(entry);
        CDEBUG(D_OTHER, "created mds_grp_entry %p for uid %d\n",
               entry, entry->ge_uid);
out:
        /* wake sleepers in mds_get_group_entry() on every exit path */
        wake_up_all(&entry->ge_waitq);
        spin_unlock(&hash->gh_lock);
        RETURN(rc);
}
395
396 static void mds_flush_group_hash(struct mds_grp_hash *hash, int force)
397 {
398         struct mds_grp_hash_entry *entry, *next;
399         int i;
400         ENTRY;
401
402         spin_lock(&hash->gh_lock);
403         for (i = 0; i < MDSGRP_HASH_SIZE; i++) {
404                 list_for_each_entry_safe(entry, next,
405                                          &hash->gh_table[i], ge_hash) {
406                         if (!force && atomic_read(&entry->ge_refcount)) {
407                                 GRP_SET_EXPIRED(entry);
408                                 continue;
409                         }
410                         LASSERT(!atomic_read(&entry->ge_refcount));
411                         free_entry(entry);
412                 }
413         }
414         spin_unlock(&hash->gh_lock);
415         EXIT;
416 }
417
418 void mds_group_hash_flush_idle()
419 {
420         mds_flush_group_hash(&_group_hash, 0);
421 }
422
/* return the gh_allow_setgroups tunable (default 0, set via proc) */
int mds_allow_setgroups(void)
{
        return _group_hash.gh_allow_setgroups;
}
427
428 int mds_group_hash_init()
429 {
430         struct mds_grp_hash *hash;
431         int i;
432         ENTRY;
433
434         hash = &_group_hash;
435
436         spin_lock_init(&hash->gh_lock);
437         for (i = 0; i < MDSGRP_HASH_SIZE; i++)
438                 INIT_LIST_HEAD(&hash->gh_table[i]);
439         /* set default value, proc tunable */
440         sprintf(hash->gh_upcall, "%s", "/sbin/l_getgroups");
441         hash->gh_entry_expire = 5 * 60;
442         hash->gh_acquire_expire = 5;
443         hash->gh_allow_setgroups = 0;
444
445         RETURN(0);
446 }
447
448 void mds_group_hash_cleanup()
449 {
450         mds_flush_group_hash(&_group_hash, 1);
451 }