/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 *  Supplementary groups cache.
 *
 *  Copyright (c) 2004 Cluster File Systems, Inc.
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#define DEBUG_SUBSYSTEM S_SEC

#ifndef AUTOCONF_INCLUDED
#include <linux/config.h>
#endif
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/kmod.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/version.h>
#include <linux/unistd.h>

#include <asm/system.h>
#include <asm/uaccess.h>

#include <linux/fs.h>
#include <linux/slab.h>

#include <obd_support.h>
#include <lustre_lib.h>

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
struct group_info *groups_alloc(int ngroups)
{
        struct group_info *ginfo;

        LASSERT(ngroups <= NGROUPS_SMALL);

        OBD_ALLOC(ginfo, sizeof(*ginfo) + 1 * sizeof(gid_t *));
        if (!ginfo)
                return NULL;
        ginfo->ngroups = ngroups;
        ginfo->nblocks = 1;
        ginfo->blocks[0] = ginfo->small_block;
        atomic_set(&ginfo->usage, 1);

        return ginfo;
}

void groups_free(struct group_info *ginfo)
{
        LASSERT(ginfo->ngroups <= NGROUPS_SMALL);
        LASSERT(ginfo->nblocks == 1);
        LASSERT(ginfo->blocks[0] == ginfo->small_block);

        OBD_FREE(ginfo, sizeof(*ginfo) + 1 * sizeof(gid_t *));
}
#endif

static struct upcall_cache_entry *alloc_entry(__u64 key)
{
        struct upcall_cache_entry *entry;

        OBD_ALLOC(entry, sizeof(*entry));
        if (!entry)
                return NULL;

        UC_CACHE_SET_NEW(entry);
        INIT_LIST_HEAD(&entry->ue_hash);
        entry->ue_key = key;
        atomic_set(&entry->ue_refcount, 0);
        init_waitqueue_head(&entry->ue_waitq);
        return entry;
}

/* protected by hash lock */
static void free_entry(struct upcall_cache_entry *entry)
{
        if (entry->ue_group_info)
                groups_free(entry->ue_group_info);
        list_del(&entry->ue_hash);
        CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n",
               entry, entry->ue_key);
        OBD_FREE(entry, sizeof(*entry));
}

static void get_entry(struct upcall_cache_entry *entry)
{
        atomic_inc(&entry->ue_refcount);
}

static void put_entry(struct upcall_cache_entry *entry)
{
        if (atomic_dec_and_test(&entry->ue_refcount) &&
            (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) {
                free_entry(entry);
        }
}

static int check_unlink_entry(struct upcall_cache_entry *entry)
{
        if (UC_CACHE_IS_VALID(entry) &&
            time_before(jiffies, entry->ue_expire))
                return 0;

        if (UC_CACHE_IS_ACQUIRING(entry)) {
                if (time_before(jiffies, entry->ue_acquire_expire))
                        return 0;

                UC_CACHE_SET_EXPIRED(entry);
                wake_up_all(&entry->ue_waitq);
        } else if (!UC_CACHE_IS_INVALID(entry)) {
                UC_CACHE_SET_EXPIRED(entry);
        }

        list_del_init(&entry->ue_hash);
        if (!atomic_read(&entry->ue_refcount))
                free_entry(entry);
        return 1;
}

static int refresh_entry(struct upcall_cache *hash,
                         struct upcall_cache_entry *entry)
{
        char *argv[4];
        char *envp[3];
        char keystr[16];
        int rc;
        ENTRY;

        snprintf(keystr, 16, LPU64, entry->ue_key);

        CDEBUG(D_INFO, "The groups upcall is: %s\n", hash->uc_upcall);
        argv[0] = hash->uc_upcall;
        argv[1] = hash->uc_name;
        argv[2] = keystr;
        argv[3] = NULL;

        envp[0] = "HOME=/";
        envp[1] = "PATH=/sbin:/usr/sbin";
        envp[2] = NULL;

        rc = USERMODEHELPER(argv[0], argv, envp);
        if (rc < 0) {
                CERROR("%s: error invoking getgroups upcall %s %s %s: rc %d; "
                       "check /proc/fs/lustre/mds/%s/group_upcall\n",
                       hash->uc_name, argv[0], argv[1], argv[2], rc, argv[1]);
        } else {
                CDEBUG(D_HA, "%s: invoked upcall %s %s %s\n", hash->uc_name,
                       argv[0], argv[1], argv[2]);
                rc = 0;
        }
        RETURN(rc);
}

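/*
 * For illustration only: refresh_entry() execs the configured upcall with
 * the cache name and the key rendered as a decimal string.  Assuming the
 * group_upcall tunable points at the stock l_getgroups helper and the
 * cache is named after the MDS, the helper would be invoked roughly as
 *
 *      /usr/sbin/l_getgroups mds-name 500
 *
 * i.e. argv[0] = uc_upcall, argv[1] = uc_name, argv[2] = key (here uid
 * 500), with only HOME and PATH in the environment.  The helper path and
 * cache name above are hypothetical; the real values come from the
 * /proc/fs/lustre/mds/.../group_upcall tunable mentioned in the CERROR.
 */
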
static int entry_set_group_info(struct upcall_cache_entry *entry, __u32 primary,
                                __u32 ngroups, __u32 *groups)
{
        struct group_info *ginfo;
        int i, j;
        ENTRY;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
        if (ngroups > NGROUPS)
                ngroups = NGROUPS;
#endif

        if (ngroups > NGROUPS_MAX) {
                CERROR("using first %d supplementary groups for uid "LPU64"\n",
                       NGROUPS_MAX, entry->ue_key);
                ngroups = NGROUPS_MAX;
        }

        ginfo = groups_alloc(ngroups);
        if (!ginfo) {
                CERROR("uid "LPU64" update can't alloc ginfo for %d groups\n",
                       entry->ue_key, ngroups);
                RETURN(-ENOMEM);
        }
        entry->ue_group_info = ginfo;
        entry->ue_primary = primary;

        for (i = 0; i < ginfo->nblocks; i++) {
                int cp_count = min(NGROUPS_PER_BLOCK, (int)ngroups);
                int off = i * NGROUPS_PER_BLOCK;

                for (j = 0; j < cp_count; j++)
                        ginfo->blocks[i][j] = groups[off + j];

                ngroups -= cp_count;
        }
        RETURN(0);
}

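/*
 * A worked example of the copy loop above, under the kernel's own
 * groups_alloc() (2.6.4+) and pretending NGROUPS_PER_BLOCK is 2 (the real
 * value is PAGE_SIZE/sizeof(gid_t)): for ngroups = 5, groups_alloc()
 * yields nblocks = 3, and the loop copies
 *
 *      block 0: groups[0], groups[1]   (cp_count = 2, ngroups -> 3)
 *      block 1: groups[2], groups[3]   (cp_count = 2, ngroups -> 1)
 *      block 2: groups[4]              (cp_count = 1, ngroups -> 0)
 *
 * so the flat groups[] array is spread across the group_info blocks with
 * no gaps, and ngroups reaches exactly zero on the last block.
 */
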
struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash,
                                                  __u64 key, __u32 primary,
                                                  __u32 ngroups, __u32 *groups)
{
        struct upcall_cache_entry *entry = NULL, *new = NULL, *next;
        struct list_head *head;
        wait_queue_t wait;
        int rc, found;
        ENTRY;

        LASSERT(hash);

        if (strcmp(hash->uc_upcall, "NONE") == 0) {
                new = alloc_entry(key);
                if (!new) {
                        CERROR("failed to allocate entry\n");
                        RETURN(NULL);
                }
                get_entry(new);

                /* We have to sort the groups for 2.6 kernels */
                LASSERT(ngroups <= 2);
                if (ngroups == 2 && groups[1] == -1)
                        ngroups--;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
                /* 2.6 needs the groups array sorted */
                if (ngroups == 2 && groups[0] > groups[1]) {
                        __u32 tmp = groups[1];
                        groups[1] = groups[0];
                        groups[0] = tmp;
                }
#endif
                if (ngroups > 0 && groups[0] == -1) {
                        groups[0] = groups[1];
                        ngroups--;
                }

                rc = entry_set_group_info(new, primary, ngroups, groups);

                /* We can't cache this entry as it only has a subset of
                 * the user's groups, as sent in suppgid1, suppgid2. */
                UC_CACHE_SET_EXPIRED(new);
                RETURN(new);
        }
        head = &hash->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
find_again:
        found = 0;
        spin_lock(&hash->uc_lock);
        list_for_each_entry_safe(entry, next, head, ue_hash) {
                /* check for invalid & expired items */
                if (check_unlink_entry(entry))
                        continue;
                if (entry->ue_key == key) {
                        found = 1;
                        break;
                }
        }

        if (!found) { /* didn't find it */
                if (!new) {
                        spin_unlock(&hash->uc_lock);
                        new = alloc_entry(key);
                        if (!new) {
                                CERROR("failed to allocate entry\n");
                                RETURN(ERR_PTR(-ENOMEM));
                        }
                        goto find_again;
                } else {
                        list_add(&new->ue_hash, head);
                        entry = new;
                }
        } else {
                if (new) {
                        free_entry(new);
                        new = NULL;
                }
                list_move(&entry->ue_hash, head);
        }
        get_entry(entry);

        /* acquire for a new one */
        if (UC_CACHE_IS_NEW(entry)) {
                UC_CACHE_SET_ACQUIRING(entry);
                UC_CACHE_CLEAR_NEW(entry);
                entry->ue_acquire_expire = jiffies + hash->uc_acquire_expire;
                spin_unlock(&hash->uc_lock);
                rc = refresh_entry(hash, entry);
                spin_lock(&hash->uc_lock);
                if (rc < 0) {
                        UC_CACHE_CLEAR_ACQUIRING(entry);
                        UC_CACHE_SET_INVALID(entry);
                }
                /* fall through */
        }
        /* exactly one thread is doing the upcall for this item;
         * just wait for it to complete
         */
        if (UC_CACHE_IS_ACQUIRING(entry)) {
                init_waitqueue_entry(&wait, current);
                add_wait_queue(&entry->ue_waitq, &wait);
                set_current_state(TASK_INTERRUPTIBLE);
                spin_unlock(&hash->uc_lock);

                schedule_timeout(hash->uc_acquire_expire);

                spin_lock(&hash->uc_lock);
                remove_wait_queue(&entry->ue_waitq, &wait);
                if (UC_CACHE_IS_ACQUIRING(entry)) {
                        static unsigned long next;
                        /* we were interrupted or the upcall failed midway */
                        if (time_after(jiffies, next)) {
                                CERROR("acquire timeout exceeded for key "LPU64
                                       "\n", entry->ue_key);
                                next = jiffies + 1800;
                        }
                        put_entry(entry);
                        GOTO(out, entry = ERR_PTR(-EIDRM));
                }
                /* fall through */
        }

        /* invalid means error, don't need to try again */
        if (UC_CACHE_IS_INVALID(entry)) {
                put_entry(entry);
                GOTO(out, entry = ERR_PTR(-EIDRM));
        }

        /* check for expiry.
         * We can't refresh the existing entry in place because its
         * memory might be shared by multiple processes.
         */
        if (check_unlink_entry(entry)) {
                /* if expired, try again.  But if this entry was created
                 * by us and expired too quickly without any error, give
                 * it at least one chance to be used before retrying.
                 */
                if (entry != new) {
                        put_entry(entry);
                        spin_unlock(&hash->uc_lock);
                        new = NULL;
                        goto find_again;
                }
        }

        /* Now we know it's good */
out:
        spin_unlock(&hash->uc_lock);
        RETURN(entry);
}
EXPORT_SYMBOL(upcall_cache_get_entry);

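/*
 * A minimal caller-side sketch (not from this file): a server looks up the
 * supplementary groups of the user behind a request and drops its reference
 * when done.  The cache pointer and uid below are hypothetical.
 *
 *      struct upcall_cache_entry *e;
 *
 *      e = upcall_cache_get_entry(cache, (__u64)uid, primary_gid,
 *                                 2, suppgids);
 *      if (IS_ERR(e))
 *              return PTR_ERR(e);
 *      // ... use e->ue_primary and e->ue_group_info ...
 *      upcall_cache_put_entry(cache, e);
 *
 * Every successful get must be paired with a put (note the "NONE" path can
 * also return NULL on allocation failure); the entry may be freed on the
 * final put once it has been invalidated or has expired.
 */
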
void upcall_cache_put_entry(struct upcall_cache *hash,
                            struct upcall_cache_entry *entry)
{
        ENTRY;

        if (!entry) {
                EXIT;
                return;
        }

        LASSERT(atomic_read(&entry->ue_refcount) > 0);
        spin_lock(&hash->uc_lock);
        put_entry(entry);
        spin_unlock(&hash->uc_lock);
        EXIT;
}
EXPORT_SYMBOL(upcall_cache_put_entry);

int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key,
                          __u32 primary, __u32 ngroups, __u32 *groups)
{
        struct upcall_cache_entry *entry = NULL;
        struct list_head *head;
        int found = 0, rc = 0;
        ENTRY;

        LASSERT(hash);

        head = &hash->uc_hashtable[UC_CACHE_HASH_INDEX(key)];

        spin_lock(&hash->uc_lock);
        list_for_each_entry(entry, head, ue_hash) {
                if (entry->ue_key == key) {
                        found = 1;
                        get_entry(entry);
                        break;
                }
        }

        if (!found) {
                /* entry does not point at a valid item here, so report the
                 * key we searched for rather than dereferencing it */
                CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n",
                       hash->uc_name, key);
                /* not finding it is possible */
                spin_unlock(&hash->uc_lock);
                RETURN(-EINVAL);
        }

        if (err) {
                CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n",
                       hash->uc_name, entry->ue_key, err);
                GOTO(out, rc = -EINVAL);
        }

        if (!UC_CACHE_IS_ACQUIRING(entry)) {
                CDEBUG(D_RPCTRACE, "%s: found uptodate entry %p (key "LPU64")\n",
                       hash->uc_name, entry, entry->ue_key);
                GOTO(out, rc = 0);
        }

        if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) {
                CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n",
                       hash->uc_name, entry, entry->ue_key);
                GOTO(out, rc = -EINVAL);
        }

        spin_unlock(&hash->uc_lock);
        rc = entry_set_group_info(entry, primary, ngroups, groups);
        spin_lock(&hash->uc_lock);
        if (rc)
                GOTO(out, rc);

        entry->ue_expire = jiffies + hash->uc_entry_expire;
        UC_CACHE_SET_VALID(entry);
        CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n",
               hash->uc_name, entry, entry->ue_key);
out:
        if (rc) {
                UC_CACHE_SET_INVALID(entry);
                list_del_init(&entry->ue_hash);
        }
        UC_CACHE_CLEAR_ACQUIRING(entry);
        spin_unlock(&hash->uc_lock);
        wake_up_all(&entry->ue_waitq);
        put_entry(entry);

        RETURN(rc);
}
EXPORT_SYMBOL(upcall_cache_downcall);

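/*
 * A rough sketch of the downcall half of the protocol, for illustration:
 * the userspace helper spawned by refresh_entry() resolves the key's group
 * list and hands it back to the kernel (the CERROR above suggests this
 * arrives through an ioctl path), which ends up here.  The reply structure
 * and field names below are hypothetical.
 *
 *      // in the handler that receives the helper's answer:
 *      rc = upcall_cache_downcall(cache, reply->err, reply->uid,
 *                                 reply->gid, reply->ngroups,
 *                                 reply->groups);
 *
 * On success the entry is marked VALID with a fresh ue_expire; on any
 * error it is marked INVALID and unhashed.  All waiters blocked in
 * upcall_cache_get_entry() are woken either way.
 */
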
static void cache_flush(struct upcall_cache *hash, int force)
{
        struct upcall_cache_entry *entry, *next;
        int i;
        ENTRY;

        spin_lock(&hash->uc_lock);
        for (i = 0; i < UC_CACHE_HASH_SIZE; i++) {
                list_for_each_entry_safe(entry, next,
                                         &hash->uc_hashtable[i], ue_hash) {
                        if (!force && atomic_read(&entry->ue_refcount)) {
                                UC_CACHE_SET_EXPIRED(entry);
                                continue;
                        }
                        LASSERT(!atomic_read(&entry->ue_refcount));
                        free_entry(entry);
                }
        }
        spin_unlock(&hash->uc_lock);
        EXIT;
}

void upcall_cache_flush_idle(struct upcall_cache *cache)
{
        cache_flush(cache, 0);
}
EXPORT_SYMBOL(upcall_cache_flush_idle);

void upcall_cache_flush_all(struct upcall_cache *cache)
{
        cache_flush(cache, 1);
}
EXPORT_SYMBOL(upcall_cache_flush_all);

struct upcall_cache *upcall_cache_init(const char *name)
{
        struct upcall_cache *hash;
        int i;
        ENTRY;

        OBD_ALLOC(hash, sizeof(*hash));
        if (!hash)
                RETURN(ERR_PTR(-ENOMEM));

        spin_lock_init(&hash->uc_lock);
        for (i = 0; i < UC_CACHE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&hash->uc_hashtable[i]);
        strncpy(hash->uc_name, name, sizeof(hash->uc_name) - 1);
        /* set default values; these are proc tunables */
        strcpy(hash->uc_upcall, "NONE");
        hash->uc_entry_expire = 10 * 60 * HZ;
        hash->uc_acquire_expire = 15 * HZ;

        RETURN(hash);
}
EXPORT_SYMBOL(upcall_cache_init);

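/*
 * Lifecycle sketch, assuming a hypothetical per-service setup/teardown
 * pair: the cache is created once, the upcall and expiry defaults above
 * are then adjusted via proc, and everything is torn down with cleanup.
 * The cache name below is illustrative.
 *
 *      struct upcall_cache *cache;
 *
 *      cache = upcall_cache_init("mds-name");
 *      if (IS_ERR(cache))
 *              return PTR_ERR(cache);
 *      // ... serve upcall_cache_get_entry()/put_entry() traffic ...
 *      upcall_cache_cleanup(cache);    // flushes all entries, frees cache
 *
 * upcall_cache_cleanup() calls upcall_cache_flush_all(), so an entry
 * still referenced at teardown would trip the LASSERT in cache_flush().
 */
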
void upcall_cache_cleanup(struct upcall_cache *hash)
{
        if (!hash)
                return;
        upcall_cache_flush_all(hash);
        OBD_FREE(hash, sizeof(*hash));
}
EXPORT_SYMBOL(upcall_cache_cleanup);