Whamcloud - gitweb
b=3031
[fs/lustre-release.git] / lustre / llite / llite_gns.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004, 2005 Cluster File Systems, Inc.
5  *
6  * Author: Phil Schwan <phil@clusterfs.com>
7  * Author: Oleg Drokin <green@clusterfs.com>
8  * Author: Yury Umanets <yury@clusterfs.com>
9  * Review: Nikita Danilov <nikita@clusterfs.com>
10  *
11  *   This file is part of Lustre, http://www.lustre.org.
12  *
13  *   Lustre is free software; you can redistribute it and/or
14  *   modify it under the terms of version 2 of the GNU General Public
15  *   License as published by the Free Software Foundation.
16  *
17  *   Lustre is distributed in the hope that it will be useful,
18  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  *   GNU General Public License for more details.
21  *
22  *   You should have received a copy of the GNU General Public License
23  *   along with Lustre; if not, write to the Free Software
24  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #define DEBUG_SUBSYSTEM S_LLITE
28
29 #include <linux/fs.h>
30 #include <linux/version.h>
31 #include <asm/uaccess.h>
32 #include <linux/file.h>
33 #include <linux/kmod.h>
34
35 #include <linux/lustre_lite.h>
36 #include "llite_internal.h"
37
/*
 * list of ll_sb_info structures (linked through ->ll_gns_sbi_head) queued by
 * ll_gns_timer_callback() and drained by the GNS control thread.
 */
static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
/* protects @gns_sbi_list. */
static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
/* GNS control thread state: ->t_flags (SVC_*) and ->t_ctl_waitq are used. */
static struct ptlrpc_thread gns_thread;
/* completions used for the start/stop handshake with the control thread. */
static struct ll_gns_ctl gns_ctl;
42
43 /*
44  * waits until passed dentry gets mountpoint or timeout and attempts are
45  * exhausted. Returns 1 if dentry became mountpoint and 0 otherwise.
46  */
47 static int
48 ll_gns_wait_for_mount(struct dentry *dentry,
49                       int timeout, int tries)
50 {
51         struct l_wait_info lwi;
52         struct ll_sb_info *sbi;
53         ENTRY;
54
55         LASSERT(dentry != NULL);
56         LASSERT(!IS_ERR(dentry));
57         sbi = ll_s2sbi(dentry->d_sb);
58         
59         lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
60         for (; !d_mountpoint(dentry) && tries > 0; tries--)
61                 l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
62
63         if (d_mountpoint(dentry)) {
64                 spin_lock(&sbi->ll_gns_lock);
65                 sbi->ll_gns_state = LL_GNS_FINISHED;
66                 spin_unlock(&sbi->ll_gns_lock);
67                 RETURN(0);
68         }
69         RETURN(-ETIME);
70 }
71
72 /*
73  * tries to mount the mount object under passed @dentry. In the case of success
74  * @dentry will become mount point and 0 will be returned. Error code will be
75  * returned otherwise.
76  */
77 int
78 ll_gns_mount_object(struct dentry *dentry, struct vfsmount *mnt)
79 {
80         char *path, *pathpage, *datapage, *argv[4];
81         struct file *mntinfo_fd = NULL;
82         int cleanup_phase = 0, rc = 0;
83         struct ll_sb_info *sbi;
84         struct dentry *dchild;
85         ENTRY;
86
87         LASSERT(dentry->d_inode != NULL);
88
89         if (!S_ISDIR(dentry->d_inode->i_mode))
90                 RETURN(-EINVAL);
91
92         sbi = ll_i2sbi(dentry->d_inode);
93         LASSERT(sbi != NULL);
94
95         spin_lock(&sbi->ll_gns_lock);
96
97         if (sbi->ll_gns_state == LL_GNS_DISABLED) {
98                 spin_unlock(&sbi->ll_gns_lock);
99                 RETURN(-EINVAL);
100         }
101         
102         /* 
103          * another thead is in progress or just finished mounting the
104          * dentry. Handling that.
105          */
106         if (sbi->ll_gns_state == LL_GNS_MOUNTING ||
107             sbi->ll_gns_state == LL_GNS_FINISHED) {
108                 /* 
109                  * check if another thread is trying to mount some GNS dentry
110                  * too. Letting it know that we busy and make ll_lookup_it() to
111                  * restart syscall and try again later.
112                  */
113                 spin_unlock(&sbi->ll_gns_lock);
114                 RETURN(-EAGAIN);
115         }
116         LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
117
118         if (mnt == NULL) {
119                 CERROR("suid directory found, but no "
120                        "vfsmount available.\n");
121                 RETURN(-EINVAL);
122         }
123
124         CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
125
126         /* mounting started */
127         sbi->ll_gns_state = LL_GNS_MOUNTING;
128         spin_unlock(&sbi->ll_gns_lock);
129
130         /* we need to build an absolute pathname to pass to mount */
131         pathpage = (char *)__get_free_page(GFP_KERNEL);
132         if (!pathpage)
133                 GOTO(cleanup, rc = -ENOMEM);
134         cleanup_phase = 1;
135
136         /* getting @dentry path stored in @pathpage. */
137         path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
138         if (IS_ERR(path)) {
139                 CERROR("can't build mount object path, err %d\n",
140                        (int)PTR_ERR(dchild));
141                 GOTO(cleanup, rc = PTR_ERR(dchild));
142         }
143
144         /* synchronizing with possible /proc/fs/...write */
145         down(&sbi->ll_gns_sem);
146         
147         /* 
148          * mount object name is taken from sbi, where it is set in mount time or
149          * via /proc/fs... tunable. It may be ".mntinfo" or so.
150          */
151
152         /* 
153          * recursive lookup with trying to mount SUID bit marked directories on
154          * the way is not possible here, as lookup_one_len() does not pass @nd
155          * to ->lookup() and this is checked in ll_lookup_it(). So, do not
156          * handle possible -EAGAIN here.
157          */
158         dchild = ll_lookup_one_len(sbi->ll_gns_oname, dentry,
159                                    strlen(sbi->ll_gns_oname));
160         up(&sbi->ll_gns_sem);
161
162         cleanup_phase = 2;
163         
164         if (IS_ERR(dchild)) {
165                 rc = PTR_ERR(dchild);
166                 CERROR("can't find mount object %*s/%*s err = %d.\n",
167                        (int)dentry->d_name.len, dentry->d_name.name,
168                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
169                        rc);
170                 GOTO(cleanup, rc);
171         }
172
173         /* mount object is not found */
174         if (!dchild->d_inode)
175                 GOTO(cleanup, rc = -ENOENT);
176
177         /* check if found child is regular file */
178         if (!S_ISREG(dchild->d_inode->i_mode))
179                 GOTO(cleanup, rc = -EOPNOTSUPP);
180
181         mntget(mnt);
182
183         /* ok, mount object if found, opening it. */
184         mntinfo_fd = dentry_open(dchild, mnt, 0);
185         if (IS_ERR(mntinfo_fd)) {
186                 CERROR("can't open mount object %*s/%*s err = %d.\n",
187                        (int)dentry->d_name.len, dentry->d_name.name,
188                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
189                        (int)PTR_ERR(mntinfo_fd));
190                 mntput(mnt);
191                 GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
192         }
193         cleanup_phase = 3;
194
195         if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE - 1) {
196                 CERROR("mount object %*s/%*s is too big (%Ld)\n",
197                        (int)dentry->d_name.len, dentry->d_name.name,
198                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
199                        mntinfo_fd->f_dentry->d_inode->i_size);
200                 GOTO(cleanup, rc = -EFBIG);
201         }
202
203         datapage = (char *)__get_free_page(GFP_KERNEL);
204         if (!datapage)
205                 GOTO(cleanup, rc = -ENOMEM);
206
207         cleanup_phase = 4;
208         
209         /* read data from mount object. */
210         rc = kernel_read(mntinfo_fd, 0, datapage, PAGE_SIZE - 1);
211         if (rc < 0) {
212                 CERROR("can't read mount object %*s/%*s data, err %d\n",
213                        (int)dentry->d_name.len, dentry->d_name.name,
214                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
215                        rc);
216                 GOTO(cleanup, rc);
217         }
218
219         /* no data in mount object? */
220         if (rc == 0) {
221                 CERROR("mount object %*s/%*s is empty?\n",
222                        (int)dentry->d_name.len, dentry->d_name.name,
223                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname);
224                 GOTO(cleanup, rc);
225         }
226
227         datapage[rc] = '\0';
228         fput(mntinfo_fd);
229         mntinfo_fd = NULL;
230         dchild = NULL;
231
232         /* synchronizing with possible /proc/fs/...write */
233         down(&sbi->ll_gns_sem);
234
235         /*
236          * upcall is initialized in mount time or via /proc/fs/... tuneable and
237          * may be /usr/lib/lustre/gns-upcall.sh
238          */
239         argv[0] = sbi->ll_gns_upcall;
240         argv[1] = datapage;
241         argv[2] = path;
242         argv[3] = NULL;
243         
244         up(&sbi->ll_gns_sem);
245
246         rc = USERMODEHELPER(argv[0], argv, NULL);
247         if (rc) {
248                 CERROR("failed to call GNS upcall %s, err = %d\n",
249                        sbi->ll_gns_upcall, rc);
250                 GOTO(cleanup, rc);
251         }
252
253         /*
254          * wait for mount completion. This is actually not needed, because
255          * USERMODEHELPER() returns only when usermode process finishes. But we
256          * doing this just for case USERMODEHELPER() semantics will be changed
257          * or usermode upcall program will start mounting in backgound and
258          * return instantly. --umka
259          */
260         rc = ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS);
261         if (rc == 0) {
262                 struct dentry *rdentry;
263                 struct vfsmount *rmnt;
264                 
265                 /* mount is successful */
266                 LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
267
268                 rmnt = mntget(mnt);
269                 rdentry = dget(dentry);
270                 
271                 if (follow_down(&rmnt, &rdentry)) {
272                         /* 
273                          * registering new mount in GNS mounts list and thus
274                          * make it accessible from GNS control thread.
275                          */
276                         spin_lock(&dcache_lock);
277                         LASSERT(list_empty(&rmnt->mnt_lustre_list));
278                         list_add_tail(&rmnt->mnt_lustre_list,
279                                       &sbi->ll_mnt_list);
280                         spin_unlock(&dcache_lock);
281                         rmnt->mnt_last_used = jiffies;
282                         mntput(rmnt);
283                         dput(rdentry);
284                 } else {
285                         mntput(mnt);
286                         dput(dentry);
287                 }
288         } else {
289                 CERROR("usermode upcall %s failed to mount %s, err %d\n",
290                        sbi->ll_gns_upcall, path, rc);
291         }
292                 
293         EXIT;
294 cleanup:
295         switch (cleanup_phase) {
296         case 4:
297                 free_page((unsigned long)datapage);
298         case 3:
299                 if (mntinfo_fd != NULL)
300                         fput(mntinfo_fd);
301         case 2:
302                 if (dchild != NULL)
303                         dput(dchild);
304         case 1:
305                 free_page((unsigned long)pathpage);
306         case 0:
307                 spin_lock(&sbi->ll_gns_lock);
308                 sbi->ll_gns_state = LL_GNS_IDLE;
309                 spin_unlock(&sbi->ll_gns_lock);
310         }
311         return rc;
312 }
313
314 /* tries to umount passed @mnt. */
315 int ll_gns_umount_object(struct vfsmount *mnt)
316 {
317         int rc = 0;
318         ENTRY;
319         
320         CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
321         rc = do_umount(mnt, 0);
322         if (rc) {
323                 CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
324                        mnt, rc);
325         }
326         
327         RETURN(rc);
328 }
329
330 int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
331 {
332         struct list_head check_list = LIST_HEAD_INIT(check_list);
333         struct vfsmount *mnt;
334         unsigned long pass;
335         ENTRY;
336
337         spin_lock(&dcache_lock);
338         list_splice_init(&sbi->ll_mnt_list, &check_list);
339
340         /*
341          * walk the list in reverse order, and put them on the front of the sbi
342          * list each iteration; this avoids list-ordering problems if we race
343          * with another gns-mounting thread.
344          */
345         while (!list_empty(&check_list)) {
346                 mnt = list_entry(check_list.prev,
347                                  struct vfsmount,
348                                  mnt_lustre_list);
349
350                 mntget(mnt);
351
352                 list_del_init(&mnt->mnt_lustre_list);
353
354                 list_add(&mnt->mnt_lustre_list,
355                          &sbi->ll_mnt_list);
356
357                 /* check for timeout if needed */
358                 pass = jiffies - mnt->mnt_last_used;
359                 
360                 if (flags == LL_GNS_CHECK &&
361                     pass < sbi->ll_gns_timeout * HZ)
362                 {
363                         mntput(mnt);
364                         continue;
365                 }
366                 spin_unlock(&dcache_lock);
367
368                 /* umounting @mnt */
369                 ll_gns_umount_object(mnt);
370
371                 mntput(mnt);
372                 spin_lock(&dcache_lock);
373         }
374         spin_unlock(&dcache_lock);
375         RETURN(0);
376 }
377
378 /*
379  * GNS timer callback function. It restarts gns timer and wakes up GNS control
380  * thread to process mounts list.
381  */
382 void ll_gns_timer_callback(unsigned long data)
383 {
384         struct ll_sb_info *sbi = (void *)data;
385         ENTRY;
386
387         spin_lock(&gns_lock);
388         if (list_empty(&sbi->ll_gns_sbi_head))
389                 list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
390         spin_unlock(&gns_lock);
391         
392         wake_up(&gns_thread.t_ctl_waitq);
393         mod_timer(&sbi->ll_gns_timer,
394                   jiffies + sbi->ll_gns_tick * HZ);
395 }
396
397 /* this function checks if something new happened to exist in gns list. */
398 static int inline ll_gns_check_event(void)
399 {
400         int rc;
401         
402         spin_lock(&gns_lock);
403         rc = !list_empty(&gns_sbi_list);
404         spin_unlock(&gns_lock);
405
406         return rc;
407 }
408
409 /* should we stop GNS control thread? */
410 static int inline ll_gns_check_stop(void)
411 {
412         mb();
413         return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
414 }
415
416 /* GNS control thread function. */
417 static int ll_gns_thread_main(void *arg)
418 {
419         struct ll_gns_ctl *ctl = arg;
420         unsigned long flags;
421         ENTRY;
422
423         {
424                 char name[sizeof(current->comm)];
425                 snprintf(name, sizeof(name) - 1, "ll_gns");
426                 kportal_daemonize(name);
427         }
428         
429         SIGNAL_MASK_LOCK(current, flags);
430         sigfillset(&current->blocked);
431         RECALC_SIGPENDING;
432         SIGNAL_MASK_UNLOCK(current, flags);
433
434         /*
435          * letting starting function know, that we are ready and control may be
436          * returned.
437          */
438         gns_thread.t_flags = SVC_RUNNING;
439         complete(&ctl->gc_starting);
440
441         while (!ll_gns_check_stop()) {
442                 struct l_wait_info lwi = { 0 };
443
444                 l_wait_event(gns_thread.t_ctl_waitq,
445                              (ll_gns_check_event() ||
446                               ll_gns_check_stop()), &lwi);
447                 
448                 spin_lock(&gns_lock);
449                 while (!list_empty(&gns_sbi_list)) {
450                         struct ll_sb_info *sbi;
451
452                         sbi = list_entry(gns_sbi_list.prev,
453                                          struct ll_sb_info,
454                                          ll_gns_sbi_head);
455                         
456                         list_del_init(&sbi->ll_gns_sbi_head);
457                         spin_unlock(&gns_lock);
458                         ll_gns_check_mounts(sbi, LL_GNS_CHECK);
459                         spin_lock(&gns_lock);
460                 }
461                 spin_unlock(&gns_lock);
462         }
463
464         EXIT;
465         gns_thread.t_flags = SVC_STOPPED;
466
467         /* this is SMP-safe way to finish thread. */
468         complete_and_exit(&ctl->gc_finishing, 0);
469 }
470
471 void ll_gns_add_timer(struct ll_sb_info *sbi)
472 {
473         mod_timer(&sbi->ll_gns_timer,
474                   jiffies + sbi->ll_gns_tick * HZ);
475 }
476
477 void ll_gns_del_timer(struct ll_sb_info *sbi)
478 {
479         del_timer(&sbi->ll_gns_timer);
480 }
481
482 /*
483  * starts GNS control thread and waits for a signal it is up and work may be
484  * continued.
485  */
486 int ll_gns_start_thread(void)
487 {
488         int rc;
489         ENTRY;
490
491         LASSERT(gns_thread.t_flags == 0);
492         init_completion(&gns_ctl.gc_starting);
493         init_completion(&gns_ctl.gc_finishing);
494         init_waitqueue_head(&gns_thread.t_ctl_waitq);
495         
496         rc = kernel_thread(ll_gns_thread_main, &gns_ctl,
497                            (CLONE_VM | CLONE_FILES));
498         if (rc < 0) {
499                 CERROR("cannot start GNS control thread, "
500                        "err = %d\n", rc);
501                 RETURN(rc);
502         }
503         wait_for_completion(&gns_ctl.gc_starting);
504         LASSERT(gns_thread.t_flags == SVC_RUNNING);
505         RETURN(0);
506 }
507
508 /* stops GNS control thread and waits its actual stop. */
509 void ll_gns_stop_thread(void)
510 {
511         ENTRY;
512         gns_thread.t_flags = SVC_STOPPING;
513         wake_up(&gns_thread.t_ctl_waitq);
514         wait_for_completion(&gns_ctl.gc_finishing);
515         LASSERT(gns_thread.t_flags == SVC_STOPPED);
516         gns_thread.t_flags = 0;
517         EXIT;
518 }