Whamcloud - gitweb
- cosmetic changes in mdapi DLD
[fs/lustre-release.git] / lustre / llite / llite_gns.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004, 2005 Cluster File Systems, Inc.
5  *
6  * Author: Phil Schwan <phil@clusterfs.com>
7  * Author: Oleg Drokin <green@clusterfs.com>
8  * Author: Yury Umanets <yury@clusterfs.com>
9  * Review: Nikita Danilov <nikita@clusterfs.com>
10  *
11  *   This file is part of Lustre, http://www.lustre.org.
12  *
13  *   Lustre is free software; you can redistribute it and/or
14  *   modify it under the terms of version 2 of the GNU General Public
15  *   License as published by the Free Software Foundation.
16  *
17  *   Lustre is distributed in the hope that it will be useful,
18  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  *   GNU General Public License for more details.
21  *
22  *   You should have received a copy of the GNU General Public License
23  *   along with Lustre; if not, write to the Free Software
24  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #define DEBUG_SUBSYSTEM S_LLITE
28
29 #include <linux/fs.h>
30 #include <linux/version.h>
31 #include <asm/uaccess.h>
32 #include <linux/file.h>
33 #include <linux/kmod.h>
34
35 #include <linux/lustre_lite.h>
36 #include "llite_internal.h"
37
38 static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
39 static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
40 static struct ptlrpc_thread gns_thread;
41 static struct ll_gns_ctl gns_ctl;
42
43 /*
44  * waits until passed dentry gets mountpoint or timeout and attempts are
45  * exhausted. Returns 1 if dentry became mountpoint and 0 otherwise.
46  */
47 static int
48 ll_gns_wait_for_mount(struct dentry *dentry,
49                       int timeout, int tries)
50 {
51         struct l_wait_info lwi;
52         struct ll_sb_info *sbi;
53         ENTRY;
54
55         LASSERT(dentry != NULL);
56         LASSERT(!IS_ERR(dentry));
57         sbi = ll_s2sbi(dentry->d_sb);
58         
59         lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
60         for (; !d_mountpoint(dentry) && tries > 0; tries--)
61                 l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
62
63         if (d_mountpoint(dentry)) {
64                 spin_lock(&sbi->ll_gns_lock);
65                 sbi->ll_gns_state = LL_GNS_FINISHED;
66                 spin_unlock(&sbi->ll_gns_lock);
67                 RETURN(0);
68         }
69         RETURN(-ETIME);
70 }
71
72 /*
73  * tries to mount the mount object under passed @dentry. In the case of success
74  * @dentry will become mount point and 0 will be returned. Error code will be
75  * returned otherwise.
76  */
77 int
78 ll_gns_mount_object(struct dentry *dentry, struct vfsmount *mnt)
79 {
80         struct ll_dentry_data *lld = dentry->d_fsdata;
81         char *path, *pathpage, *datapage, *argv[4];
82         struct file *mntinfo_fd = NULL;
83         int cleanup_phase = 0, rc = 0;
84         struct ll_sb_info *sbi;
85         struct dentry *dchild;
86         ENTRY;
87
88         if (mnt == NULL) {
89                 CERROR("suid directory found, but no "
90                        "vfsmount available.\n");
91                 RETURN(-EINVAL);
92         }
93
94         CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
95
96         LASSERT(dentry->d_inode != NULL);
97         LASSERT(S_ISDIR(dentry->d_inode->i_mode));
98         LASSERT(lld != NULL);
99         
100         sbi = ll_i2sbi(dentry->d_inode);
101         LASSERT(sbi != NULL);
102
103         /* 
104          * another thead is in progress or just finished mounting the
105          * dentry. Handling that.
106          */
107         spin_lock(&sbi->ll_gns_lock);
108         if (sbi->ll_gns_state == LL_GNS_MOUNTING ||
109             sbi->ll_gns_state == LL_GNS_FINISHED) {
110                 spin_unlock(&sbi->ll_gns_lock);
111                 CDEBUG(D_INODE, "GNS is in progress now, throwing "
112                        "-ERESTARTSYS to restart syscall and let "
113                        "it finish.\n");
114                 RETURN(-ERESTARTSYS);
115         }
116         LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
117
118         spin_lock(&dentry->d_lock);
119         dentry->d_flags |= DCACHE_GNS_MOUNTING;
120         spin_unlock(&dentry->d_lock);
121         
122         /* mounting started */
123         sbi->ll_gns_state = LL_GNS_MOUNTING;
124         spin_unlock(&sbi->ll_gns_lock);
125
126         /* we need to build an absolute pathname to pass to mount */
127         pathpage = (char *)__get_free_page(GFP_KERNEL);
128         if (!pathpage)
129                 GOTO(cleanup, rc = -ENOMEM);
130         cleanup_phase = 1;
131
132         /* getting @dentry path stored in @pathpage. */
133         path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
134         if (IS_ERR(path)) {
135                 CERROR("can't build mount object path, err %d\n",
136                        (int)PTR_ERR(dchild));
137                 GOTO(cleanup, rc = PTR_ERR(dchild));
138         }
139
140         /* synchronizing with possible /proc/fs/...write */
141         down(&sbi->ll_gns_sem);
142         
143         /* 
144          * mount object name is taken from sbi, where it is set in mount time or
145          * via /proc/fs... tunable. It may be ".mntinfo" or so.
146          */
147
148         /* 
149          * FIXME: lookup_one_len() requires dentry->d_inode->i_sem to be locked,
150          * but we can't use ll_lookup_one_len() as this function is called from
151          * different contol paths and some of them take dentry->d_inode->i_sem
152          * and others do not.
153          */
154         dchild = lookup_one_len(sbi->ll_gns_oname, dentry,
155                                 strlen(sbi->ll_gns_oname));
156         up(&sbi->ll_gns_sem);
157
158         cleanup_phase = 2;
159         
160         if (IS_ERR(dchild)) {
161                 rc = PTR_ERR(dchild);
162                 
163                 if (rc == -ERESTARTSYS) {
164                         CDEBUG(D_INODE, "possible endless loop is detected "
165                                "due to mount object is directory marked by "
166                                "SUID bit.\n");
167                         GOTO(cleanup, rc = -ELOOP);
168                 }
169
170                 CERROR("can't find mount object %*s/%*s err = %d.\n",
171                        (int)dentry->d_name.len, dentry->d_name.name,
172                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
173                        rc);
174                 GOTO(cleanup, rc);
175         }
176
177         /* mount object is not found */
178         if (!dchild->d_inode)
179                 GOTO(cleanup, rc = -ENOENT);
180
181         /* check if found child is regular file */
182         if (!S_ISREG(dchild->d_inode->i_mode))
183                 GOTO(cleanup, rc = -EOPNOTSUPP);
184
185         mntget(mnt);
186
187         /* ok, mount object if found, opening it. */
188         mntinfo_fd = dentry_open(dchild, mnt, 0);
189         if (IS_ERR(mntinfo_fd)) {
190                 CERROR("can't open mount object %*s/%*s err = %d.\n",
191                        (int)dentry->d_name.len, dentry->d_name.name,
192                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
193                        (int)PTR_ERR(mntinfo_fd));
194                 mntput(mnt);
195                 GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
196         }
197         cleanup_phase = 3;
198
199         if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE) {
200                 CERROR("mount object %*s/%*s is too big (%Ld)\n",
201                        (int)dentry->d_name.len, dentry->d_name.name,
202                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
203                        mntinfo_fd->f_dentry->d_inode->i_size);
204                 GOTO(cleanup, rc = -EFBIG);
205         }
206
207         datapage = (char *)__get_free_page(GFP_KERNEL);
208         if (!datapage)
209                 GOTO(cleanup, rc = -ENOMEM);
210
211         cleanup_phase = 4;
212         
213         /* read data from mount object. */
214         rc = kernel_read(mntinfo_fd, 0, datapage, PAGE_SIZE);
215         if (rc < 0) {
216                 CERROR("can't read mount object %*s/%*s data, err %d\n",
217                        (int)dentry->d_name.len, dentry->d_name.name,
218                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
219                        rc);
220                 GOTO(cleanup, rc);
221         }
222
223         datapage[PAGE_SIZE - 1] = '\0';
224
225         fput(mntinfo_fd);
226         mntinfo_fd = NULL;
227         dchild = NULL;
228
229         /* synchronizing with possible /proc/fs/...write */
230         down(&sbi->ll_gns_sem);
231
232         /*
233          * upcall is initialized in mount time or via /proc/fs/... tuneable and
234          * may be /usr/lib/lustre/gns-upcall.sh
235          */
236         argv[0] = sbi->ll_gns_upcall;
237         argv[1] = datapage;
238         argv[2] = path;
239         argv[3] = NULL;
240         
241         up(&sbi->ll_gns_sem);
242
243         rc = USERMODEHELPER(argv[0], argv, NULL);
244         if (rc) {
245                 CERROR("failed to call GNS upcall %s, err = %d\n",
246                        sbi->ll_gns_upcall, rc);
247                 GOTO(cleanup, rc);
248         }
249
250         /*
251          * wait for mount completion. This is actually not need, because
252          * USERMODEHELPER() returns only when usermode process finishes. But we
253          * doing this just for case USERMODEHELPER() semantics will be changed
254          * or usermode upcall program will start mounting in backgound and
255          * return instantly. --umka
256          */
257         rc = ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS);
258         complete_all(&sbi->ll_gns_mount_finished);
259         if (rc == 0) {
260                 struct dentry *rdentry;
261                 struct vfsmount *rmnt;
262                 
263                 /* mount is successful */
264                 LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
265
266                 rmnt = mntget(mnt);
267                 rdentry = dget(dentry);
268                 
269                 if (follow_down(&rmnt, &rdentry)) {
270                         /* 
271                          * registering new mount in GNS mounts list and thus
272                          * make it accessible from GNS control thread.
273                          */
274                         spin_lock(&dcache_lock);
275                         LASSERT(list_empty(&rmnt->mnt_lustre_list));
276                         list_add_tail(&rmnt->mnt_lustre_list,
277                                       &sbi->ll_mnt_list);
278                         spin_unlock(&dcache_lock);
279                         rmnt->mnt_last_used = jiffies;
280                         mntput(rmnt);
281                         dput(rdentry);
282                 } else {
283                         mntput(mnt);
284                         dput(dentry);
285                 }
286                 spin_lock(&dentry->d_lock);
287                 dentry->d_flags &= ~DCACHE_GNS_PENDING;
288                 spin_unlock(&dentry->d_lock);
289         } else {
290                 CERROR("usermode upcall %s failed to mount %s, err %d\n",
291                        sbi->ll_gns_upcall, path, rc);
292         }
293                 
294         EXIT;
295 cleanup:
296         switch (cleanup_phase) {
297         case 4:
298                 free_page((unsigned long)datapage);
299         case 3:
300                 if (mntinfo_fd != NULL)
301                         fput(mntinfo_fd);
302         case 2:
303                 if (dchild != NULL)
304                         dput(dchild);
305         case 1:
306                 free_page((unsigned long)pathpage);
307                 
308                 /* 
309                  * waking up all waiters after gns state is set to
310                  * LL_GNS_MOUNTING
311                  */
312                 complete_all(&sbi->ll_gns_mount_finished);
313         case 0:
314                 spin_lock(&sbi->ll_gns_lock);
315                 sbi->ll_gns_state = LL_GNS_IDLE;
316                 spin_unlock(&sbi->ll_gns_lock);
317
318                 spin_lock(&dentry->d_lock);
319                 dentry->d_flags &= ~DCACHE_GNS_MOUNTING;
320                 spin_unlock(&dentry->d_lock);
321         }
322         return rc;
323 }
324
325 /* tries to umount passed @mnt. */
326 int ll_gns_umount_object(struct vfsmount *mnt)
327 {
328         int rc = 0;
329         ENTRY;
330         
331         CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
332         rc = do_umount(mnt, 0);
333         if (rc) {
334                 CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
335                        mnt, rc);
336         }
337         
338         RETURN(rc);
339 }
340
341 int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
342 {
343         struct list_head check_list = LIST_HEAD_INIT(check_list);
344         struct vfsmount *mnt;
345         unsigned long pass;
346         ENTRY;
347
348         spin_lock(&dcache_lock);
349         list_splice_init(&sbi->ll_mnt_list, &check_list);
350
351         /*
352          * walk the list in reverse order, and put them on the front of the sbi
353          * list each iteration; this avoids list-ordering problems if we race
354          * with another gns-mounting thread.
355          */
356         while (!list_empty(&check_list)) {
357                 mnt = list_entry(check_list.prev,
358                                  struct vfsmount,
359                                  mnt_lustre_list);
360
361                 mntget(mnt);
362
363                 list_del_init(&mnt->mnt_lustre_list);
364
365                 list_add(&mnt->mnt_lustre_list,
366                          &sbi->ll_mnt_list);
367
368                 /* check for timeout if needed */
369                 pass = jiffies - mnt->mnt_last_used;
370                 
371                 if (flags == LL_GNS_CHECK &&
372                     pass < sbi->ll_gns_timeout * HZ)
373                 {
374                         mntput(mnt);
375                         continue;
376                 }
377                 spin_unlock(&dcache_lock);
378
379                 /* umounting @mnt */
380                 ll_gns_umount_object(mnt);
381
382                 mntput(mnt);
383                 spin_lock(&dcache_lock);
384         }
385         spin_unlock(&dcache_lock);
386         RETURN(0);
387 }
388
389 /*
390  * GNS timer callback function. It restarts gns timer and wakes up GNS control
391  * thread to process mounts list.
392  */
393 void ll_gns_timer_callback(unsigned long data)
394 {
395         struct ll_sb_info *sbi = (void *)data;
396         ENTRY;
397
398         spin_lock(&gns_lock);
399         if (list_empty(&sbi->ll_gns_sbi_head))
400                 list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
401         spin_unlock(&gns_lock);
402         
403         wake_up(&gns_thread.t_ctl_waitq);
404         mod_timer(&sbi->ll_gns_timer,
405                   jiffies + sbi->ll_gns_tick * HZ);
406 }
407
408 /* this function checks if something new happened to exist in gns list. */
409 static int inline ll_gns_check_event(void)
410 {
411         int rc;
412         
413         spin_lock(&gns_lock);
414         rc = !list_empty(&gns_sbi_list);
415         spin_unlock(&gns_lock);
416
417         return rc;
418 }
419
420 /* should we stop GNS control thread? */
421 static int inline ll_gns_check_stop(void)
422 {
423         mb();
424         return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
425 }
426
427 /* GNS control thread function. */
428 static int ll_gns_thread_main(void *arg)
429 {
430         struct ll_gns_ctl *ctl = arg;
431         unsigned long flags;
432         ENTRY;
433
434         {
435                 char name[sizeof(current->comm)];
436                 snprintf(name, sizeof(name) - 1, "ll_gns");
437                 kportal_daemonize(name);
438         }
439         
440         SIGNAL_MASK_LOCK(current, flags);
441         sigfillset(&current->blocked);
442         RECALC_SIGPENDING;
443         SIGNAL_MASK_UNLOCK(current, flags);
444
445         /*
446          * letting starting function know, that we are ready and control may be
447          * returned.
448          */
449         gns_thread.t_flags = SVC_RUNNING;
450         complete(&ctl->gc_starting);
451
452         while (!ll_gns_check_stop()) {
453                 struct l_wait_info lwi = { 0 };
454
455                 l_wait_event(gns_thread.t_ctl_waitq,
456                              (ll_gns_check_event() ||
457                               ll_gns_check_stop()), &lwi);
458                 
459                 spin_lock(&gns_lock);
460                 while (!list_empty(&gns_sbi_list)) {
461                         struct ll_sb_info *sbi;
462
463                         sbi = list_entry(gns_sbi_list.prev,
464                                          struct ll_sb_info,
465                                          ll_gns_sbi_head);
466                         
467                         list_del_init(&sbi->ll_gns_sbi_head);
468                         spin_unlock(&gns_lock);
469                         ll_gns_check_mounts(sbi, LL_GNS_CHECK);
470                         spin_lock(&gns_lock);
471                 }
472                 spin_unlock(&gns_lock);
473         }
474
475         EXIT;
476         gns_thread.t_flags = SVC_STOPPED;
477
478         /* this is SMP-safe way to finish thread. */
479         complete_and_exit(&ctl->gc_finishing, 0);
480 }
481
482 void ll_gns_add_timer(struct ll_sb_info *sbi)
483 {
484         mod_timer(&sbi->ll_gns_timer,
485                   jiffies + sbi->ll_gns_tick * HZ);
486 }
487
488 void ll_gns_del_timer(struct ll_sb_info *sbi)
489 {
490         del_timer(&sbi->ll_gns_timer);
491 }
492
493 /*
494  * starts GNS control thread and waits for a signal it is up and work may be
495  * continued.
496  */
497 int ll_gns_start_thread(void)
498 {
499         int rc;
500         ENTRY;
501
502         LASSERT(gns_thread.t_flags == 0);
503         init_completion(&gns_ctl.gc_starting);
504         init_completion(&gns_ctl.gc_finishing);
505         init_waitqueue_head(&gns_thread.t_ctl_waitq);
506         
507         rc = kernel_thread(ll_gns_thread_main, &gns_ctl,
508                            (CLONE_VM | CLONE_FILES));
509         if (rc < 0) {
510                 CERROR("cannot start GNS control thread, "
511                        "err = %d\n", rc);
512                 RETURN(rc);
513         }
514         wait_for_completion(&gns_ctl.gc_starting);
515         LASSERT(gns_thread.t_flags == SVC_RUNNING);
516         RETURN(0);
517 }
518
519 /* stops GNS control thread and waits its actual stop. */
520 void ll_gns_stop_thread(void)
521 {
522         ENTRY;
523         gns_thread.t_flags = SVC_STOPPING;
524         wake_up(&gns_thread.t_ctl_waitq);
525         wait_for_completion(&gns_ctl.gc_finishing);
526         LASSERT(gns_thread.t_flags == SVC_STOPPED);
527         gns_thread.t_flags = 0;
528         EXIT;
529 }