Whamcloud - gitweb
current branches now use lnet from HEAD
[fs/lustre-release.git] / lustre / llite / llite_gns.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004, 2005 Cluster File Systems, Inc.
5  *
6  * Author: Phil Schwan <phil@clusterfs.com>
7  * Author: Oleg Drokin <green@clusterfs.com>
8  * Author: Yury Umanets <yury@clusterfs.com>
9  * Review: Nikita Danilov <nikita@clusterfs.com>
10  *
11  *   This file is part of Lustre, http://www.lustre.org.
12  *
13  *   Lustre is free software; you can redistribute it and/or
14  *   modify it under the terms of version 2 of the GNU General Public
15  *   License as published by the Free Software Foundation.
16  *
17  *   Lustre is distributed in the hope that it will be useful,
18  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  *   GNU General Public License for more details.
21  *
22  *   You should have received a copy of the GNU General Public License
23  *   along with Lustre; if not, write to the Free Software
24  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #define DEBUG_SUBSYSTEM S_LLITE
28
29 #include <linux/fs.h>
30 #include <linux/version.h>
31 #include <asm/uaccess.h>
32 #include <linux/file.h>
33 #include <linux/kmod.h>
34
35 #include <linux/lustre_lite.h>
36 #include "llite_internal.h"
37
38 static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
39 static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
40 static struct ptlrpc_thread gns_thread;
41 static struct ll_gns_ctl gns_ctl;
42
43 /*
44  * waits until passed dentry gets mountpoint or timeout and attempts are
45  * exhausted. Returns 1 if dentry became mountpoint and 0 otherwise.
46  */
47 static int
48 ll_gns_wait_for_mount(struct dentry *dentry,
49                       int timeout, int tries)
50 {
51         struct l_wait_info lwi;
52         struct ll_sb_info *sbi;
53         int rc = 0;
54         ENTRY;
55
56         LASSERT(dentry != NULL);
57         LASSERT(!IS_ERR(dentry));
58         sbi = ll_s2sbi(dentry->d_sb);
59         
60         lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
61         for (; !d_mountpoint(dentry) && tries > 0; tries--) {
62                 l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
63                 if (signal_pending(current))
64                         GOTO(out, rc = -EINTR);
65         }
66
67         if (!d_mountpoint(dentry))
68                 rc = -ETIME;
69         
70         EXIT;
71 out:    
72         spin_lock(&sbi->ll_gns_lock);
73         sbi->ll_gns_state = LL_GNS_FINISHED;
74         spin_unlock(&sbi->ll_gns_lock);
75         return rc;
76 }
77
78 /*
79  * tries to mount the mount object under passed @dentry. In the case of success
80  * @dentry will become mount point and 0 will be returned. Error code will be
81  * returned otherwise.
82  */
83 int
84 ll_gns_mount_object(struct dentry *dentry, struct vfsmount *mnt)
85 {
86         char *path, *pathpage, *datapage = NULL, *argv[4];
87         struct file *mntinfo_fd = NULL;
88         int cleanup_phase = 0, rc = 0;
89         struct ll_sb_info *sbi;
90         struct dentry *dchild = NULL;
91         ENTRY;
92
93         LASSERT(dentry->d_inode != NULL);
94
95         if (!S_ISDIR(dentry->d_inode->i_mode))
96                 RETURN(-EINVAL);
97
98         sbi = ll_i2sbi(dentry->d_inode);
99         
100         if (mnt == NULL) {
101                 CERROR("suid directory found, but no "
102                        "vfsmount available.\n");
103                 RETURN(-EINVAL);
104         }
105
106         if (atomic_read(&sbi->ll_gns_enabled) == 0)
107                 RETURN(-EINVAL);
108
109         spin_lock(&sbi->ll_gns_lock);
110         
111         /* 
112          * another thead is in progress or just finished mounting the
113          * dentry. Handling that.
114          */
115         if (sbi->ll_gns_state != LL_GNS_IDLE) {
116                 /* 
117                  * another thread is trying to mount GNS dentry. We'd like to
118                  * handling that.
119                  */
120                 spin_unlock(&sbi->ll_gns_lock);
121
122         restart:
123                 /* 
124                  * check if dentry is mount point already, if so, do not restart
125                  * syscal.
126                  */
127                 if (d_mountpoint(dentry))
128                         RETURN(0);
129
130                 spin_lock(&sbi->ll_gns_lock);
131                 if (sbi->ll_gns_pending_dentry && 
132                     is_subdir(sbi->ll_gns_pending_dentry, dentry)) {
133                         spin_unlock(&sbi->ll_gns_lock);
134                         RETURN(-EAGAIN);
135                 }
136                 spin_unlock(&sbi->ll_gns_lock);
137
138                 /* 
139                  * waiting for GNS complete and check dentry again, it may be
140                  * mounted already.
141                  */
142                 wait_for_completion(&sbi->ll_gns_mount_finished);
143                 if (d_mountpoint(dentry))
144                         RETURN(0);
145
146                 /* 
147                  * check for he case when there are few waiters and all they are
148                  * awakened, but only one will find GNS state LL_GNS_IDLE, and
149                  * the rest will face with LL_GNS_MOUNTING.  --umka
150                  */
151                 spin_lock(&sbi->ll_gns_lock);
152                 if (sbi->ll_gns_state != LL_GNS_IDLE) {
153                         spin_unlock(&sbi->ll_gns_lock);
154                         goto restart;
155                 }
156                 spin_unlock(&sbi->ll_gns_lock);
157         }
158         LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
159         CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
160
161         /* mounting started */
162         sbi->ll_gns_state = LL_GNS_MOUNTING;
163         sbi->ll_gns_pending_dentry = dentry;
164         spin_unlock(&sbi->ll_gns_lock);
165
166         /* we need to build an absolute pathname to pass to mount */
167         pathpage = (char *)__get_free_page(GFP_KERNEL);
168         if (!pathpage)
169                 GOTO(cleanup, rc = -ENOMEM);
170         cleanup_phase = 1;
171
172         /* getting @dentry path stored in @pathpage. */
173         path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
174         if (IS_ERR(path)) {
175                 CERROR("can't build mount object path, err %d\n",
176                        (int)PTR_ERR(path));
177                 GOTO(cleanup, rc = PTR_ERR(path));
178         }
179
180         /* synchronizing with possible /proc/fs/...write */
181         down(&sbi->ll_gns_sem);
182         
183         /* 
184          * mount object name is taken from sbi, where it is set in mount time or
185          * via /proc/fs... tunable. It may be ".mntinfo" or so.
186          */
187
188         /* 
189          * recursive lookup with trying to mount SUID bit marked directories on
190          * the way is not possible here, as lookup_one_len() does not pass @nd
191          * to ->lookup() and this is checked in ll_lookup_it().
192          */
193         dchild = ll_lookup_one_len(sbi->ll_gns_oname, dentry,
194                                    strlen(sbi->ll_gns_oname));
195         up(&sbi->ll_gns_sem);
196
197         if (IS_ERR(dchild)) {
198                 rc = PTR_ERR(dchild);
199                 CERROR("can't find mount object %*s/%*s err = %d.\n",
200                        (int)dentry->d_name.len, dentry->d_name.name,
201                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
202                        rc);
203                 GOTO(cleanup, rc);
204         }
205
206         /* mount object is not found */
207         if (!dchild->d_inode) {
208                 dput(dchild);
209                 GOTO(cleanup, rc = -ENOENT);
210         }
211
212         /* check if found child is regular file */
213         if (!S_ISREG(dchild->d_inode->i_mode)) {
214                 dput(dchild);
215                 GOTO(cleanup, rc = -EBADF);
216         }
217
218         /* ok, mount object if found, opening it. */
219         mntinfo_fd = dentry_open(dchild, mntget(mnt), 0);
220         if (IS_ERR(mntinfo_fd)) {
221                 CERROR("can't open mount object %*s/%*s err = %d.\n",
222                        (int)dentry->d_name.len, dentry->d_name.name,
223                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
224                        (int)PTR_ERR(mntinfo_fd));
225                 mntput(mnt);
226                 dput(dchild);
227                 GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
228         }
229         cleanup_phase = 2;
230
231         /* make sure that inode size is up-to-date */
232         rc = ll_inode_revalidate_it(mntinfo_fd->f_dentry);
233         if (rc < 0) {
234                 CERROR("can't revalidate mount object %*s/%*s, err %d\n",
235                        (int)dentry->d_name.len, dentry->d_name.name,
236                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
237                        rc);
238                 GOTO(cleanup, rc);
239         }
240
241         if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE - 1) {
242                 CERROR("mount object %*s/%*s is too big (%Ld)\n",
243                        (int)dentry->d_name.len, dentry->d_name.name,
244                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
245                        mntinfo_fd->f_dentry->d_inode->i_size);
246                 GOTO(cleanup, rc = -EFBIG);
247         }
248
249         datapage = (char *)__get_free_page(GFP_KERNEL);
250         if (!datapage)
251                 GOTO(cleanup, rc = -ENOMEM);
252
253         cleanup_phase = 3;
254         
255         /* read data from mount object. */
256         rc = kernel_read(mntinfo_fd, 0, datapage, PAGE_SIZE - 1);
257         if (rc < 0) {
258                 CERROR("can't read mount object %*s/%*s data, err %d\n",
259                        (int)dentry->d_name.len, dentry->d_name.name,
260                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
261                        rc);
262                 GOTO(cleanup, rc);
263         }
264
265         /* no data in mount object? */
266         if (rc == 0) {
267                 CERROR("mount object %*s/%*s is empty?\n",
268                        (int)dentry->d_name.len, dentry->d_name.name,
269                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname);
270                 GOTO(cleanup, rc);
271         }
272
273         datapage[rc] = '\0';
274         fput(mntinfo_fd);
275         mntinfo_fd = NULL;
276         dchild = NULL;
277
278         /* synchronizing with possible /proc/fs/...write */
279         down(&sbi->ll_gns_sem);
280
281         /*
282          * upcall is initialized in mount time or via /proc/fs/... tuneable and
283          * may be /usr/lib/lustre/gns-upcall.sh
284          */
285         argv[0] = sbi->ll_gns_upcall;
286         argv[1] = datapage;
287         argv[2] = path;
288         argv[3] = NULL;
289         
290         up(&sbi->ll_gns_sem);
291
292         /* do not wait for helper complete here. */
293         rc = call_usermodehelper(argv[0], argv, NULL, 1);
294         if (rc) {
295                 CWARN("failed to call GNS upcall %s, err = %d, "
296                       "checking for mount anyway\n", sbi->ll_gns_upcall, rc);
297         }
298
299         /*
300          * waiting for dentry become mount point GNS_WAIT_ATTEMPTS times by 1
301          * second.
302          */
303         rc = ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS);
304         LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
305         
306         /* checking for mount point anyway to not loss mounts */
307         if (d_mountpoint(dentry)) {
308                 struct dentry *rdentry;
309                 struct vfsmount *rmnt;
310                
311                 rmnt = mntget(mnt);
312                 rdentry = dget(dentry);
313                 
314                 if (follow_down(&rmnt, &rdentry)) {
315                         /* 
316                          * registering new mount in GNS mounts list and thus
317                          * make it accessible from GNS control thread.
318                          */
319                         spin_lock(&dcache_lock);
320                         LASSERT(list_empty(&rmnt->mnt_lustre_list));
321                         list_add_tail(&rmnt->mnt_lustre_list,
322                                       &sbi->ll_mnt_list);
323                         spin_unlock(&dcache_lock);
324                         rmnt->mnt_last_used = jiffies;
325                         mntput(rmnt);
326                         dput(rdentry);
327                 } else {
328                         mntput(mnt);
329                         dput(dentry);
330                 }
331                 
332                 rc = 0;
333         } else {
334                 CERROR("usermode upcall %s failed to mount %s, err %d\n",
335                        sbi->ll_gns_upcall, path, rc);
336         }
337                 
338         EXIT;
339 cleanup:
340         switch (cleanup_phase) {
341         case 3:
342                 free_page((unsigned long)datapage);
343         case 2:
344                 if (mntinfo_fd != NULL) {
345                         fput(mntinfo_fd);
346                         dchild = NULL;
347                 }
348         case 1:
349                 free_page((unsigned long)pathpage);
350         case 0:
351                 spin_lock(&sbi->ll_gns_lock);
352                 sbi->ll_gns_state = LL_GNS_IDLE;
353                 sbi->ll_gns_pending_dentry = NULL;
354                 spin_unlock(&sbi->ll_gns_lock);
355
356                 /* waking up all waiters after GNS state is LL_GNS_IDLE */
357                 complete_all(&sbi->ll_gns_mount_finished);
358                 init_completion(&sbi->ll_gns_mount_finished);
359         }
360         return rc;
361 }
362
363 /* tries to umount passed @mnt. */
364 int ll_gns_umount_object(struct vfsmount *mnt)
365 {
366         int rc = 0;
367         ENTRY;
368         
369         CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
370         rc = do_umount(mnt, 0);
371         if (rc) {
372                 CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
373                        mnt, rc);
374         }
375         
376         RETURN(rc);
377 }
378
379 int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
380 {
381         struct list_head check_list = LIST_HEAD_INIT(check_list);
382         struct vfsmount *mnt;
383         unsigned long pass;
384         ENTRY;
385
386         spin_lock(&dcache_lock);
387         list_splice_init(&sbi->ll_mnt_list, &check_list);
388
389         /*
390          * walk the list in reverse order, and put them on the front of the sbi
391          * list each iteration; this avoids list-ordering problems if we race
392          * with another gns-mounting thread.
393          */
394         while (!list_empty(&check_list)) {
395                 mnt = list_entry(check_list.prev,
396                                  struct vfsmount,
397                                  mnt_lustre_list);
398
399                 mntget(mnt);
400
401                 list_del_init(&mnt->mnt_lustre_list);
402
403                 list_add(&mnt->mnt_lustre_list,
404                          &sbi->ll_mnt_list);
405
406                 /* check for timeout if needed */
407                 pass = jiffies - mnt->mnt_last_used;
408                 
409                 if (flags == LL_GNS_CHECK &&
410                     pass < sbi->ll_gns_timeout * HZ)
411                 {
412                         mntput(mnt);
413                         continue;
414                 }
415                 spin_unlock(&dcache_lock);
416
417                 /* umounting @mnt */
418                 ll_gns_umount_object(mnt);
419
420                 mntput(mnt);
421                 spin_lock(&dcache_lock);
422         }
423         spin_unlock(&dcache_lock);
424         RETURN(0);
425 }
426
427 /*
428  * GNS timer callback function. It restarts gns timer and wakes up GNS control
429  * thread to process mounts list.
430  */
431 void ll_gns_timer_callback(unsigned long data)
432 {
433         struct ll_sb_info *sbi = (void *)data;
434         ENTRY;
435
436         spin_lock(&gns_lock);
437         if (list_empty(&sbi->ll_gns_sbi_head))
438                 list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
439         spin_unlock(&gns_lock);
440         
441         wake_up(&gns_thread.t_ctl_waitq);
442         mod_timer(&sbi->ll_gns_timer,
443                   jiffies + sbi->ll_gns_tick * HZ);
444 }
445
446 /* this function checks if something new happened to exist in gns list. */
447 static int inline ll_gns_check_event(void)
448 {
449         int rc;
450         
451         spin_lock(&gns_lock);
452         rc = !list_empty(&gns_sbi_list);
453         spin_unlock(&gns_lock);
454
455         return rc;
456 }
457
458 /* should we stop GNS control thread? */
459 static int inline ll_gns_check_stop(void)
460 {
461         mb();
462         return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
463 }
464
465 /* GNS control thread function. */
466 static int ll_gns_thread(void *arg)
467 {
468         struct ll_gns_ctl *ctl = arg;
469         unsigned long flags;
470         ENTRY;
471
472         {
473                 char name[sizeof(current->comm)];
474                 snprintf(name, sizeof(name) - 1, "ll_gns");
475                 kportal_daemonize(name);
476         }
477         
478         SIGNAL_MASK_LOCK(current, flags);
479         sigfillset(&current->blocked);
480         RECALC_SIGPENDING;
481         SIGNAL_MASK_UNLOCK(current, flags);
482
483         /*
484          * letting starting function know, that we are ready and control may be
485          * returned.
486          */
487         gns_thread.t_flags = SVC_RUNNING;
488         complete(&ctl->gc_starting);
489
490         while (!ll_gns_check_stop()) {
491                 struct l_wait_info lwi = { 0 };
492
493                 l_wait_event(gns_thread.t_ctl_waitq,
494                              (ll_gns_check_event() ||
495                               ll_gns_check_stop()), &lwi);
496                 
497                 spin_lock(&gns_lock);
498                 while (!list_empty(&gns_sbi_list)) {
499                         struct ll_sb_info *sbi;
500
501                         sbi = list_entry(gns_sbi_list.prev,
502                                          struct ll_sb_info,
503                                          ll_gns_sbi_head);
504                         
505                         list_del_init(&sbi->ll_gns_sbi_head);
506                         spin_unlock(&gns_lock);
507                         ll_gns_check_mounts(sbi, LL_GNS_CHECK);
508                         spin_lock(&gns_lock);
509                 }
510                 spin_unlock(&gns_lock);
511         }
512
513         EXIT;
514         gns_thread.t_flags = SVC_STOPPED;
515
516         /* this is SMP-safe way to finish thread. */
517         complete_and_exit(&ctl->gc_finishing, 0);
518 }
519
520 void ll_gns_add_timer(struct ll_sb_info *sbi)
521 {
522         mod_timer(&sbi->ll_gns_timer,
523                   jiffies + sbi->ll_gns_tick * HZ);
524 }
525
526 void ll_gns_del_timer(struct ll_sb_info *sbi)
527 {
528         del_timer(&sbi->ll_gns_timer);
529 }
530
531 /*
532  * starts GNS control thread and waits for a signal it is up and work may be
533  * continued.
534  */
535 int ll_gns_thread_start(void)
536 {
537         int rc;
538         ENTRY;
539
540         LASSERT(gns_thread.t_flags == 0);
541         init_completion(&gns_ctl.gc_starting);
542         init_completion(&gns_ctl.gc_finishing);
543         init_waitqueue_head(&gns_thread.t_ctl_waitq);
544         
545         rc = kernel_thread(ll_gns_thread, &gns_ctl,
546                            (CLONE_VM | CLONE_FILES));
547         if (rc < 0) {
548                 CERROR("cannot start GNS control thread, "
549                        "err = %d\n", rc);
550                 RETURN(rc);
551         }
552         wait_for_completion(&gns_ctl.gc_starting);
553         LASSERT(gns_thread.t_flags == SVC_RUNNING);
554         RETURN(0);
555 }
556
557 /* stops GNS control thread and waits its actual stop. */
558 void ll_gns_thread_stop(void)
559 {
560         ENTRY;
561         gns_thread.t_flags = SVC_STOPPING;
562         wake_up(&gns_thread.t_ctl_waitq);
563         wait_for_completion(&gns_ctl.gc_finishing);
564         LASSERT(gns_thread.t_flags == SVC_STOPPED);
565         gns_thread.t_flags = 0;
566         EXIT;
567 }