Whamcloud - gitweb
recovery-small 21 LBUG: don't release intent for open etc. (maybe the
[fs/lustre-release.git] / lustre / llite / llite_gns.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004, 2005 Cluster File Systems, Inc.
5  *
6  * Author: Phil Schwan <phil@clusterfs.com>
7  * Author: Oleg Drokin <green@clusterfs.com>
8  * Author: Yury Umanets <yury@clusterfs.com>
9  * Review: Nikita Danilov <nikita@clusterfs.com>
10  *
11  *   This file is part of Lustre, http://www.lustre.org.
12  *
13  *   Lustre is free software; you can redistribute it and/or
14  *   modify it under the terms of version 2 of the GNU General Public
15  *   License as published by the Free Software Foundation.
16  *
17  *   Lustre is distributed in the hope that it will be useful,
18  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  *   GNU General Public License for more details.
21  *
22  *   You should have received a copy of the GNU General Public License
23  *   along with Lustre; if not, write to the Free Software
24  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #define DEBUG_SUBSYSTEM S_LLITE
28
29 #include <linux/fs.h>
30 #include <linux/version.h>
31 #include <asm/uaccess.h>
32 #include <linux/file.h>
33 #include <linux/kmod.h>
34
35 #include <linux/lustre_lite.h>
36 #include "llite_internal.h"
37
38 static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
39 static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
40 static struct ptlrpc_thread gns_thread;
41 static struct ll_gns_ctl gns_ctl;
42
43 #define CONCUR_GNS_RESTART_APPROACH 0
44
45 /*
46  * waits until passed dentry gets mountpoint or timeout and attempts are
47  * exhausted. Returns 1 if dentry became mountpoint and 0 otherwise.
48  */
49 static int
50 ll_gns_wait_for_mount(struct dentry *dentry,
51                       int timeout, int tries)
52 {
53         struct l_wait_info lwi;
54         struct ll_sb_info *sbi;
55         ENTRY;
56
57         LASSERT(dentry != NULL);
58         LASSERT(!IS_ERR(dentry));
59         sbi = ll_s2sbi(dentry->d_sb);
60         
61         lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
62         for (; !d_mountpoint(dentry) && tries > 0; tries--)
63                 l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
64
65         if (d_mountpoint(dentry)) {
66                 spin_lock(&sbi->ll_gns_lock);
67                 sbi->ll_gns_state = LL_GNS_FINISHED;
68                 spin_unlock(&sbi->ll_gns_lock);
69                 RETURN(0);
70         }
71         RETURN(-ETIME);
72 }
73
#if (CONCUR_GNS_RESTART_APPROACH == 1)
/*
 * Marks SIGCONT as pending for the current task and raises its
 * TIF_SIGPENDING flag. Used together with returning -ERESTARTSYS from the GNS
 * mount function: the signal is expected to be ignored, so its only purpose
 * is to get the syscall restarted.
 */
static void
ll_gns_send_signal(void)
{
        struct task_struct *self = current;

        read_lock(&tasklist_lock);
        spin_lock_irq(&self->sighand->siglock);
        sigaddset(&self->pending.signal, SIGCONT);
        spin_unlock_irq(&self->sighand->siglock);
        read_unlock(&tasklist_lock);

        set_tsk_thread_flag(self, TIF_SIGPENDING);
}
#endif
93
94 /*
95  * tries to mount the mount object under passed @dentry. In the case of success
96  * @dentry will become mount point and 0 will be returned. Error code will be
97  * returned otherwise.
98  */
99 int
100 ll_gns_mount_object(struct dentry *dentry, struct vfsmount *mnt)
101 {
102         char *path, *pathpage, *datapage, *argv[4];
103         struct file *mntinfo_fd = NULL;
104         int cleanup_phase = 0, rc = 0;
105         struct ll_sb_info *sbi;
106         struct dentry *dchild;
107         ENTRY;
108
109         LASSERT(dentry->d_inode != NULL);
110
111         if (!S_ISDIR(dentry->d_inode->i_mode))
112                 RETURN(-EINVAL);
113
114         sbi = ll_i2sbi(dentry->d_inode);
115         
116         if (mnt == NULL) {
117                 CERROR("suid directory found, but no "
118                        "vfsmount available.\n");
119                 RETURN(-EINVAL);
120         }
121
122         if (atomic_read(&sbi->ll_gns_enabled) == 0)
123                 RETURN(-EINVAL);
124
125         spin_lock(&sbi->ll_gns_lock);
126
127         /* 
128          * another thead is in progress or just finished mounting the
129          * dentry. Handling that.
130          */
131         if (sbi->ll_gns_state == LL_GNS_MOUNTING ||
132             sbi->ll_gns_state == LL_GNS_FINISHED) {
133                 /* 
134                  * another thread is trying to mount GNS dentry. We'd like to
135                  * handling that.
136                  */
137                 spin_unlock(&sbi->ll_gns_lock);
138
139                 /* 
140                  * check if dentry is mount point already, if so, do not restart
141                  * syscal.
142                  */
143                 if (d_mountpoint(dentry))
144                         RETURN(0);
145
146 #if (CONCUR_GNS_RESTART_APPROACH == 1)
147                 /* 
148                  * causing syscall to restart and possibly find this dentry
149                  * already mounted.
150                  */
151                 ll_gns_send_signal();
152                 RETURN(-ERESTARTSYS);
153 #else
154                 /* 
155                  * waiting for GNS complete and check dentry again, it may be
156                  * mounted already.
157                  */
158                 wait_for_completion(&sbi->ll_gns_mount_finished);
159                 if (d_mountpoint(dentry))
160                         RETURN(0);
161 #endif
162         }
163         LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
164
165         CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
166
167         /* mounting started */
168         sbi->ll_gns_state = LL_GNS_MOUNTING;
169         spin_unlock(&sbi->ll_gns_lock);
170
171         /* we need to build an absolute pathname to pass to mount */
172         pathpage = (char *)__get_free_page(GFP_KERNEL);
173         if (!pathpage)
174                 GOTO(cleanup, rc = -ENOMEM);
175         cleanup_phase = 1;
176
177         /* getting @dentry path stored in @pathpage. */
178         path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
179         if (IS_ERR(path)) {
180                 CERROR("can't build mount object path, err %d\n",
181                        (int)PTR_ERR(dchild));
182                 GOTO(cleanup, rc = PTR_ERR(dchild));
183         }
184
185         /* synchronizing with possible /proc/fs/...write */
186         down(&sbi->ll_gns_sem);
187         
188         /* 
189          * mount object name is taken from sbi, where it is set in mount time or
190          * via /proc/fs... tunable. It may be ".mntinfo" or so.
191          */
192
193         /* 
194          * recursive lookup with trying to mount SUID bit marked directories on
195          * the way is not possible here, as lookup_one_len() does not pass @nd
196          * to ->lookup() and this is checked in ll_lookup_it(). So, do not
197          * handle possible -EAGAIN here.
198          */
199         dchild = ll_lookup_one_len(sbi->ll_gns_oname, dentry,
200                                    strlen(sbi->ll_gns_oname));
201         up(&sbi->ll_gns_sem);
202
203         cleanup_phase = 2;
204         
205         if (IS_ERR(dchild)) {
206                 rc = PTR_ERR(dchild);
207                 CERROR("can't find mount object %*s/%*s err = %d.\n",
208                        (int)dentry->d_name.len, dentry->d_name.name,
209                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
210                        rc);
211                 GOTO(cleanup, rc);
212         }
213
214         /* mount object is not found */
215         if (!dchild->d_inode)
216                 GOTO(cleanup, rc = -ENOENT);
217
218         /* check if found child is regular file */
219         if (!S_ISREG(dchild->d_inode->i_mode))
220                 GOTO(cleanup, rc = -EBADF);
221
222         mntget(mnt);
223
224         /* ok, mount object if found, opening it. */
225         mntinfo_fd = dentry_open(dchild, mnt, 0);
226         if (IS_ERR(mntinfo_fd)) {
227                 CERROR("can't open mount object %*s/%*s err = %d.\n",
228                        (int)dentry->d_name.len, dentry->d_name.name,
229                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
230                        (int)PTR_ERR(mntinfo_fd));
231                 mntput(mnt);
232                 GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
233         }
234         cleanup_phase = 3;
235
236         if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE - 1) {
237                 CERROR("mount object %*s/%*s is too big (%Ld)\n",
238                        (int)dentry->d_name.len, dentry->d_name.name,
239                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
240                        mntinfo_fd->f_dentry->d_inode->i_size);
241                 GOTO(cleanup, rc = -EFBIG);
242         }
243
244         datapage = (char *)__get_free_page(GFP_KERNEL);
245         if (!datapage)
246                 GOTO(cleanup, rc = -ENOMEM);
247
248         cleanup_phase = 4;
249         
250         /* read data from mount object. */
251         rc = kernel_read(mntinfo_fd, 0, datapage, PAGE_SIZE - 1);
252         if (rc < 0) {
253                 CERROR("can't read mount object %*s/%*s data, err %d\n",
254                        (int)dentry->d_name.len, dentry->d_name.name,
255                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname,
256                        rc);
257                 GOTO(cleanup, rc);
258         }
259
260         /* no data in mount object? */
261         if (rc == 0) {
262                 CERROR("mount object %*s/%*s is empty?\n",
263                        (int)dentry->d_name.len, dentry->d_name.name,
264                        strlen(sbi->ll_gns_oname), sbi->ll_gns_oname);
265                 GOTO(cleanup, rc);
266         }
267
268         datapage[rc] = '\0';
269         fput(mntinfo_fd);
270         mntinfo_fd = NULL;
271         dchild = NULL;
272
273         /* synchronizing with possible /proc/fs/...write */
274         down(&sbi->ll_gns_sem);
275
276         /*
277          * upcall is initialized in mount time or via /proc/fs/... tuneable and
278          * may be /usr/lib/lustre/gns-upcall.sh
279          */
280         argv[0] = sbi->ll_gns_upcall;
281         argv[1] = datapage;
282         argv[2] = path;
283         argv[3] = NULL;
284         
285         up(&sbi->ll_gns_sem);
286
287         /* do not wait for helper complete here. */
288         rc = call_usermodehelper(argv[0], argv, NULL, 1);
289         if (rc) {
290                 CWARN("failed to call GNS upcall %s, err = %d, "
291                       "checking for mount anyway\n", sbi->ll_gns_upcall, rc);
292         }
293
294         /*
295          * waiting for dentry become mount point GNS_WAIT_ATTEMPTS times by 1
296          * second.
297          */
298         rc = ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS);
299         if (rc == 0) {
300                 struct dentry *rdentry;
301                 struct vfsmount *rmnt;
302                
303                 LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
304
305                 rmnt = mntget(mnt);
306                 rdentry = dget(dentry);
307                 
308                 if (follow_down(&rmnt, &rdentry)) {
309                         /* 
310                          * registering new mount in GNS mounts list and thus
311                          * make it accessible from GNS control thread.
312                          */
313                         spin_lock(&dcache_lock);
314                         LASSERT(list_empty(&rmnt->mnt_lustre_list));
315                         list_add_tail(&rmnt->mnt_lustre_list,
316                                       &sbi->ll_mnt_list);
317                         spin_unlock(&dcache_lock);
318                         rmnt->mnt_last_used = jiffies;
319                         mntput(rmnt);
320                         dput(rdentry);
321                 } else {
322                         mntput(mnt);
323                         dput(dentry);
324                 }
325         } else {
326                 CERROR("usermode upcall %s failed to mount %s, err %d\n",
327                        sbi->ll_gns_upcall, path, rc);
328         }
329                 
330         EXIT;
331 cleanup:
332         switch (cleanup_phase) {
333         case 4:
334                 free_page((unsigned long)datapage);
335         case 3:
336                 if (mntinfo_fd != NULL)
337                         fput(mntinfo_fd);
338         case 2:
339                 if (dchild != NULL)
340                         dput(dchild);
341         case 1:
342                 free_page((unsigned long)pathpage);
343         case 0:
344                 spin_lock(&sbi->ll_gns_lock);
345                 sbi->ll_gns_state = LL_GNS_IDLE;
346                 spin_unlock(&sbi->ll_gns_lock);
347                 complete_all(&sbi->ll_gns_mount_finished);
348         }
349         return rc;
350 }
351
352 /* tries to umount passed @mnt. */
353 int ll_gns_umount_object(struct vfsmount *mnt)
354 {
355         int rc = 0;
356         ENTRY;
357         
358         CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
359         rc = do_umount(mnt, 0);
360         if (rc) {
361                 CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
362                        mnt, rc);
363         }
364         
365         RETURN(rc);
366 }
367
368 int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
369 {
370         struct list_head check_list = LIST_HEAD_INIT(check_list);
371         struct vfsmount *mnt;
372         unsigned long pass;
373         ENTRY;
374
375         spin_lock(&dcache_lock);
376         list_splice_init(&sbi->ll_mnt_list, &check_list);
377
378         /*
379          * walk the list in reverse order, and put them on the front of the sbi
380          * list each iteration; this avoids list-ordering problems if we race
381          * with another gns-mounting thread.
382          */
383         while (!list_empty(&check_list)) {
384                 mnt = list_entry(check_list.prev,
385                                  struct vfsmount,
386                                  mnt_lustre_list);
387
388                 mntget(mnt);
389
390                 list_del_init(&mnt->mnt_lustre_list);
391
392                 list_add(&mnt->mnt_lustre_list,
393                          &sbi->ll_mnt_list);
394
395                 /* check for timeout if needed */
396                 pass = jiffies - mnt->mnt_last_used;
397                 
398                 if (flags == LL_GNS_CHECK &&
399                     pass < sbi->ll_gns_timeout * HZ)
400                 {
401                         mntput(mnt);
402                         continue;
403                 }
404                 spin_unlock(&dcache_lock);
405
406                 /* umounting @mnt */
407                 ll_gns_umount_object(mnt);
408
409                 mntput(mnt);
410                 spin_lock(&dcache_lock);
411         }
412         spin_unlock(&dcache_lock);
413         RETURN(0);
414 }
415
416 /*
417  * GNS timer callback function. It restarts gns timer and wakes up GNS control
418  * thread to process mounts list.
419  */
420 void ll_gns_timer_callback(unsigned long data)
421 {
422         struct ll_sb_info *sbi = (void *)data;
423         ENTRY;
424
425         spin_lock(&gns_lock);
426         if (list_empty(&sbi->ll_gns_sbi_head))
427                 list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
428         spin_unlock(&gns_lock);
429         
430         wake_up(&gns_thread.t_ctl_waitq);
431         mod_timer(&sbi->ll_gns_timer,
432                   jiffies + sbi->ll_gns_tick * HZ);
433 }
434
435 /* this function checks if something new happened to exist in gns list. */
436 static int inline ll_gns_check_event(void)
437 {
438         int rc;
439         
440         spin_lock(&gns_lock);
441         rc = !list_empty(&gns_sbi_list);
442         spin_unlock(&gns_lock);
443
444         return rc;
445 }
446
447 /* should we stop GNS control thread? */
448 static int inline ll_gns_check_stop(void)
449 {
450         mb();
451         return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
452 }
453
454 /* GNS control thread function. */
455 static int ll_gns_thread_main(void *arg)
456 {
457         struct ll_gns_ctl *ctl = arg;
458         unsigned long flags;
459         ENTRY;
460
461         {
462                 char name[sizeof(current->comm)];
463                 snprintf(name, sizeof(name) - 1, "ll_gns");
464                 kportal_daemonize(name);
465         }
466         
467         SIGNAL_MASK_LOCK(current, flags);
468         sigfillset(&current->blocked);
469         RECALC_SIGPENDING;
470         SIGNAL_MASK_UNLOCK(current, flags);
471
472         /*
473          * letting starting function know, that we are ready and control may be
474          * returned.
475          */
476         gns_thread.t_flags = SVC_RUNNING;
477         complete(&ctl->gc_starting);
478
479         while (!ll_gns_check_stop()) {
480                 struct l_wait_info lwi = { 0 };
481
482                 l_wait_event(gns_thread.t_ctl_waitq,
483                              (ll_gns_check_event() ||
484                               ll_gns_check_stop()), &lwi);
485                 
486                 spin_lock(&gns_lock);
487                 while (!list_empty(&gns_sbi_list)) {
488                         struct ll_sb_info *sbi;
489
490                         sbi = list_entry(gns_sbi_list.prev,
491                                          struct ll_sb_info,
492                                          ll_gns_sbi_head);
493                         
494                         list_del_init(&sbi->ll_gns_sbi_head);
495                         spin_unlock(&gns_lock);
496                         ll_gns_check_mounts(sbi, LL_GNS_CHECK);
497                         spin_lock(&gns_lock);
498                 }
499                 spin_unlock(&gns_lock);
500         }
501
502         EXIT;
503         gns_thread.t_flags = SVC_STOPPED;
504
505         /* this is SMP-safe way to finish thread. */
506         complete_and_exit(&ctl->gc_finishing, 0);
507 }
508
509 void ll_gns_add_timer(struct ll_sb_info *sbi)
510 {
511         mod_timer(&sbi->ll_gns_timer,
512                   jiffies + sbi->ll_gns_tick * HZ);
513 }
514
515 void ll_gns_del_timer(struct ll_sb_info *sbi)
516 {
517         del_timer(&sbi->ll_gns_timer);
518 }
519
520 /*
521  * starts GNS control thread and waits for a signal it is up and work may be
522  * continued.
523  */
524 int ll_gns_start_thread(void)
525 {
526         int rc;
527         ENTRY;
528
529         LASSERT(gns_thread.t_flags == 0);
530         init_completion(&gns_ctl.gc_starting);
531         init_completion(&gns_ctl.gc_finishing);
532         init_waitqueue_head(&gns_thread.t_ctl_waitq);
533         
534         rc = kernel_thread(ll_gns_thread_main, &gns_ctl,
535                            (CLONE_VM | CLONE_FILES));
536         if (rc < 0) {
537                 CERROR("cannot start GNS control thread, "
538                        "err = %d\n", rc);
539                 RETURN(rc);
540         }
541         wait_for_completion(&gns_ctl.gc_starting);
542         LASSERT(gns_thread.t_flags == SVC_RUNNING);
543         RETURN(0);
544 }
545
546 /* stops GNS control thread and waits its actual stop. */
547 void ll_gns_stop_thread(void)
548 {
549         ENTRY;
550         gns_thread.t_flags = SVC_STOPPING;
551         wake_up(&gns_thread.t_ctl_waitq);
552         wait_for_completion(&gns_ctl.gc_finishing);
553         LASSERT(gns_thread.t_flags == SVC_STOPPED);
554         gns_thread.t_flags = 0;
555         EXIT;
556 }