Whamcloud - gitweb
- many fixes in GNS code after Nikita's code review. They are the following:
[fs/lustre-release.git] / lustre / llite / llite_gns.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Copyright (C) 2004, 2005 Cluster File Systems, Inc.
5  *
6  * Author: Phil Schwan <phil@clusterfs.com>
7  * Author: Oleg Drokin <green@clusterfs.com>
8  * Author: Yury Umanets <yury@clusterfs.com>
9  * Review: Nikita Danilov <nikita@clusterfs.com>
10  *
11  *   This file is part of Lustre, http://www.lustre.org.
12  *
13  *   Lustre is free software; you can redistribute it and/or
14  *   modify it under the terms of version 2 of the GNU General Public
15  *   License as published by the Free Software Foundation.
16  *
17  *   Lustre is distributed in the hope that it will be useful,
18  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  *   GNU General Public License for more details.
21  *
22  *   You should have received a copy of the GNU General Public License
23  *   along with Lustre; if not, write to the Free Software
24  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #define DEBUG_SUBSYSTEM S_LLITE
28
29 #include <linux/fs.h>
30 #include <linux/version.h>
31 #include <asm/uaccess.h>
32 #include <linux/file.h>
33 #include <linux/kmod.h>
34
35 #include <linux/lustre_lite.h>
36 #include "llite_internal.h"
37
38 static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
39 static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
40 static struct ptlrpc_thread gns_thread;
41 static struct ll_gns_ctl gns_ctl;
42
43 /*
44  * waits until passed dentry gets mountpoint or timeout and attempts are
45  * exhausted. Returns 1 if dentry became mountpoint and 0 otherwise.
46  */
47 static int
48 ll_gns_wait_for_mount(struct dentry *dentry,
49                       int timeout, int tries)
50 {
51         struct l_wait_info lwi;
52         struct ll_sb_info *sbi;
53         int rc;
54         ENTRY;
55
56         LASSERT(dentry != NULL);
57         LASSERT(!IS_ERR(dentry));
58         sbi = ll_s2sbi(dentry->d_sb);
59         
60         for (; !d_mountpoint(dentry) && tries > 0; tries--) {
61                 lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
62                 l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
63         }
64
65         if ((rc = d_mountpoint(dentry) ? 1 : 0)) {
66                 spin_lock(&sbi->ll_gns_lock);
67                 LASSERT(sbi->ll_gns_state == LL_GNS_MOUNTING);
68                 sbi->ll_gns_state = LL_GNS_FINISHED;
69                 spin_unlock(&sbi->ll_gns_lock);
70         }
71
72         complete_all(&sbi->ll_gns_mount_finished);
73         RETURN(rc);
74 }
75
76 /*
77  * tries to mount the mount object under passed @dentry. In the case of success
78  * @dentry will become mount point and 0 will be returned. Error code will be
79  * returned otherwise.
80  */
81 int
82 ll_gns_mount_object(struct dentry *dentry, struct vfsmount *mnt)
83 {
84         struct ll_dentry_data *lld = dentry->d_fsdata;
85         char *path, *pathpage, *datapage, *argv[4];
86         struct file *mntinfo_fd = NULL;
87         int cleanup_phase = 0, rc = 0;
88         struct ll_sb_info *sbi;
89         struct dentry *dchild;
90         ENTRY;
91
92         if (mnt == NULL) {
93                 CERROR("suid directory found, but no "
94                        "vfsmount available.\n");
95                 RETURN(-EINVAL);
96         }
97
98         CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
99
100         LASSERT(dentry->d_inode != NULL);
101         LASSERT(S_ISDIR(dentry->d_inode->i_mode));
102         LASSERT(lld != NULL);
103         
104         sbi = ll_i2sbi(dentry->d_inode);
105         LASSERT(sbi != NULL);
106
107         /* 
108          * another thead is in progress or just finished mounting the
109          * dentry. Handling that.
110          */
111         spin_lock(&sbi->ll_gns_lock);
112         if (sbi->ll_gns_state == LL_GNS_MOUNTING ||
113             sbi->ll_gns_state == LL_GNS_FINISHED)
114         {
115                 spin_unlock(&sbi->ll_gns_lock);
116                 CDEBUG(D_INODE"GNS is in progress now, throwing "
117                        "-ERESTARTSYS to restart syscall and let "
118                        "it finish.\n");
119                 RETURN(-ERESTARTSYS);
120         }
121         LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
122
123         spin_lock(&dentry->d_lock);
124         dentry->d_flags |= DCACHE_GNS_MOUNTING;
125         spin_unlock(&dentry->d_lock);
126         
127         /* mounting started */
128         sbi->ll_gns_state = LL_GNS_MOUNTING;
129         spin_unlock(&sbi->ll_gns_lock);
130
131         /* we need to build an absolute pathname to pass to mount */
132         pathpage = (char *)__get_free_page(GFP_KERNEL);
133         if (!pathpage)
134                 GOTO(cleanup, rc = -ENOMEM);
135         cleanup_phase = 1;
136
137         /* getting @dentry path stored in @pathpage. */
138         path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
139         if (IS_ERR(path)) {
140                 CERROR("can't build mount object path, err %d\n",
141                        (int)PTR_ERR(dchild));
142                 GOTO(cleanup, rc = PTR_ERR(dchild));
143         }
144
145         /* synchronizing with possible /proc/fs/...write */
146         down(&sbi->ll_gns_sem);
147         
148         /* 
149          * mount object name is taken from sbi, where it is set in mount time or
150          * via /proc/fs... tunable. It may be ".mntinfo" or so.
151          */
152         dchild = lookup_one_len(sbi->ll_gns_oname, dentry,
153                                 strlen(sbi->ll_gns_oname));
154         up(&sbi->ll_gns_sem);
155
156         if (!dchild)
157                 GOTO(cleanup, rc = -ENOENT);
158         
159         if (IS_ERR(dchild)) {
160                 CERROR("can't find mount object %*s/%*s err = %d.\n",
161                        (int)dentry->d_name.len, dentry->d_name.name,
162                        (int)dchild->d_name.len, dchild->d_name.name,
163                        (int)PTR_ERR(dchild));
164                 GOTO(cleanup, rc = PTR_ERR(dchild));
165         }
166
167         /* mount object is not found */
168         if (!dchild->d_inode)
169                 GOTO(cleanup, rc = -ENOENT);
170
171         mntget(mnt);
172
173         /* ok, mount object if found, opening it. */
174         mntinfo_fd = dentry_open(dchild, mnt, 0);
175         if (IS_ERR(mntinfo_fd)) {
176                 CERROR("can't open mount object %*s/%*s err = %d.\n",
177                        (int)dentry->d_name.len, dentry->d_name.name,
178                        (int)dchild->d_name.len, dchild->d_name.name,
179                        (int)PTR_ERR(mntinfo_fd));
180                 dput(dchild);
181                 mntput(mnt);
182                 GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
183         }
184         cleanup_phase = 2;
185
186         if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE) {
187                 CERROR("mount object %*s/%*s is too big (%Ld)\n",
188                        (int)dentry->d_name.len, dentry->d_name.name,
189                        (int)dchild->d_name.len, dchild->d_name.name,
190                        mntinfo_fd->f_dentry->d_inode->i_size);
191                 GOTO(cleanup, rc = -EFBIG);
192         }
193
194         datapage = (char *)__get_free_page(GFP_KERNEL);
195         if (!datapage)
196                 GOTO(cleanup, rc = -ENOMEM);
197
198         cleanup_phase = 3;
199         
200         /* read data from mount object. */
201         rc = kernel_read(mntinfo_fd, 0, datapage, PAGE_SIZE);
202         if (rc < 0) {
203                 CERROR("can't read mount object %*s/%*s data, err %d\n",
204                        (int)dentry->d_name.len, dentry->d_name.name,
205                        (int)dchild->d_name.len, dchild->d_name.name,
206                        rc);
207                 GOTO(cleanup, rc);
208         }
209
210         datapage[PAGE_SIZE - 1] = '\0';
211
212         fput(mntinfo_fd);
213         mntinfo_fd = NULL;
214
215         /* synchronizing with possible /proc/fs/...write */
216         down(&sbi->ll_gns_sem);
217
218         /*
219          * upcall is initialized in mount time or via /proc/fs/... tuneable and
220          * may be /usr/lib/lustre/gns-upcall.sh
221          */
222         argv[0] = sbi->ll_gns_upcall;
223         argv[1] = datapage;
224         argv[2] = path;
225         argv[3] = NULL;
226         
227         up(&sbi->ll_gns_sem);
228
229         rc = USERMODEHELPER(argv[0], argv, NULL);
230         if (rc) {
231                 CERROR("failed to call GNS upcall %s, err = %d\n",
232                        sbi->ll_gns_upcall, rc);
233                 GOTO(cleanup, rc);
234         }
235
236         /*
237          * wait for mount completion. This is actually not need, because
238          * USERMODEHELPER() returns only when usermode process finishes. But we
239          * doing this just for case USERMODEHELPER() semantics will be changed
240          * or usermode upcall program will start mounting in backgound and
241          * return instantly. --umka
242          */
243         if (ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS)) {
244                 struct dentry *rdentry;
245                 struct vfsmount *rmnt;
246                 
247                 /* mount is successful */
248                 LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
249
250                 rmnt = mntget(mnt);
251                 rdentry = dget(dentry);
252                 
253                 if (follow_down(&rmnt, &rdentry)) {
254                         /* 
255                          * registering new mount in GNS mounts list and thus
256                          * make it accessible from GNS control thread.
257                          */
258                         spin_lock(&dcache_lock);
259                         LASSERT(list_empty(&rmnt->mnt_lustre_list));
260                         list_add_tail(&rmnt->mnt_lustre_list,
261                                       &sbi->ll_mnt_list);
262                         spin_unlock(&dcache_lock);
263                         rmnt->mnt_last_used = jiffies;
264                         mntput(rmnt);
265                         dput(rdentry);
266                 } else {
267                         mntput(mnt);
268                         dput(dentry);
269                 }
270                 spin_lock(&dentry->d_lock);
271                 dentry->d_flags &= ~DCACHE_GNS_PENDING;
272                 spin_unlock(&dentry->d_lock);
273         } else {
274                 CERROR("usermode upcall %s failed to mount %s\n",
275                        sbi->ll_gns_upcall, path);
276                 rc = -ETIME;
277         }
278
279         EXIT;
280 cleanup:
281         switch (cleanup_phase) {
282         case 3:
283                 free_page((unsigned long)datapage);
284         case 2:
285                 if (mntinfo_fd != NULL)
286                         fput(mntinfo_fd);
287         case 1:
288                 free_page((unsigned long)pathpage);
289         case 0:
290                 /* 
291                  * waking up all waiters after gns state is set to
292                  * LL_GNS_MOUNTING
293                  */
294                 if (cleanup_phase > 0)
295                         complete_all(&sbi->ll_gns_mount_finished);
296                 
297                 spin_lock(&sbi->ll_gns_lock);
298                 sbi->ll_gns_state = LL_GNS_IDLE;
299                 spin_unlock(&sbi->ll_gns_lock);
300
301                 spin_lock(&dentry->d_lock);
302                 dentry->d_flags &= ~DCACHE_GNS_MOUNTING;
303                 spin_unlock(&dentry->d_lock);
304         }
305         return rc;
306 }
307
308 /* tries to umount passed @mnt. */
309 int ll_gns_umount_object(struct vfsmount *mnt)
310 {
311         int rc = 0;
312         ENTRY;
313         
314         CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
315         rc = do_umount(mnt, 0);
316         if (rc) {
317                 CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
318                        mnt, rc);
319         }
320         
321         RETURN(rc);
322 }
323
324 int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
325 {
326         struct list_head check_list = LIST_HEAD_INIT(check_list);
327         struct vfsmount *mnt;
328         unsigned long pass;
329         ENTRY;
330
331         spin_lock(&dcache_lock);
332         list_splice_init(&sbi->ll_mnt_list, &check_list);
333
334         /*
335          * walk the list in reverse order, and put them on the front of the sbi
336          * list each iteration; this avoids list-ordering problems if we race
337          * with another gns-mounting thread.
338          */
339         while (!list_empty(&check_list)) {
340                 mnt = list_entry(check_list.prev,
341                                  struct vfsmount,
342                                  mnt_lustre_list);
343
344                 mntget(mnt);
345
346                 list_del_init(&mnt->mnt_lustre_list);
347
348                 list_add(&mnt->mnt_lustre_list,
349                          &sbi->ll_mnt_list);
350
351                 /* check for timeout if needed */
352                 pass = jiffies - mnt->mnt_last_used;
353                 
354                 if (flags == LL_GNS_CHECK &&
355                     pass < sbi->ll_gns_timeout * HZ)
356                 {
357                         mntput(mnt);
358                         continue;
359                 }
360                 spin_unlock(&dcache_lock);
361
362                 /* umounting @mnt */
363                 ll_gns_umount_object(mnt);
364
365                 mntput(mnt);
366                 spin_lock(&dcache_lock);
367         }
368         spin_unlock(&dcache_lock);
369         RETURN(0);
370 }
371
372 /*
373  * GNS timer callback function. It restarts gns timer and wakes up GNS control
374  * thread to process mounts list.
375  */
376 void ll_gns_timer_callback(unsigned long data)
377 {
378         struct ll_sb_info *sbi = (void *)data;
379         ENTRY;
380
381         spin_lock(&gns_lock);
382         if (list_empty(&sbi->ll_gns_sbi_head))
383                 list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
384         spin_unlock(&gns_lock);
385         
386         wake_up(&gns_thread.t_ctl_waitq);
387         mod_timer(&sbi->ll_gns_timer,
388                   jiffies + sbi->ll_gns_tick * HZ);
389 }
390
391 /* this function checks if something new happened to exist in gns list. */
392 static int inline ll_gns_check_event(void)
393 {
394         int rc;
395         
396         spin_lock(&gns_lock);
397         rc = !list_empty(&gns_sbi_list);
398         spin_unlock(&gns_lock);
399
400         return rc;
401 }
402
403 /* should we stop GNS control thread? */
404 static int inline ll_gns_check_stop(void)
405 {
406         mb();
407         return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
408 }
409
410 /* GNS control thread function. */
411 static int ll_gns_thread_main(void *arg)
412 {
413         struct ll_gns_ctl *ctl = arg;
414         unsigned long flags;
415         ENTRY;
416
417         {
418                 char name[sizeof(current->comm)];
419                 snprintf(name, sizeof(name) - 1, "ll_gns");
420                 kportal_daemonize(name);
421         }
422         
423         SIGNAL_MASK_LOCK(current, flags);
424         sigfillset(&current->blocked);
425         RECALC_SIGPENDING;
426         SIGNAL_MASK_UNLOCK(current, flags);
427
428         /*
429          * letting starting function know, that we are ready and control may be
430          * returned.
431          */
432         gns_thread.t_flags = SVC_RUNNING;
433         complete(&ctl->gc_starting);
434
435         while (!ll_gns_check_stop()) {
436                 struct l_wait_info lwi = { 0 };
437
438                 l_wait_event(gns_thread.t_ctl_waitq,
439                              (ll_gns_check_event() ||
440                               ll_gns_check_stop()), &lwi);
441                 
442                 spin_lock(&gns_lock);
443                 while (!list_empty(&gns_sbi_list)) {
444                         struct ll_sb_info *sbi;
445
446                         sbi = list_entry(gns_sbi_list.prev,
447                                          struct ll_sb_info,
448                                          ll_gns_sbi_head);
449                         
450                         list_del_init(&sbi->ll_gns_sbi_head);
451                         spin_unlock(&gns_lock);
452                         ll_gns_check_mounts(sbi, LL_GNS_CHECK);
453                         spin_lock(&gns_lock);
454                 }
455                 spin_unlock(&gns_lock);
456         }
457
458         EXIT;
459         gns_thread.t_flags = SVC_STOPPED;
460
461         /* this is SMP-safe way to finish thread. */
462         complete_and_exit(&ctl->gc_finishing, 0);
463 }
464
465 void ll_gns_add_timer(struct ll_sb_info *sbi)
466 {
467         mod_timer(&sbi->ll_gns_timer,
468                   jiffies + sbi->ll_gns_tick * HZ);
469 }
470
471 void ll_gns_del_timer(struct ll_sb_info *sbi)
472 {
473         del_timer(&sbi->ll_gns_timer);
474 }
475
476 /*
477  * starts GNS control thread and waits for a signal it is up and work may be
478  * continued.
479  */
480 int ll_gns_start_thread(void)
481 {
482         int rc;
483         ENTRY;
484
485         LASSERT(gns_thread.t_flags == 0);
486         init_completion(&gns_ctl.gc_starting);
487         init_completion(&gns_ctl.gc_finishing);
488         init_waitqueue_head(&gns_thread.t_ctl_waitq);
489         
490         rc = kernel_thread(ll_gns_thread_main, &gns_ctl,
491                            (CLONE_VM | CLONE_FILES));
492         if (rc < 0) {
493                 CERROR("cannot start GNS control thread, "
494                        "err = %d\n", rc);
495                 RETURN(rc);
496         }
497         wait_for_completion(&gns_ctl.gc_starting);
498         LASSERT(gns_thread.t_flags == SVC_RUNNING);
499         RETURN(0);
500 }
501
502 /* stops GNS control thread and waits its actual stop. */
503 void ll_gns_stop_thread(void)
504 {
505         ENTRY;
506         gns_thread.t_flags = SVC_STOPPING;
507         wake_up(&gns_thread.t_ctl_waitq);
508         wait_for_completion(&gns_ctl.gc_finishing);
509         LASSERT(gns_thread.t_flags == SVC_STOPPED);
510         gns_thread.t_flags = 0;
511         EXIT;
512 }