Whamcloud - gitweb
b=11973
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  lustre/lvfs/lvfs_linux.c
5  *  Lustre filesystem abstraction routines
6  *
7  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8  *   Author: Andreas Dilger <adilger@clusterfs.com>
9  *
10  *   This file is part of Lustre, http://www.lustre.org.
11  *
12  *   Lustre is free software; you can redistribute it and/or
13  *   modify it under the terms of version 2 of the GNU General Public
14  *   License as published by the Free Software Foundation.
15  *
16  *   Lustre is distributed in the hope that it will be useful,
17  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
18  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  *   GNU General Public License for more details.
20  *
21  *   You should have received a copy of the GNU General Public License
22  *   along with Lustre; if not, write to the Free Software
23  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24  */
25
26 #ifndef EXPORT_SYMTAB
27 # define EXPORT_SYMTAB
28 #endif
29
30 #define DEBUG_SUBSYSTEM S_FILTER
31
32 #include <linux/version.h>
33 #include <linux/fs.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <lustre_fsfilt.h>
41 #include <obd.h>
42 #include <obd_class.h>
43 #include <linux/module.h>
44 #include <linux/init.h>
45 #include <linux/lustre_compat25.h>
46 #include <lvfs.h>
47 #include "lvfs_internal.h"
48
49 #include <obd.h>
50 #include <lustre_lib.h>
51 #include <lustre_quota.h>
52
/*
 * Memory accounting counters, exported to other modules below.
 *
 * obd_memory is read with atomic_read() at module exit and reported as the
 * "leaked" amount; obd_memmax is reported alongside it as "obd mem max".
 * NOTE(review): presumably both are maintained by the OBD allocation
 * macros, which are not defined in this file -- confirm.
 */
atomic_t obd_memory;
int obd_memmax;
55
/*
 * Debugging checks only needed during development.
 *
 * With OBD_CTXT_DEBUG defined:
 *   ASSERT_CTXT_MAGIC(magic)     asserts magic == OBD_RUN_CTXT_MAGIC
 *   ASSERT_NOT_KERNEL_CTXT(msg)  asserts get_fs() and get_ds() differ
 *   ASSERT_KERNEL_CTXT(msg)      asserts get_fs() and get_ds() are equal
 * Without it, all three expand to an empty statement.
 */
#ifdef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
                                              msg)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#else
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#endif
67
68 static void push_group_info(struct lvfs_run_ctxt *save,
69                             struct upcall_cache_entry *uce)
70 {
71         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
72
73         if (!ginfo) {
74                 save->ngroups = current_ngroups;
75                 current_ngroups = 0;
76         } else {
77 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
78                 task_lock(current);
79                 save->group_info = current->group_info;
80                 current->group_info = ginfo;
81                 task_unlock(current);
82 #else
83                 LASSERT(ginfo->ngroups <= NGROUPS);
84                 LASSERT(current->ngroups <= NGROUPS_SMALL);
85                 /* save old */
86                 save->group_info.ngroups = current->ngroups;
87                 if (current->ngroups)
88                         memcpy(save->group_info.small_block, current->groups,
89                                current->ngroups * sizeof(gid_t));
90                 /* push new */
91                 current->ngroups = ginfo->ngroups;
92                 if (ginfo->ngroups)
93                         memcpy(current->groups, ginfo->small_block,
94                                current->ngroups * sizeof(gid_t));
95 #endif
96         }
97 }
98
99 static void pop_group_info(struct lvfs_run_ctxt *save,
100                            struct upcall_cache_entry *uce)
101 {
102         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
103
104         if (!ginfo) {
105                 current_ngroups = save->ngroups;
106         } else {
107 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
108                 task_lock(current);
109                 current->group_info = save->group_info;
110                 task_unlock(current);
111 #else
112                 current->ngroups = save->group_info.ngroups;
113                 if (current->ngroups)
114                         memcpy(current->groups, save->group_info.small_block,
115                                current->ngroups * sizeof(gid_t));
116 #endif
117         }
118 }
119
/*
 * push / pop to root of obd store
 *
 * push_ctxt() switches the current task into the run context @new_ctx and
 * records everything it changes in @save so that pop_ctxt() can undo it:
 *
 *  - the address limit returned by get_fs() is saved and replaced with
 *    new_ctx->fs;
 *  - the task's pwd / pwdmnt are saved (taking a reference on each with
 *    dget() / mntget()) and replaced with new_ctx's via ll_set_fs_pwd();
 *  - the umask is saved and forced to 0;
 *  - if @uc is non-NULL, fsuid, fsgid and the effective capability set are
 *    saved and replaced with the values from @uc, and the supplementary
 *    groups are switched with push_group_info().
 *
 * \param save     receives the state being replaced
 * \param new_ctx  context to enter; its pwd and pwdmnt must be non-NULL
 * \param uc       optional credentials to assume; may be NULL
 */
void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
               struct lvfs_ucred *uc)
{
        //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
        ASSERT_CTXT_MAGIC(new_ctx->magic);
        OBD_SET_CTXT_MAGIC(save);

        /*
        CDEBUG(D_INFO,
               "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
               save, current, current->fs, current->fs->pwd,
               atomic_read(&current->fs->pwd->d_count),
               atomic_read(&current->fs->pwd->d_inode->i_count),
               current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
               current->fs->pwdmnt,
               atomic_read(&current->fs->pwdmnt->mnt_count));
        */

        /* Save the address limit and take references on the current
         * working directory and its mount before anything is switched. */
        save->fs = get_fs();
        LASSERT(atomic_read(&current->fs->pwd->d_count));
        LASSERT(atomic_read(&new_ctx->pwd->d_count));
        save->pwd = dget(current->fs->pwd);
        save->pwdmnt = mntget(current->fs->pwdmnt);
        save->luc.luc_umask = current->fs->umask;

        /* NOTE(review): new_ctx->pwd was already dereferenced by the
         * d_count assertion above, so the NULL check on it below cannot
         * fire first. */
        LASSERT(save->pwd);
        LASSERT(save->pwdmnt);
        LASSERT(new_ctx->pwd);
        LASSERT(new_ctx->pwdmnt);

        /* Credentials are only touched when the caller supplies some. */
        if (uc) {
                save->luc.luc_fsuid = current->fsuid;
                save->luc.luc_fsgid = current->fsgid;
                save->luc.luc_cap = current->cap_effective;

                current->fsuid = uc->luc_fsuid;
                current->fsgid = uc->luc_fsgid;
                current->cap_effective = uc->luc_cap;
                push_group_info(save, uc->luc_uce);
        }
        current->fs->umask = 0; /* umask already applied on client */
        set_fs(new_ctx->fs);
        ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);

        /*
        CDEBUG(D_INFO,
               "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
               new_ctx, current, current->fs, current->fs->pwd,
               atomic_read(&current->fs->pwd->d_count),
               atomic_read(&current->fs->pwd->d_inode->i_count),
               current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
               current->fs->pwdmnt,
               atomic_read(&current->fs->pwdmnt->mnt_count));
        */
}
EXPORT_SYMBOL(push_ctxt);
177
/*
 * Leave the run context entered by push_ctxt() and restore the state
 * recorded in @saved.
 *
 * The task's pwd and pwdmnt must still be the ones from @new_ctx; this is
 * asserted.  The address limit, pwd / pwdmnt and umask are always restored,
 * and the references taken on the saved pwd / pwdmnt by push_ctxt() are
 * dropped.  fsuid, fsgid, the effective capabilities and the supplementary
 * groups are restored only when @uc is non-NULL, mirroring the condition
 * under which push_ctxt() saved them, so callers should pass the same @uc
 * to both.
 *
 * \param saved    state recorded by push_ctxt()
 * \param new_ctx  context being left (used for the consistency assertions)
 * \param uc       credentials that were passed to push_ctxt(); may be NULL
 */
void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
              struct lvfs_ucred *uc)
{
        //printk("pc0");
        ASSERT_CTXT_MAGIC(saved->magic);
        //printk("pc1");
        ASSERT_KERNEL_CTXT("popping non-kernel context!\n");

        /*
        CDEBUG(D_INFO,
               " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
               new_ctx, current, current->fs, current->fs->pwd,
               atomic_read(&current->fs->pwd->d_count),
               atomic_read(&current->fs->pwd->d_inode->i_count),
               current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
               current->fs->pwdmnt,
               atomic_read(&current->fs->pwdmnt->mnt_count));
        */

        /* The task must not have changed directory since push_ctxt(). */
        LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
                 current->fs->pwd, new_ctx->pwd);
        LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
                 current->fs->pwdmnt, new_ctx->pwdmnt);

        set_fs(saved->fs);
        ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);

        /* Drop the references push_ctxt() took with dget() / mntget(). */
        dput(saved->pwd);
        mntput(saved->pwdmnt);
        current->fs->umask = saved->luc.luc_umask;
        if (uc) {
                current->fsuid = saved->luc.luc_fsuid;
                current->fsgid = saved->luc.luc_fsgid;
                current->cap_effective = saved->luc.luc_cap;
                pop_group_info(saved, uc->luc_uce);
        }

        /*
        CDEBUG(D_INFO,
               "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
               saved, current, current->fs, current->fs->pwd,
               atomic_read(&current->fs->pwd->d_count),
               atomic_read(&current->fs->pwd->d_inode->i_count),
               current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
               current->fs->pwdmnt,
               atomic_read(&current->fs->pwdmnt->mnt_count));
        */
}
EXPORT_SYMBOL(pop_ctxt);
227
228 /* utility to make a file */
229 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
230 {
231         struct dentry *dchild;
232         int err = 0;
233         ENTRY;
234
235         ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
236         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
237
238         dchild = ll_lookup_one_len(name, dir, strlen(name));
239         if (IS_ERR(dchild))
240                 GOTO(out_up, dchild);
241
242         if (dchild->d_inode) {
243                 int old_mode = dchild->d_inode->i_mode;
244                 if (!S_ISREG(old_mode))
245                         GOTO(out_err, err = -EEXIST);
246
247                 /* Fixup file permissions if necessary */
248                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
249                         CWARN("fixing permissions on %s from %o to %o\n",
250                               name, old_mode, mode);
251                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
252                                                   (old_mode & ~S_IALLUGO);
253                         mark_inode_dirty(dchild->d_inode);
254                 }
255                 GOTO(out_up, dchild);
256         }
257
258         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
259                             NULL);
260         if (err)
261                 GOTO(out_err, err);
262
263         RETURN(dchild);
264
265 out_err:
266         dput(dchild);
267         dchild = ERR_PTR(err);
268 out_up:
269         return dchild;
270 }
271 EXPORT_SYMBOL(simple_mknod);
272
273 /* utility to make a directory */
274 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
275 {
276         struct dentry *dchild;
277         int err = 0;
278         ENTRY;
279
280         ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
281         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
282         dchild = ll_lookup_one_len(name, dir, strlen(name));
283         if (IS_ERR(dchild))
284                 GOTO(out_up, dchild);
285
286         if (dchild->d_inode) {
287                 int old_mode = dchild->d_inode->i_mode;
288                 if (!S_ISDIR(old_mode)) {
289                         CERROR("found %s (%lu/%u) is mode %o\n", name,
290                                dchild->d_inode->i_ino,
291                                dchild->d_inode->i_generation, old_mode);
292                         GOTO(out_err, err = -ENOTDIR);
293                 }
294
295                 /* Fixup directory permissions if necessary */
296                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
297                         CDEBUG(D_CONFIG, 
298                                "fixing permissions on %s from %o to %o\n",
299                                name, old_mode, mode);
300                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
301                                                   (old_mode & ~S_IALLUGO);
302                         mark_inode_dirty(dchild->d_inode);
303                 }
304                 GOTO(out_up, dchild);
305         }
306
307         err = vfs_mkdir(dir->d_inode, dchild, mode);
308         if (err)
309                 GOTO(out_err, err);
310
311         RETURN(dchild);
312
313 out_err:
314         dput(dchild);
315         dchild = ERR_PTR(err);
316 out_up:
317         return dchild;
318 }
319 EXPORT_SYMBOL(simple_mkdir);
320
321 /* utility to rename a file */
322 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
323 {
324         struct dentry *dchild_old, *dchild_new;
325         int err = 0;
326         ENTRY;
327
328         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
329         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
330                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
331
332         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
333         if (IS_ERR(dchild_old))
334                 RETURN(PTR_ERR(dchild_old));
335
336         if (!dchild_old->d_inode) 
337                 GOTO(put_old, err = -ENOENT);
338
339         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
340         if (IS_ERR(dchild_new))
341                 GOTO(put_old, err = PTR_ERR(dchild_new));
342
343         err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
344
345         dput(dchild_new);
346 put_old:
347         dput(dchild_old);
348         RETURN(err);
349 }
350 EXPORT_SYMBOL(lustre_rename);
351
352 /*
353  * Read a file from within kernel context.  Prior to calling this
354  * function we should already have done a push_ctxt().
355  */
356 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
357 {
358         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
359         if (!file || !file->f_op || !file->f_op->read || !off)
360                 RETURN(-ENOSYS);
361
362         return file->f_op->read(file, buf, len, off);
363 }
364 EXPORT_SYMBOL(lustre_fread);
365
366 /*
367  * Write a file from within kernel context.  Prior to calling this
368  * function we should already have done a push_ctxt().
369  */
370 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
371 {
372         ENTRY;
373         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
374         if (!file)
375                 RETURN(-ENOENT);
376         if (!file->f_op)
377                 RETURN(-ENOSYS);
378         if (!off)
379                 RETURN(-EINVAL);
380
381         if (!file->f_op->write)
382                 RETURN(-EROFS);
383
384         RETURN(file->f_op->write(file, buf, len, off));
385 }
386 EXPORT_SYMBOL(lustre_fwrite);
387
388 /*
389  * Sync a file from within kernel context.  Prior to calling this
390  * function we should already have done a push_ctxt().
391  */
392 int lustre_fsync(struct file *file)
393 {
394         ENTRY;
395         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
396         if (!file || !file->f_op || !file->f_op->fsync)
397                 RETURN(-ENOSYS);
398
399         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
400 }
401 EXPORT_SYMBOL(lustre_fsync);
402
403 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
404                              int flags)
405 {
406         mntget(ctxt->pwdmnt);
407         return dentry_open(de, ctxt->pwdmnt, flags);
408 }
409 EXPORT_SYMBOL(l_dentry_open);
410
411 #ifdef HAVE_VFS_READDIR_U64_INO
412 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
413                      u64 ino, unsigned int d_type)
414 #else
415 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
416                      ino_t ino, unsigned int d_type)
417 #endif
418 {
419         struct l_linux_dirent *dirent;
420         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
421
422         dirent = buf->lrc_dirent;
423         if (dirent)
424                dirent->lld_off = offset;
425
426         OBD_ALLOC(dirent, sizeof(*dirent));
427
428         if (!dirent)
429                 return -ENOMEM;
430
431         list_add_tail(&dirent->lld_list, buf->lrc_list);
432
433         buf->lrc_dirent = dirent;
434         dirent->lld_ino = ino;
435         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
436         memcpy(dirent->lld_name, name, namlen);
437
438         return 0;
439 }
440
441 long l_readdir(struct file *file, struct list_head *dentry_list)
442 {
443         struct l_linux_dirent *lastdirent;
444         struct l_readdir_callback buf;
445         int error;
446
447         buf.lrc_dirent = NULL;
448         buf.lrc_list = dentry_list; 
449
450         error = vfs_readdir(file, l_filldir, &buf);
451         if (error < 0)
452                 return error;
453
454         lastdirent = buf.lrc_dirent;
455         if (lastdirent)
456                 lastdirent->lld_off = file->f_pos;
457
458         return 0; 
459 }
460 EXPORT_SYMBOL(l_readdir);
461 EXPORT_SYMBOL(obd_memory);
462 EXPORT_SYMBOL(obd_memmax);
463
464 #ifdef LUSTRE_KERNEL_VERSION
465 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
466 #error rdonly patchset must be updated [cfs bz11248]
467 #endif
468
469 void dev_set_rdonly(lvfs_sbdev_type dev);
470 int dev_check_rdonly(lvfs_sbdev_type dev);
471
/*
 * Mark a device, and its journal device if there is a separate one,
 * read-only.
 *
 * @dev is synced with lvfs_sbdev_sync() first.  If @jdev is set and differs
 * from @dev it is passed to dev_set_rdonly() before @dev itself is.
 *
 * \param dev   device to make read-only
 * \param jdev  journal device; may be unset or identical to @dev
 */
void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
{
        lvfs_sbdev_sync(dev);
        if (jdev && (jdev != dev)) {
                CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
                       (long)jdev);
                dev_set_rdonly(jdev);
        }
        CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
        dev_set_rdonly(dev);
}
483
/*
 * Report the read-only state of @dev; returns the result of
 * dev_check_rdonly() unchanged.
 */
int lvfs_check_rdonly(lvfs_sbdev_type dev)
{
        return dev_check_rdonly(dev);
}
488
489 EXPORT_SYMBOL(__lvfs_set_rdonly);
490 EXPORT_SYMBOL(lvfs_check_rdonly);
491 #endif /* LUSTRE_KERNEL_VERSION */
492
493 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
494 {
495         char *write_page = NULL;
496         loff_t offset = 0;
497         int rc = 0;
498         ENTRY;
499
500         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
501         if (!write_page)
502                 RETURN(-ENOMEM);
503         
504         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
505        
506         OBD_FREE(write_page, CFS_PAGE_SIZE);
507
508         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
509         RETURN(rc); 
510 }
511 EXPORT_SYMBOL(lvfs_check_io_health);
512
513 static int __init lvfs_linux_init(void)
514 {
515         RETURN(0);
516 }
517
518 static void __exit lvfs_linux_exit(void)
519 {
520         int leaked;
521         ENTRY;
522
523         leaked = atomic_read(&obd_memory);
524         CDEBUG(leaked ? D_ERROR : D_INFO,
525                "obd mem max: %d leaked: %d\n", obd_memmax, leaked);
526
527         EXIT;
528         return;
529 }
530
531 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
532 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
533 MODULE_LICENSE("GPL");
534
535 module_init(lvfs_linux_init);
536 module_exit(lvfs_linux_exit);