Whamcloud - gitweb
Branch b1_6
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  lustre/lib/lvfs_linux.c
5  *  Lustre filesystem abstraction routines
6  *
7  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8  *   Author: Andreas Dilger <adilger@clusterfs.com>
9  *
10  *   This file is part of Lustre, http://www.lustre.org.
11  *
12  *   Lustre is free software; you can redistribute it and/or
13  *   modify it under the terms of version 2 of the GNU General Public
14  *   License as published by the Free Software Foundation.
15  *
16  *   Lustre is distributed in the hope that it will be useful,
17  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
18  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  *   GNU General Public License for more details.
20  *
21  *   You should have received a copy of the GNU General Public License
22  *   along with Lustre; if not, write to the Free Software
23  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24  */
25
26 #ifndef EXPORT_SYMTAB
27 # define EXPORT_SYMTAB
28 #endif
29
30 #define DEBUG_SUBSYSTEM S_FILTER
31
32 #include <linux/version.h>
33 #include <linux/fs.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <lustre_fsfilt.h>
41 #include <obd.h>
42 #include <linux/module.h>
43 #include <linux/init.h>
44 #include <linux/lustre_compat25.h>
45 #include <lvfs.h>
46 #include "lvfs_internal.h"
47
48 #include <obd.h>
49 #include <lustre_lib.h>
50 #include <lustre_quota.h>
51
52 atomic_t obd_memory;
53 int obd_memmax;
54 unsigned int obd_fail_val;
55 unsigned int obd_fail_loc;
56 unsigned int obd_alloc_fail_rate = 0;
57
/*
 * Context sanity checks.  They expand to real assertions only when
 * OBD_CTXT_DEBUG is defined (a development aid); otherwise they compile
 * to empty statements.
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#else
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
# define ASSERT_NOT_KERNEL_CTXT(msg) \
        LASSERTF(!segment_eq(get_fs(), get_ds()), msg)
#endif
69
70 static void push_group_info(struct lvfs_run_ctxt *save,
71                             struct upcall_cache_entry *uce)
72 {
73         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
74
75         if (!ginfo) {
76                 save->ngroups = current_ngroups;
77                 current_ngroups = 0;
78         } else {
79 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
80                 task_lock(current);
81                 save->group_info = current->group_info;
82                 current->group_info = ginfo;
83                 task_unlock(current);
84 #else
85                 LASSERT(ginfo->ngroups <= NGROUPS);
86                 LASSERT(current->ngroups <= NGROUPS_SMALL);
87                 /* save old */
88                 save->group_info.ngroups = current->ngroups;
89                 if (current->ngroups)
90                         memcpy(save->group_info.small_block, current->groups,
91                                current->ngroups * sizeof(gid_t));
92                 /* push new */
93                 current->ngroups = ginfo->ngroups;
94                 if (ginfo->ngroups)
95                         memcpy(current->groups, ginfo->small_block,
96                                current->ngroups * sizeof(gid_t));
97 #endif
98         }
99 }
100
101 static void pop_group_info(struct lvfs_run_ctxt *save,
102                            struct upcall_cache_entry *uce)
103 {
104         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
105
106         if (!ginfo) {
107                 current_ngroups = save->ngroups;
108         } else {
109 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
110                 task_lock(current);
111                 current->group_info = save->group_info;
112                 task_unlock(current);
113 #else
114                 current->ngroups = save->group_info.ngroups;
115                 if (current->ngroups)
116                         memcpy(current->groups, save->group_info.small_block,
117                                current->ngroups * sizeof(gid_t));
118 #endif
119         }
120 }
121
122 /* push / pop to root of obd store */
123 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
124                struct lvfs_ucred *uc)
125 {
126         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
127         ASSERT_CTXT_MAGIC(new_ctx->magic);
128         OBD_SET_CTXT_MAGIC(save);
129
130         /*
131         CDEBUG(D_INFO,
132                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
133                save, current, current->fs, current->fs->pwd,
134                atomic_read(&current->fs->pwd->d_count),
135                atomic_read(&current->fs->pwd->d_inode->i_count),
136                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
137                current->fs->pwdmnt,
138                atomic_read(&current->fs->pwdmnt->mnt_count));
139         */
140
141         save->fs = get_fs();
142         LASSERT(atomic_read(&current->fs->pwd->d_count));
143         LASSERT(atomic_read(&new_ctx->pwd->d_count));
144         save->pwd = dget(current->fs->pwd);
145         save->pwdmnt = mntget(current->fs->pwdmnt);
146         save->luc.luc_umask = current->fs->umask;
147
148         LASSERT(save->pwd);
149         LASSERT(save->pwdmnt);
150         LASSERT(new_ctx->pwd);
151         LASSERT(new_ctx->pwdmnt);
152
153         if (uc) {
154                 save->luc.luc_fsuid = current->fsuid;
155                 save->luc.luc_fsgid = current->fsgid;
156                 save->luc.luc_cap = current->cap_effective;
157
158                 current->fsuid = uc->luc_fsuid;
159                 current->fsgid = uc->luc_fsgid;
160                 current->cap_effective = uc->luc_cap;
161                 push_group_info(save, uc->luc_uce);
162         }
163         current->fs->umask = 0; /* umask already applied on client */
164         set_fs(new_ctx->fs);
165         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
166
167         /*
168         CDEBUG(D_INFO,
169                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
170                new_ctx, current, current->fs, current->fs->pwd,
171                atomic_read(&current->fs->pwd->d_count),
172                atomic_read(&current->fs->pwd->d_inode->i_count),
173                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
174                current->fs->pwdmnt,
175                atomic_read(&current->fs->pwdmnt->mnt_count));
176         */
177 }
178 EXPORT_SYMBOL(push_ctxt);
179
180 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
181               struct lvfs_ucred *uc)
182 {
183         //printk("pc0");
184         ASSERT_CTXT_MAGIC(saved->magic);
185         //printk("pc1");
186         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
187
188         /*
189         CDEBUG(D_INFO,
190                " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
191                new_ctx, current, current->fs, current->fs->pwd,
192                atomic_read(&current->fs->pwd->d_count),
193                atomic_read(&current->fs->pwd->d_inode->i_count),
194                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
195                current->fs->pwdmnt,
196                atomic_read(&current->fs->pwdmnt->mnt_count));
197         */
198
199         LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
200                  current->fs->pwd, new_ctx->pwd);
201         LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
202                  current->fs->pwdmnt, new_ctx->pwdmnt);
203
204         set_fs(saved->fs);
205         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
206
207         dput(saved->pwd);
208         mntput(saved->pwdmnt);
209         current->fs->umask = saved->luc.luc_umask;
210         if (uc) {
211                 current->fsuid = saved->luc.luc_fsuid;
212                 current->fsgid = saved->luc.luc_fsgid;
213                 current->cap_effective = saved->luc.luc_cap;
214                 pop_group_info(saved, uc->luc_uce);
215         }
216
217         /*
218         CDEBUG(D_INFO,
219                "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
220                saved, current, current->fs, current->fs->pwd,
221                atomic_read(&current->fs->pwd->d_count),
222                atomic_read(&current->fs->pwd->d_inode->i_count),
223                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
224                current->fs->pwdmnt,
225                atomic_read(&current->fs->pwdmnt->mnt_count));
226         */
227 }
228 EXPORT_SYMBOL(pop_ctxt);
229
230 /* utility to make a file */
231 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
232 {
233         struct dentry *dchild;
234         int err = 0;
235         ENTRY;
236
237         ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
238         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
239
240         dchild = ll_lookup_one_len(name, dir, strlen(name));
241         if (IS_ERR(dchild))
242                 GOTO(out_up, dchild);
243
244         if (dchild->d_inode) {
245                 int old_mode = dchild->d_inode->i_mode;
246                 if (!S_ISREG(old_mode))
247                         GOTO(out_err, err = -EEXIST);
248
249                 /* Fixup file permissions if necessary */
250                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
251                         CWARN("fixing permissions on %s from %o to %o\n",
252                               name, old_mode, mode);
253                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
254                                                   (old_mode & ~S_IALLUGO);
255                         mark_inode_dirty(dchild->d_inode);
256                 }
257                 GOTO(out_up, dchild);
258         }
259
260         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
261                             NULL);
262         if (err)
263                 GOTO(out_err, err);
264
265         RETURN(dchild);
266
267 out_err:
268         dput(dchild);
269         dchild = ERR_PTR(err);
270 out_up:
271         return dchild;
272 }
273 EXPORT_SYMBOL(simple_mknod);
274
275 /* utility to make a directory */
276 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
277 {
278         struct dentry *dchild;
279         int err = 0;
280         ENTRY;
281
282         ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
283         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
284         dchild = ll_lookup_one_len(name, dir, strlen(name));
285         if (IS_ERR(dchild))
286                 GOTO(out_up, dchild);
287
288         if (dchild->d_inode) {
289                 int old_mode = dchild->d_inode->i_mode;
290                 if (!S_ISDIR(old_mode)) {
291                         CERROR("found %s (%lu/%u) is mode %o\n", name,
292                                dchild->d_inode->i_ino,
293                                dchild->d_inode->i_generation, old_mode);
294                         GOTO(out_err, err = -ENOTDIR);
295                 }
296
297                 /* Fixup directory permissions if necessary */
298                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
299                         CDEBUG(D_CONFIG, 
300                                "fixing permissions on %s from %o to %o\n",
301                                name, old_mode, mode);
302                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
303                                                   (old_mode & ~S_IALLUGO);
304                         mark_inode_dirty(dchild->d_inode);
305                 }
306                 GOTO(out_up, dchild);
307         }
308
309         err = vfs_mkdir(dir->d_inode, dchild, mode);
310         if (err)
311                 GOTO(out_err, err);
312
313         RETURN(dchild);
314
315 out_err:
316         dput(dchild);
317         dchild = ERR_PTR(err);
318 out_up:
319         return dchild;
320 }
321 EXPORT_SYMBOL(simple_mkdir);
322
323 /* utility to rename a file */
324 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
325 {
326         struct dentry *dchild_old, *dchild_new;
327         int err = 0;
328         ENTRY;
329
330         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
331         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
332                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
333
334         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
335         if (IS_ERR(dchild_old))
336                 RETURN(PTR_ERR(dchild_old));
337
338         if (!dchild_old->d_inode) 
339                 GOTO(put_old, err = -ENOENT);
340
341         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
342         if (IS_ERR(dchild_new))
343                 GOTO(put_old, err = PTR_ERR(dchild_new));
344
345         err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
346
347         dput(dchild_new);
348 put_old:
349         dput(dchild_old);
350         RETURN(err);
351 }
352 EXPORT_SYMBOL(lustre_rename);
353
354 /*
355  * Read a file from within kernel context.  Prior to calling this
356  * function we should already have done a push_ctxt().
357  */
358 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
359 {
360         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
361         if (!file || !file->f_op || !file->f_op->read || !off)
362                 RETURN(-ENOSYS);
363
364         return file->f_op->read(file, buf, len, off);
365 }
366 EXPORT_SYMBOL(lustre_fread);
367
368 /*
369  * Write a file from within kernel context.  Prior to calling this
370  * function we should already have done a push_ctxt().
371  */
372 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
373 {
374         ENTRY;
375         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
376         if (!file)
377                 RETURN(-ENOENT);
378         if (!file->f_op)
379                 RETURN(-ENOSYS);
380         if (!off)
381                 RETURN(-EINVAL);
382
383         if (!file->f_op->write)
384                 RETURN(-EROFS);
385
386         RETURN(file->f_op->write(file, buf, len, off));
387 }
388 EXPORT_SYMBOL(lustre_fwrite);
389
390 /*
391  * Sync a file from within kernel context.  Prior to calling this
392  * function we should already have done a push_ctxt().
393  */
394 int lustre_fsync(struct file *file)
395 {
396         ENTRY;
397         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
398         if (!file || !file->f_op || !file->f_op->fsync)
399                 RETURN(-ENOSYS);
400
401         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
402 }
403 EXPORT_SYMBOL(lustre_fsync);
404
405 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
406                              int flags)
407 {
408         mntget(ctxt->pwdmnt);
409         return dentry_open(de, ctxt->pwdmnt, flags);
410 }
411 EXPORT_SYMBOL(l_dentry_open);
412
413 #ifdef HAVE_VFS_READDIR_U64_INO
414 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
415                      u64 ino, unsigned int d_type)
416 #else
417 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
418                      ino_t ino, unsigned int d_type)
419 #endif
420 {
421         struct l_linux_dirent *dirent;
422         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
423
424         dirent = buf->lrc_dirent;
425         if (dirent)
426                dirent->lld_off = offset;
427
428         OBD_ALLOC(dirent, sizeof(*dirent));
429
430         if (!dirent)
431                 return -ENOMEM;
432
433         list_add_tail(&dirent->lld_list, buf->lrc_list);
434
435         buf->lrc_dirent = dirent;
436         dirent->lld_ino = ino;
437         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
438         memcpy(dirent->lld_name, name, namlen);
439
440         return 0;
441 }
442
443 long l_readdir(struct file *file, struct list_head *dentry_list)
444 {
445         struct l_linux_dirent *lastdirent;
446         struct l_readdir_callback buf;
447         int error;
448
449         buf.lrc_dirent = NULL;
450         buf.lrc_list = dentry_list; 
451
452         error = vfs_readdir(file, l_filldir, &buf);
453         if (error < 0)
454                 return error;
455
456         lastdirent = buf.lrc_dirent;
457         if (lastdirent)
458                 lastdirent->lld_off = file->f_pos;
459
460         return 0; 
461 }
462 EXPORT_SYMBOL(l_readdir);
463 EXPORT_SYMBOL(obd_memory);
464 EXPORT_SYMBOL(obd_memmax);
465
#ifdef LUSTRE_KERNEL_VERSION
# ifndef HAVE_CLEAR_RDONLY_ON_PUT
#  error rdonly patchset must be updated [cfs bz11248]
# endif

/* Defined outside this file. */
extern void dev_set_rdonly(lvfs_sbdev_type dev);
extern int dev_check_rdonly(lvfs_sbdev_type dev);

/*
 * Sync @dev, then mark it read-only.  If a separate journal device @jdev is
 * given it is marked read-only first.
 */
void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
{
        lvfs_sbdev_sync(dev);

        if (jdev && jdev != dev) {
                CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
                       (long)jdev);
                dev_set_rdonly(jdev);
        }

        CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
        dev_set_rdonly(dev);
}
EXPORT_SYMBOL(__lvfs_set_rdonly);

/* Thin wrapper: returns dev_check_rdonly() for @dev. */
int lvfs_check_rdonly(lvfs_sbdev_type dev)
{
        return dev_check_rdonly(dev);
}
EXPORT_SYMBOL(lvfs_check_rdonly);
#endif /* LUSTRE_KERNEL_VERSION */
494
495 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
496 {
497         char *write_page = NULL;
498         loff_t offset = 0;
499         int rc = 0;
500         ENTRY;
501
502         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
503         if (!write_page)
504                 RETURN(-ENOMEM);
505         
506         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
507        
508         OBD_FREE(write_page, CFS_PAGE_SIZE);
509
510         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
511         RETURN(rc); 
512 }
513 EXPORT_SYMBOL(lvfs_check_io_health);
514
515 static int __init lvfs_linux_init(void)
516 {
517         RETURN(0);
518 }
519
520 static void __exit lvfs_linux_exit(void)
521 {
522         int leaked;
523         ENTRY;
524
525         leaked = atomic_read(&obd_memory);
526         CDEBUG(leaked ? D_ERROR : D_INFO,
527                "obd mem max: %d leaked: %d\n", obd_memmax, leaked);
528
529         EXIT;
530         return;
531 }
532
533 EXPORT_SYMBOL(obd_fail_loc);
534 EXPORT_SYMBOL(obd_alloc_fail_rate);
535 EXPORT_SYMBOL(obd_fail_val);
536
537 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
538 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
539 MODULE_LICENSE("GPL");
540
541 module_init(lvfs_linux_init);
542 module_exit(lvfs_linux_exit);