1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/lib/lvfs_linux.c
5 * Lustre filesystem abstraction routines
7 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8 * Author: Andreas Dilger <adilger@clusterfs.com>
10 * This file is part of Lustre, http://www.lustre.org.
12 * Lustre is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Lustre is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Lustre; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 # define EXPORT_SYMTAB
30 #define DEBUG_SUBSYSTEM S_FILTER
32 #include <linux/version.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <lustre_fsfilt.h>
42 #include <obd_class.h>
43 #include <linux/module.h>
44 #include <linux/init.h>
45 #include <linux/lustre_compat25.h>
47 #include "lvfs_internal.h"
50 #include <lustre_lib.h>
51 #include <lustre_quota.h>
56 /* Debugging check only needed during development */
/* Debug variants: verify the lvfs_run_ctxt magic value and the task's
 * address limit (get_fs()/get_ds()) around push_ctxt/pop_ctxt.
 * NOTE(review): the surrounding #if/#else/#endif lines are missing from
 * this extract; the two groups below are the debug and non-debug arms. */
58 # define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
59 # define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
61 # define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
/* Non-debug variants: the checks compile away to nothing. */
63 # define ASSERT_CTXT_MAGIC(magic) do {} while(0)
64 # define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
65 # define ASSERT_KERNEL_CTXT(msg) do {} while(0)
/* Save the current task's supplementary groups into @save and install
 * @ginfo as the active group list.  Two code paths: >= 2.6.4 kernels use
 * the refcounted struct group_info pointer; older kernels copy the small
 * fixed-size gid array in and out of the task struct.
 * NOTE(review): interior lines (braces, #else/#endif, any locking around
 * current->group_info) are missing from this extract — do not assume the
 * visible statements are the complete function. */
68 static void push_group_info(struct lvfs_run_ctxt *save,
69 struct group_info *ginfo)
72 save->ngroups = current_ngroups;
75 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
/* 2.6.4+: swap the group_info pointer; @save keeps the old reference. */
77 save->group_info = current->group_info;
78 current->group_info = ginfo;
/* Pre-2.6.4: groups live inline in the task; bound-check then copy. */
81 LASSERT(ginfo->ngroups <= NGROUPS);
82 LASSERT(current->ngroups <= NGROUPS_SMALL);
84 save->group_info.ngroups = current->ngroups;
86 memcpy(save->group_info.small_block, current->groups,
87 current->ngroups * sizeof(gid_t));
89 current->ngroups = ginfo->ngroups;
91 memcpy(current->groups, ginfo->small_block,
92 current->ngroups * sizeof(gid_t));
/* Inverse of push_group_info(): restore the supplementary group list that
 * was stashed in @save.  NOTE(review): interior lines are missing from
 * this extract (e.g. the task_lock() pairing for the task_unlock() on
 * line 106, braces, #else/#endif) — verify against the full source. */
97 static void pop_group_info(struct lvfs_run_ctxt *save,
98 struct group_info *ginfo)
101 current_ngroups = save->ngroups;
103 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
/* 2.6.4+: put back the saved group_info pointer. */
105 current->group_info = save->group_info;
106 task_unlock(current);
/* Pre-2.6.4: copy the saved inline gid array back into the task. */
108 current->ngroups = save->group_info.ngroups;
109 if (current->ngroups)
110 memcpy(current->groups, save->group_info.small_block,
111 current->ngroups * sizeof(gid_t));
116 /* push / pop to root of obd store */
/* Switch the current task into the OBD filesystem context @new_ctx:
 * saves the task's cwd/mount, umask, uid/gid/fsuid/fsgid, capabilities
 * and supplementary groups into @save, then installs the values from
 * @new_ctx and @uc.  Must be undone with pop_ctxt() using the same @save.
 * NOTE(review): this extract is missing interior lines (braces, ENTRY,
 * the CDEBUG() heads whose argument lists appear below, and the NULL
 * handling around @uc) — do not treat the visible lines as complete.
 * NOTE(review): "¤t" throughout is mojibake for "&current"
 * (HTML-entity corruption of the extract); restore before compiling. */
117 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
118 struct lvfs_ucred *uc)
120 //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
121 ASSERT_CTXT_MAGIC(new_ctx->magic);
122 OBD_SET_CTXT_MAGIC(save);
126 "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
127 save, current, current->fs, current->fs->pwd,
128 atomic_read(¤t->fs->pwd->d_count),
129 atomic_read(¤t->fs->pwd->d_inode->i_count),
130 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
132 atomic_read(¤t->fs->pwdmnt->mnt_count));
/* Pin the old cwd/mount so they outlive the context switch (released by
 * pop_ctxt via dput/mntput). */
136 LASSERT(atomic_read(¤t->fs->pwd->d_count));
137 LASSERT(atomic_read(&new_ctx->pwd->d_count));
138 save->pwd = dget(current->fs->pwd);
139 save->pwdmnt = mntget(current->fs->pwdmnt);
140 save->luc.luc_umask = current->fs->umask;
141 save->ngroups = current->group_info->ngroups;
144 LASSERT(save->pwdmnt);
145 LASSERT(new_ctx->pwd);
146 LASSERT(new_ctx->pwdmnt);
/* Save then override the task credentials with the caller-supplied ones. */
149 save->luc.luc_uid = current->uid;
150 save->luc.luc_gid = current->gid;
151 save->luc.luc_fsuid = current->fsuid;
152 save->luc.luc_fsgid = current->fsgid;
153 save->luc.luc_cap = current->cap_effective;
155 current->uid = uc->luc_uid;
156 current->gid = uc->luc_gid;
157 current->fsuid = uc->luc_fsuid;
158 current->fsgid = uc->luc_fsgid;
159 current->cap_effective = uc->luc_cap;
/* Install the request's supplementary groups (identity ginfo when the
 * client supplied one; the fallback argument is missing from this extract). */
161 push_group_info(save,
163 uc->luc_identity ? uc->luc_identity->mi_ginfo :
166 current->fs->umask = 0; /* umask already applied on client */
168 ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
172 "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
173 new_ctx, current, current->fs, current->fs->pwd,
174 atomic_read(¤t->fs->pwd->d_count),
175 atomic_read(¤t->fs->pwd->d_inode->i_count),
176 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
178 atomic_read(¤t->fs->pwdmnt->mnt_count));
181 EXPORT_SYMBOL(push_ctxt);
/* Undo a push_ctxt(): restore the cwd/mount, umask, credentials and
 * supplementary groups that push_ctxt() stashed in @saved.  The LASSERTFs
 * verify we are popping the same context (@new_ctx) that was pushed.
 * NOTE(review): interior lines are missing from this extract — notably
 * the dput(saved->pwd) that should pair with dget() in push_ctxt (only
 * mntput survives below, line 211), the CDEBUG heads, and the @uc NULL
 * handling.  "¤t" is mojibake for "&current". */
183 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
184 struct lvfs_ucred *uc)
187 ASSERT_CTXT_MAGIC(saved->magic);
189 ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
193 " = pop %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
194 new_ctx, current, current->fs, current->fs->pwd,
195 atomic_read(¤t->fs->pwd->d_count),
196 atomic_read(¤t->fs->pwd->d_inode->i_count),
197 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
199 atomic_read(¤t->fs->pwdmnt->mnt_count));
/* Sanity: the context being popped must still be the active one. */
202 LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
203 current->fs->pwd, new_ctx->pwd);
204 LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
205 current->fs->pwdmnt, new_ctx->pwdmnt);
208 ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
/* Drop the references taken by push_ctxt(). */
211 mntput(saved->pwdmnt);
212 current->fs->umask = saved->luc.luc_umask;
/* Restore the original task credentials. */
214 current->uid = saved->luc.luc_uid;
215 current->gid = saved->luc.luc_gid;
216 current->fsuid = saved->luc.luc_fsuid;
217 current->fsgid = saved->luc.luc_fsgid;
218 current->cap_effective = saved->luc.luc_cap;
219 pop_group_info(saved,
221 uc->luc_identity ? uc->luc_identity->mi_ginfo :
227 "= pop %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
228 saved, current, current->fs, current->fs->pwd,
229 atomic_read(¤t->fs->pwd->d_count),
230 atomic_read(¤t->fs->pwd->d_inode->i_count),
231 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
233 atomic_read(¤t->fs->pwdmnt->mnt_count));
236 EXPORT_SYMBOL(pop_ctxt);
238 /* utility to make a file */
/* Look up @name under directory dentry @dir and create it as a regular
 * file with @mode if absent.  If it exists and is a regular file, return
 * it, optionally (@fix) rewriting its permission bits to match @mode.
 * Returns the dentry on success, ERR_PTR on failure.
 * NOTE(review): interior lines are missing from this extract (locking of
 * dir->d_inode, the IS_ERR check before line 250, error labels out_up/
 * out_err, the trailing ll_vfs_create arguments, and RETURN paths). */
239 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
241 struct dentry *dchild;
245 // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
246 CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
248 dchild = ll_lookup_one_len(name, dir, strlen(name));
250 GOTO(out_up, dchild);
252 if (dchild->d_inode) {
/* Name already exists: accept only a regular file. */
253 int old_mode = dchild->d_inode->i_mode;
254 if (!S_ISREG(old_mode))
255 GOTO(out_err, err = -EEXIST);
257 /* Fixup file permissions if necessary */
258 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
259 CWARN("fixing permissions on %s from %o to %o\n",
260 name, old_mode, mode);
/* Keep the file-type bits, replace only the ugo/rwx permission bits. */
261 dchild->d_inode->i_mode = (mode & S_IALLUGO) |
262 (old_mode & ~S_IALLUGO);
263 mark_inode_dirty(dchild->d_inode);
265 GOTO(out_up, dchild);
/* Not found: create a fresh regular file. */
268 err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
277 dchild = ERR_PTR(err);
281 EXPORT_SYMBOL(simple_mknod);
283 /* utility to make a directory */
/* Directory analogue of simple_mknod(): look up @name under @dir and
 * vfs_mkdir() it if absent; if it exists it must be a directory, whose
 * permission bits may be corrected when @fix is set.  Returns the dentry
 * or ERR_PTR.  NOTE(review): interior lines are missing from this
 * extract (locking, the IS_ERR check before line 294, out_up/out_err
 * labels, RETURN paths). */
284 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
286 struct dentry *dchild;
290 // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
291 CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
292 dchild = ll_lookup_one_len(name, dir, strlen(name));
294 GOTO(out_up, dchild);
296 if (dchild->d_inode) {
/* Name already exists: accept only a directory. */
297 int old_mode = dchild->d_inode->i_mode;
298 if (!S_ISDIR(old_mode)) {
299 CERROR("found %s (%lu/%u) is mode %o\n", name,
300 dchild->d_inode->i_ino,
301 dchild->d_inode->i_generation, old_mode);
302 GOTO(out_err, err = -ENOTDIR);
305 /* Fixup directory permissions if necessary */
306 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
308 "fixing permissions on %s from %o to %o\n",
309 name, old_mode, mode);
/* Preserve the type bits, replace only the permission bits. */
310 dchild->d_inode->i_mode = (mode & S_IALLUGO) |
311 (old_mode & ~S_IALLUGO);
312 mark_inode_dirty(dchild->d_inode);
314 GOTO(out_up, dchild);
/* Not found: create the directory. */
317 err = vfs_mkdir(dir->d_inode, dchild, mode);
325 dchild = ERR_PTR(err);
329 EXPORT_SYMBOL(simple_mkdir);
331 /* utility to rename a file */
/* Rename @oldname to @newname within the single directory @dir using
 * vfs_rename().  Returns 0 or a negative errno.  NOTE(review): interior
 * lines are missing from this extract (directory-inode locking around
 * the lookups/rename, the put_old/put_new dput cleanup labels, RETURN). */
332 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
334 struct dentry *dchild_old, *dchild_new;
338 ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
339 CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
340 (int)strlen(oldname), oldname, (int)strlen(newname), newname);
342 dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
343 if (IS_ERR(dchild_old))
344 RETURN(PTR_ERR(dchild_old));
/* Source must actually exist before we bother looking up the target. */
346 if (!dchild_old->d_inode)
347 GOTO(put_old, err = -ENOENT);
349 dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
350 if (IS_ERR(dchild_new))
351 GOTO(put_old, err = PTR_ERR(dchild_new));
/* Same-directory rename: @dir->d_inode is both old and new parent. */
353 err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
360 EXPORT_SYMBOL(lustre_rename);
363 * Read a file from within kernel context. Prior to calling this
364 * function we should already have done a push_ctxt().
/* Thin wrapper over file->f_op->read(); validates the file and its ops
 * first.  NOTE(review): the failure-return line between 369 and 372 is
 * missing from this extract. */
366 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
368 ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
369 if (!file || !file->f_op || !file->f_op->read || !off)
372 return file->f_op->read(file, buf, len, off);
374 EXPORT_SYMBOL(lustre_fread);
377 * Write a file from within kernel context. Prior to calling this
378 * function we should already have done a push_ctxt().
/* Thin wrapper over file->f_op->write().  NOTE(review): the preceding
 * NULL checks on file/f_op/off (lines 384-390 of the original) are
 * missing from this extract; only the ->write check survives below. */
380 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
383 ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
391 if (!file->f_op->write)
394 RETURN(file->f_op->write(file, buf, len, off));
396 EXPORT_SYMBOL(lustre_fwrite);
399 * Sync a file from within kernel context. Prior to calling this
400 * function we should already have done a push_ctxt().
/* Thin wrapper over file->f_op->fsync() (old 3-argument signature:
 * file, dentry, datasync=0).  NOTE(review): the failure-return line
 * between 406 and 409 is missing from this extract. */
402 int lustre_fsync(struct file *file)
405 ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
406 if (!file || !file->f_op || !file->f_op->fsync)
409 RETURN(file->f_op->fsync(file, file->f_dentry, 0));
411 EXPORT_SYMBOL(lustre_fsync);
/* Open dentry @de on the mount recorded in @ctxt.  dentry_open()
 * consumes a mount reference, so we take an extra one with mntget()
 * first.  NOTE(review): the flags parameter line and braces are missing
 * from this extract. */
413 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
416 mntget(ctxt->pwdmnt);
417 return dentry_open(de, ctxt->pwdmnt, flags);
419 EXPORT_SYMBOL(l_dentry_open);
421 #ifdef HAVE_VFS_READDIR_U64_INO
/* vfs_readdir() callback: records each directory entry into a freshly
 * allocated l_linux_dirent appended to the caller's list.  Two prototype
 * variants exist because newer kernels pass the inode number as u64.
 * NOTE(review): interior lines are missing from this extract (#else
 * between the two prototypes, braces, the offset bookkeeping around
 * line 434, OBD_ALLOC failure handling, name NUL-termination, return). */
422 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
423 u64 ino, unsigned int d_type)
425 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
426 ino_t ino, unsigned int d_type)
429 struct l_linux_dirent *dirent;
430 struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
/* The previous entry (if any) gets stamped with this entry's offset. */
432 dirent = buf->lrc_dirent;
434 dirent->lld_off = offset;
436 OBD_ALLOC(dirent, sizeof(*dirent));
441 list_add_tail(&dirent->lld_list, buf->lrc_list);
443 buf->lrc_dirent = dirent;
444 dirent->lld_ino = ino;
445 LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
446 memcpy(dirent->lld_name, name, namlen);
/* Read all entries of directory @file into @dentry_list via the
 * l_filldir callback; the last entry's offset is fixed up to the final
 * file position.  NOTE(review): interior lines are missing from this
 * extract (the error declaration, the error-return after vfs_readdir,
 * the NULL check on lastdirent, the final RETURN). */
451 long l_readdir(struct file *file, struct list_head *dentry_list)
453 struct l_linux_dirent *lastdirent;
454 struct l_readdir_callback buf;
457 buf.lrc_dirent = NULL;
458 buf.lrc_list = dentry_list;
460 error = vfs_readdir(file, l_filldir, &buf);
464 lastdirent = buf.lrc_dirent;
466 lastdirent->lld_off = file->f_pos;
470 EXPORT_SYMBOL(l_readdir);
/* Global allocation accounting counters defined elsewhere in lvfs. */
471 EXPORT_SYMBOL(obd_memory);
472 EXPORT_SYMBOL(obd_memmax);
474 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
/* Memory-debugging hash table: tracks every OBD allocation so leaks can
 * be reported at module unload.  Guarded by obd_memlist_lock. */
475 static spinlock_t obd_memlist_lock = SPIN_LOCK_UNLOCKED;
476 static struct hlist_head *obd_memtable = NULL;
477 static unsigned long obd_memtable_size = 0;
/* Allocate and zero the @size-byte tracking table.  NOTE(review): the
 * kmalloc failure return and the loop around lines 495-498 are missing
 * from this extract; line 484's double sizeof looks like a typo in the
 * original (sizeof(sizeof(...)) == sizeof(size_t)) — verify upstream. */
479 static int lvfs_memdbg_init(int size)
481 struct hlist_head *head;
484 LASSERT(size > sizeof(sizeof(struct hlist_head)));
485 obd_memtable_size = size / sizeof(struct hlist_head);
487 CWARN("Allocating %lu memdbg entries.\n",
488 (unsigned long)obd_memtable_size);
490 LASSERT(obd_memtable == NULL);
491 obd_memtable = kmalloc(size, GFP_KERNEL);
495 i = obd_memtable_size;
498 INIT_HLIST_HEAD(head);
/* Walk every hash bucket and unlink all remaining obd_mem_track entries
 * under the list lock.  NOTE(review): the lines freeing each entry and
 * the table itself, plus the return, are missing from this extract. */
506 static int lvfs_memdbg_cleanup(void)
508 struct hlist_node *node = NULL, *tmp = NULL;
509 struct hlist_head *head;
510 struct obd_mem_track *mt;
513 spin_lock(&obd_memlist_lock);
514 for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
/* _safe variant: entries are deleted while iterating. */
515 hlist_for_each_safe(node, tmp, head) {
516 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
517 hlist_del_init(&mt->mt_hash);
521 spin_unlock(&obd_memlist_lock);
/* Map a pointer to a bucket index.  Assumes obd_memtable_size is a power
 * of two so the mask selects a valid slot — TODO confirm callers
 * guarantee that (lvfs_memdbg_init divides, it does not round). */
526 static inline unsigned long const hashfn(void *ptr)
528 return (unsigned long)ptr &
529 (obd_memtable_size - 1);
/* Lock-free insert helper: add @mt to its hash bucket.  Caller must hold
 * obd_memlist_lock.  NOTE(review): the hashfn(...) argument line between
 * 534 and 536 is missing from this extract. */
532 static void __lvfs_memdbg_insert(struct obd_mem_track *mt)
534 struct hlist_head *head = obd_memtable +
536 hlist_add_head(&mt->mt_hash, head);
/* Public insert: takes the list lock around __lvfs_memdbg_insert(). */
539 void lvfs_memdbg_insert(struct obd_mem_track *mt)
541 spin_lock(&obd_memlist_lock);
542 __lvfs_memdbg_insert(mt);
543 spin_unlock(&obd_memlist_lock);
545 EXPORT_SYMBOL(lvfs_memdbg_insert);
/* Lock-free remove helper: unlink @mt from its bucket.  Caller must hold
 * obd_memlist_lock. */
547 static void __lvfs_memdbg_remove(struct obd_mem_track *mt)
549 hlist_del_init(&mt->mt_hash);
/* Public remove: takes the list lock around __lvfs_memdbg_remove(). */
552 void lvfs_memdbg_remove(struct obd_mem_track *mt)
554 spin_lock(&obd_memlist_lock);
555 __lvfs_memdbg_remove(mt);
556 spin_unlock(&obd_memlist_lock);
558 EXPORT_SYMBOL(lvfs_memdbg_remove);
/* Lock-free lookup helper: scan @ptr's bucket for a tracking entry whose
 * mt_ptr matches.  Caller must hold obd_memlist_lock.  NOTE(review): the
 * break/return tail of the loop (lines 571-575) is missing from this
 * extract, so the not-found behavior cannot be confirmed here. */
560 static struct obd_mem_track *__lvfs_memdbg_find(void *ptr)
562 struct hlist_node *node = NULL;
563 struct obd_mem_track *mt = NULL;
564 struct hlist_head *head;
566 head = obd_memtable + hashfn(ptr);
568 hlist_for_each(node, head) {
569 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
570 if ((unsigned long)mt->mt_ptr == (unsigned long)ptr)
/* Public lookup: takes the list lock around __lvfs_memdbg_find(). */
577 struct obd_mem_track *lvfs_memdbg_find(void *ptr)
579 struct obd_mem_track *mt;
581 spin_lock(&obd_memlist_lock);
582 mt = __lvfs_memdbg_find(ptr);
583 spin_unlock(&obd_memlist_lock);
587 EXPORT_SYMBOL(lvfs_memdbg_find);
/* Insert @mt only if no entry for the same pointer already exists —
 * the find and insert happen atomically under one lock hold.
 * NOTE(review): the success/failure return lines (between 596-597 and
 * after 600) are missing from this extract. */
589 int lvfs_memdbg_check_insert(struct obd_mem_track *mt)
591 struct obd_mem_track *tmp;
593 spin_lock(&obd_memlist_lock);
594 tmp = __lvfs_memdbg_find(mt->mt_ptr);
596 __lvfs_memdbg_insert(mt);
597 spin_unlock(&obd_memlist_lock);
/* Duplicate-pointer path: unlock without inserting. */
600 spin_unlock(&obd_memlist_lock);
603 EXPORT_SYMBOL(lvfs_memdbg_check_insert);
/* Atomically look up and unlink the tracking entry for @ptr; returns the
 * entry so the caller can inspect/free it.  NOTE(review): the return
 * statements and the if around line 613 are missing from this extract. */
605 struct obd_mem_track *
606 lvfs_memdbg_check_remove(void *ptr)
608 struct obd_mem_track *mt;
610 spin_lock(&obd_memlist_lock);
611 mt = __lvfs_memdbg_find(ptr);
613 __lvfs_memdbg_remove(mt);
614 spin_unlock(&obd_memlist_lock);
/* Not-found path: unlock with nothing removed. */
617 spin_unlock(&obd_memlist_lock);
620 EXPORT_SYMBOL(lvfs_memdbg_check_remove);
/* Report memory-leak status at shutdown: warn if obd_memory is nonzero
 * and, when CONFIG_DEBUG_MEMORY is on, dump every still-tracked
 * allocation (pointer, size, allocation site) from the hash table.
 * NOTE(review): interior lines are missing from this extract (the
 * `leaked` declaration/branching, the "max" argument of the CWARN on
 * line 640, loop-entry bookkeeping) — the visible flow is approximate. */
623 void lvfs_memdbg_show(void)
625 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
626 struct hlist_node *node = NULL;
627 struct hlist_head *head;
628 struct obd_mem_track *mt;
633 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
637 leaked = atomic_read(&obd_memory);
640 CWARN("Memory leaks detected (max %d, leaked %d)\n",
644 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
645 spin_lock(&obd_memlist_lock);
646 for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
647 hlist_for_each(node, head) {
/* Header printed once, before the first abnormal entry. */
649 CWARN("Abnormal memory activities:\n");
652 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
653 CWARN(" [%s] ptr: 0x%p, size: %d, src at %s\n",
654 ((mt->mt_flags & OBD_MT_WRONG_SIZE) ?
655 "wrong size" : "leaked memory"),
656 mt->mt_ptr, mt->mt_size, mt->mt_loc);
659 spin_unlock(&obd_memlist_lock);
662 EXPORT_SYMBOL(lvfs_memdbg_show);
664 #ifdef LUSTRE_KERNEL_VERSION
665 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
666 #error rdonly patchset must be updated [cfs bz11248]
/* dev_set_rdonly/dev_check_rdonly come from the Lustre "rdonly" kernel
 * patch (see the #error guard above). */
668 void dev_set_rdonly(lvfs_sbdev_type dev);
669 int dev_check_rdonly(lvfs_sbdev_type dev);
/* Force block device @dev (and its external journal device @jdev, if
 * distinct) read-only after syncing, so no further writes reach disk —
 * used by Lustre failure/recovery testing.  NOTE(review): braces and the
 * (long)jdev argument line are missing from this extract. */
671 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
673 lvfs_sbdev_sync(dev);
674 if (jdev && (jdev != dev)) {
675 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
/* Journal first, then the data device. */
677 dev_set_rdonly(jdev);
679 CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
/* Query whether @dev was previously marked read-only by the patchset. */
683 int lvfs_check_rdonly(lvfs_sbdev_type dev)
685 return dev_check_rdonly(dev);
688 EXPORT_SYMBOL(__lvfs_set_rdonly);
689 EXPORT_SYMBOL(lvfs_check_rdonly);
/* Health probe: synchronously write one page to @file via
 * fsfilt_write_record() and report the result — a failure indicates the
 * underlying storage is unhealthy.  NOTE(review): interior lines are
 * missing from this extract (rc/offset declarations, the OBD_ALLOC
 * failure check, the RETURN). */
691 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
693 char *write_page = NULL;
698 OBD_ALLOC(write_page, CFS_PAGE_SIZE);
/* Final argument 1 requests a synchronous (forced) write. */
702 rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
704 OBD_FREE(write_page, CFS_PAGE_SIZE);
706 CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
709 EXPORT_SYMBOL(lvfs_check_io_health);
710 #endif /* LUSTRE_KERNEL_VERSION */
/* Module entry point: set up the memory-debug table (debug builds only).
 * NOTE(review): the return statement is missing from this extract. */
712 static int __init lvfs_linux_init(void)
715 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
716 lvfs_memdbg_init(PAGE_SIZE);
/* Module exit: tear down the memory-debug table (debug builds only).
 * NOTE(review): the leak-report call(s) preceding cleanup are missing
 * from this extract. */
721 static void __exit lvfs_linux_exit(void)
727 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
728 lvfs_memdbg_cleanup();
733 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
734 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
735 MODULE_LICENSE("GPL");
737 module_init(lvfs_linux_init);
738 module_exit(lvfs_linux_exit);