1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/lib/lvfs_linux.c
5 * Lustre filesystem abstraction routines
7 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8 * Author: Andreas Dilger <adilger@clusterfs.com>
10 * This file is part of Lustre, http://www.lustre.org.
12 * Lustre is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Lustre is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Lustre; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 # define EXPORT_SYMTAB
30 #define DEBUG_SUBSYSTEM S_FILTER
32 #include <linux/version.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <lustre_fsfilt.h>
42 #include <obd_class.h>
43 #include <linux/module.h>
44 #include <linux/init.h>
45 #include <linux/lustre_compat25.h>
47 #include "lvfs_internal.h"
50 #include <lustre_lib.h>
51 #include <lustre_quota.h>
/*
 * Memory-usage bookkeeping shared across the module: obd_max_pages and
 * obd_max_alloc hold high-water marks that obd_update_maxusage() raises
 * while holding obd_updatemax_lock; obd_memory is the stats handle that
 * is exported to other modules.
 */
53 __u64 obd_max_pages = 0;
54 __u64 obd_max_alloc = 0;
55 struct lprocfs_stats *obd_memory = NULL;
56 spinlock_t obd_updatemax_lock = SPIN_LOCK_UNLOCKED;
57 /* refine later and change to seqlock or similar from libcfs */
59 /* Debugging check only needed during development */
/*
 * Each context assertion is defined twice below: once expanding to an
 * LASSERT/LASSERTF check and once expanding to an empty statement.
 * NOTE(review): the preprocessor conditional selecting between the two
 * sets, and the continuation line of the checking ASSERT_NOT_KERNEL_CTXT,
 * are not visible in this listing -- confirm against the full source.
 */
61 # define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
62 # define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
64 # define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
66 # define ASSERT_CTXT_MAGIC(magic) do {} while(0)
67 # define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
68 # define ASSERT_KERNEL_CTXT(msg) do {} while(0)
/*
 * Remember the calling task's supplementary-group state in @save and make
 * @ginfo the task's group_info.
 *
 * NOTE(review): both a `current_ngroups` form and a `current->group_info`
 * form appear; presumably they are alternatives under a kernel-version
 * conditional that this listing omits -- confirm against the full source.
 */
71 static void push_group_info(struct lvfs_run_ctxt *save,
72 struct group_info *ginfo)
75 save->ngroups = current_ngroups;
79 save->group_info = current->group_info;
80 current->group_info = ginfo;
/*
 * Undo push_group_info(): put the group state recorded in @save back on
 * the calling task.  @ginfo is not referenced by the statements visible
 * here.
 *
 * NOTE(review): as in push_group_info(), the two restore statements look
 * like alternatives under a conditional that is not shown -- confirm.
 */
85 static void pop_group_info(struct lvfs_run_ctxt *save,
86 struct group_info *ginfo)
89 current_ngroups = save->ngroups;
92 current->group_info = save->group_info;
97 /* push / pop to root of obd store */
/*
 * push_ctxt - switch the calling task into the filesystem context @new_ctx.
 *
 * @save:    receives the task's current pwd and pwdmnt (references taken
 *           with dget()/mntget()), umask, group count and credentials so
 *           that pop_ctxt() can put them back.
 * @new_ctx: context to enter; its pwd and pwdmnt are asserted non-NULL and
 *           installed with ll_set_fs_pwd().
 * @uc:      uid/gid/fsuid/fsgid/capability values (and an optional identity
 *           whose mi_ginfo is pushed as the group list) copied onto the
 *           task.
 *
 * NOTE(review): this listing omits several original lines (braces, the
 * openers of the debug-print calls and, presumably, a guard around the
 * credential swap for a NULL @uc).  The sequence `¤t` looks like a
 * mis-encoded `&current` -- restore it from the pristine source.
 */
98 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
99 struct lvfs_ucred *uc)
101 //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
102 ASSERT_CTXT_MAGIC(new_ctx->magic);
103 OBD_SET_CTXT_MAGIC(save);
107 "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
108 save, current, current->fs, current->fs->pwd,
109 atomic_read(¤t->fs->pwd->d_count),
110 atomic_read(¤t->fs->pwd->d_inode->i_count),
111 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
113 atomic_read(¤t->fs->pwdmnt->mnt_count));
/* Both the current and the target pwd must still be referenced. */
117 LASSERT(atomic_read(¤t->fs->pwd->d_count));
118 LASSERT(atomic_read(&new_ctx->pwd->d_count));
/* Pin the old pwd/mount so they stay valid until pop_ctxt(). */
119 save->pwd = dget(current->fs->pwd);
120 save->pwdmnt = mntget(current->fs->pwdmnt);
121 save->luc.luc_umask = current->fs->umask;
122 save->ngroups = current->group_info->ngroups;
125 LASSERT(save->pwdmnt);
126 LASSERT(new_ctx->pwd);
127 LASSERT(new_ctx->pwdmnt);
/* Stash the task's credentials, then overwrite them with @uc's. */
130 save->luc.luc_uid = current->uid;
131 save->luc.luc_gid = current->gid;
132 save->luc.luc_fsuid = current->fsuid;
133 save->luc.luc_fsgid = current->fsgid;
134 save->luc.luc_cap = current->cap_effective;
136 current->uid = uc->luc_uid;
137 current->gid = uc->luc_gid;
138 current->fsuid = uc->luc_fsuid;
139 current->fsgid = uc->luc_fsgid;
140 current->cap_effective = uc->luc_cap;
142 push_group_info(save,
144 uc->luc_identity ? uc->luc_identity->mi_ginfo :
147 current->fs->umask = 0; /* umask already applied on client */
149 ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
153 "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
154 new_ctx, current, current->fs, current->fs->pwd,
155 atomic_read(¤t->fs->pwd->d_count),
156 atomic_read(¤t->fs->pwd->d_inode->i_count),
157 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
159 atomic_read(¤t->fs->pwdmnt->mnt_count));
162 EXPORT_SYMBOL(push_ctxt);
/*
 * pop_ctxt - leave the context entered by push_ctxt().
 *
 * @saved:   state recorded by push_ctxt(); its pwd/pwdmnt are reinstalled
 *           with ll_set_fs_pwd(), the mount reference is dropped with
 *           mntput(), and umask, credentials and group state are restored.
 * @new_ctx: the context being left; the task's pwd and pwdmnt are asserted
 *           to still equal this context's.
 * @uc:      the credentials that were pushed; its identity group list is
 *           handed to pop_group_info().
 *
 * NOTE(review): lines are missing from this listing (braces, debug-print
 * openers, and whatever releases the reference push_ctxt() took on
 * saved->pwd).  `¤t` looks like a mis-encoded `&current` -- restore it
 * from the pristine source.
 */
164 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
165 struct lvfs_ucred *uc)
168 ASSERT_CTXT_MAGIC(saved->magic);
170 ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
174 " = pop %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
175 new_ctx, current, current->fs, current->fs->pwd,
176 atomic_read(¤t->fs->pwd->d_count),
177 atomic_read(¤t->fs->pwd->d_inode->i_count),
178 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
180 atomic_read(¤t->fs->pwdmnt->mnt_count));
/* The task must still be sitting in the context it was pushed into. */
183 LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
184 current->fs->pwd, new_ctx->pwd);
185 LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
186 current->fs->pwdmnt, new_ctx->pwdmnt);
189 ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
192 mntput(saved->pwdmnt);
193 current->fs->umask = saved->luc.luc_umask;
/* Put back the credentials that push_ctxt() overwrote. */
195 current->uid = saved->luc.luc_uid;
196 current->gid = saved->luc.luc_gid;
197 current->fsuid = saved->luc.luc_fsuid;
198 current->fsgid = saved->luc.luc_fsgid;
199 current->cap_effective = saved->luc.luc_cap;
200 pop_group_info(saved,
202 uc->luc_identity ? uc->luc_identity->mi_ginfo :
208 "= pop %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
209 saved, current, current->fs, current->fs->pwd,
210 atomic_read(¤t->fs->pwd->d_count),
211 atomic_read(¤t->fs->pwd->d_inode->i_count),
212 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
214 atomic_read(¤t->fs->pwdmnt->mnt_count));
217 EXPORT_SYMBOL(pop_ctxt);
219 /* utility to make a file */
/*
 * simple_mknod - look up @name under @dir and create it as a regular file
 * if it does not exist.
 *
 * @dir:  parent directory dentry.
 * @name: NUL-terminated entry name.
 * @mode: permission bits; the file-type bits are replaced with S_IFREG
 *        when creating.
 * @fix:  when non-zero and the entry already exists as a regular file with
 *        different permission bits, rewrite them to match @mode.
 *
 * Returns the child dentry, or an ERR_PTR() value: -EEXIST is used when the
 * existing entry is not a regular file.
 *
 * NOTE(review): the error checks after the lookup and create calls and the
 * exit labels are on lines missing from this listing -- confirm there.
 */
220 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
222 struct dentry *dchild;
226 // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
227 CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
229 dchild = ll_lookup_one_len(name, dir, strlen(name));
231 GOTO(out_up, dchild);
/* A positive dentry means the name already exists. */
233 if (dchild->d_inode) {
234 int old_mode = dchild->d_inode->i_mode;
235 if (!S_ISREG(old_mode))
236 GOTO(out_err, err = -EEXIST);
238 /* Fixup file permissions if necessary */
239 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
240 CWARN("fixing permissions on %s from %o to %o\n",
241 name, old_mode, mode);
/* Replace only the permission bits; keep the type bits. */
242 dchild->d_inode->i_mode = (mode & S_IALLUGO) |
243 (old_mode & ~S_IALLUGO);
244 mark_inode_dirty(dchild->d_inode);
246 GOTO(out_up, dchild);
249 err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
258 dchild = ERR_PTR(err);
262 EXPORT_SYMBOL(simple_mknod);
264 /* utility to make a directory */
/*
 * simple_mkdir - look up @name under @dir and create it as a directory if
 * it does not exist.
 *
 * @dir:  parent directory dentry.
 * @name: NUL-terminated entry name.
 * @mode: mode passed to vfs_mkdir() when creating.
 * @fix:  when non-zero and the entry already exists as a directory with
 *        different permission bits, rewrite them to match @mode.
 *
 * Returns the child dentry, or an ERR_PTR() value: -ENOTDIR is used when
 * the existing entry is not a directory.
 *
 * NOTE(review): the error checks after the lookup and mkdir calls, the
 * logging call opener near the permission fixup and the exit labels are on
 * lines missing from this listing -- confirm there.
 */
265 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
267 struct dentry *dchild;
271 // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
272 CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
273 dchild = ll_lookup_one_len(name, dir, strlen(name));
275 GOTO(out_up, dchild);
/* A positive dentry means the name already exists. */
277 if (dchild->d_inode) {
278 int old_mode = dchild->d_inode->i_mode;
279 if (!S_ISDIR(old_mode)) {
280 CERROR("found %s (%lu/%u) is mode %o\n", name,
281 dchild->d_inode->i_ino,
282 dchild->d_inode->i_generation, old_mode);
283 GOTO(out_err, err = -ENOTDIR);
286 /* Fixup directory permissions if necessary */
287 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
289 "fixing permissions on %s from %o to %o\n",
290 name, old_mode, mode);
/* Replace only the permission bits; keep the type bits. */
291 dchild->d_inode->i_mode = (mode & S_IALLUGO) |
292 (old_mode & ~S_IALLUGO);
293 mark_inode_dirty(dchild->d_inode);
295 GOTO(out_up, dchild);
298 err = vfs_mkdir(dir->d_inode, dchild, mode);
306 dchild = ERR_PTR(err);
310 EXPORT_SYMBOL(simple_mkdir);
312 /* utility to rename a file */
/*
 * lustre_rename - rename @oldname to @newname, both inside @dir.
 *
 * Both names are looked up under @dir.  A lookup failure is returned as
 * its error code, and -ENOENT is used when @oldname has no inode.  The
 * rename itself is delegated to vfs_rename() with @dir as both source and
 * target directory.
 *
 * NOTE(review): the dentry releases at the exit labels and the final
 * return are on lines missing from this listing -- confirm there.
 */
313 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
315 struct dentry *dchild_old, *dchild_new;
319 ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
320 CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
321 (int)strlen(oldname), oldname, (int)strlen(newname), newname);
323 dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
324 if (IS_ERR(dchild_old))
325 RETURN(PTR_ERR(dchild_old));
/* The source must exist. */
327 if (!dchild_old->d_inode)
328 GOTO(put_old, err = -ENOENT);
330 dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
331 if (IS_ERR(dchild_new))
332 GOTO(put_old, err = PTR_ERR(dchild_new));
334 err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
341 EXPORT_SYMBOL(lustre_rename);
344 * Read a file from within kernel context. Prior to calling this
345 * function we should already have done a push_ctxt().
/*
 * lustre_fread - forward a read to the file's own f_op->read method.
 *
 * @file: open file; it must have f_op and f_op->read.
 * @buf:  destination buffer, passed through unchanged.
 * @len:  number of bytes requested.
 * @off:  file position pointer; must be non-NULL.
 *
 * Returns whatever f_op->read returns.  The value returned when the guard
 * below fails is on a line not visible in this listing.
 */
347 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
349 ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
350 if (!file || !file->f_op || !file->f_op->read || !off)
353 return file->f_op->read(file, buf, len, off);
355 EXPORT_SYMBOL(lustre_fread);
358 * Write a file from within kernel context. Prior to calling this
359 * function we should already have done a push_ctxt().
/*
 * lustre_fwrite - forward a write to the file's own f_op->write method.
 *
 * @file: open file.
 * @buf:  source buffer, passed through unchanged.
 * @len:  number of bytes to write.
 * @off:  file position pointer.
 *
 * Returns whatever f_op->write returns.
 *
 * NOTE(review): several lines between the assertion and the f_op->write
 * check are missing from this listing (presumably argument validation
 * like that in lustre_fread()) -- confirm against the full source.
 */
361 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
364 ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
372 if (!file->f_op->write)
375 RETURN(file->f_op->write(file, buf, len, off));
377 EXPORT_SYMBOL(lustre_fwrite);
380 * Sync a file from within kernel context. Prior to calling this
381 * function we should already have done a push_ctxt().
/*
 * lustre_fsync - forward to the file's own f_op->fsync method.
 *
 * @file: open file; it must have f_op and f_op->fsync.
 *
 * Calls f_op->fsync(file, file->f_dentry, 0) and returns its result.  The
 * value returned when the guard fails is on a line not visible here.
 */
383 int lustre_fsync(struct file *file)
386 ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
387 if (!file || !file->f_op || !file->f_op->fsync)
390 RETURN(file->f_op->fsync(file, file->f_dentry, 0));
392 EXPORT_SYMBOL(lustre_fsync);
/*
 * l_dentry_open - open dentry @de on the mount held by @ctxt.
 *
 * A reference on ctxt->pwdmnt is taken with mntget() before the mount is
 * handed to dentry_open().  The `flags` argument used below is declared on
 * a parameter line that is not visible in this listing.
 */
394 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
397 mntget(ctxt->pwdmnt);
398 return dentry_open(de, ctxt->pwdmnt, flags);
400 EXPORT_SYMBOL(l_dentry_open);
/*
 * l_filldir - directory-entry callback handed to vfs_readdir() by
 * l_readdir().
 *
 * Two signatures are provided; they differ only in the type of @ino
 * (u64 when HAVE_VFS_READDIR_U64_INO is defined, ino_t otherwise).
 *
 * @__buf is a struct l_readdir_callback.  The entry remembered from the
 * previous call gets @offset stored as its lld_off, then a new
 * l_linux_dirent is allocated, appended to the caller's list, remembered
 * as the latest entry and filled with @ino and @name.
 *
 * NOTE(review): the allocation-failure handling, the name terminator and
 * the return statement are on lines missing from this listing.
 */
402 #ifdef HAVE_VFS_READDIR_U64_INO
403 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
404 u64 ino, unsigned int d_type)
406 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
407 ino_t ino, unsigned int d_type)
410 struct l_linux_dirent *dirent;
411 struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
413 dirent = buf->lrc_dirent;
415 dirent->lld_off = offset;
417 OBD_ALLOC(dirent, sizeof(*dirent));
422 list_add_tail(&dirent->lld_list, buf->lrc_list);
424 buf->lrc_dirent = dirent;
425 dirent->lld_ino = ino;
/* The name plus one extra byte must fit in the fixed-size lld_name. */
426 LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
427 memcpy(dirent->lld_name, name, namlen);
/*
 * l_readdir - collect the entries of directory @file onto @dentry_list.
 *
 * Runs vfs_readdir() with l_filldir() as the callback, then stores the
 * file's final position (file->f_pos) as the offset of the last entry
 * collected.  Error handling and the return value are on lines not
 * visible in this listing.
 */
432 long l_readdir(struct file *file, struct list_head *dentry_list)
434 struct l_linux_dirent *lastdirent;
435 struct l_readdir_callback buf;
438 buf.lrc_dirent = NULL;
439 buf.lrc_list = dentry_list;
441 error = vfs_readdir(file, l_filldir, &buf);
445 lastdirent = buf.lrc_dirent;
447 lastdirent->lld_off = file->f_pos;
451 EXPORT_SYMBOL(l_readdir);
/*
 * Memory-debugging hash table, built only with CONFIG_DEBUG_MEMORY in the
 * kernel: obd_memtable is an array of obd_memtable_size hlist buckets and
 * obd_memlist_lock guards every access to it.
 */
453 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
454 static spinlock_t obd_memlist_lock = SPIN_LOCK_UNLOCKED;
455 static struct hlist_head *obd_memtable = NULL;
456 static unsigned long obd_memtable_size = 0;
/*
 * lvfs_memdbg_init - allocate and initialise the memory-debug hash table.
 *
 * @size: table size in bytes; the bucket count becomes
 *        size / sizeof(struct hlist_head).
 *
 * The table is allocated with kmalloc(size, GFP_KERNEL) and each bucket
 * head is initialised.  The allocation-failure path, the loop structure
 * and the return value are on lines not visible in this listing.
 *
 * NOTE(review): `sizeof(sizeof(struct hlist_head))` is the size of a
 * size_t, not of a bucket head; `sizeof(struct hlist_head)` was probably
 * intended.  hashfn() masks with (obd_memtable_size - 1), so @size should
 * also yield a power-of-two bucket count -- confirm.
 */
458 static int lvfs_memdbg_init(int size)
460 struct hlist_head *head;
463 LASSERT(size > sizeof(sizeof(struct hlist_head)));
464 obd_memtable_size = size / sizeof(struct hlist_head);
466 CWARN("Allocating %lu memdbg entries.\n",
467 (unsigned long)obd_memtable_size);
469 LASSERT(obd_memtable == NULL);
470 obd_memtable = kmalloc(size, GFP_KERNEL);
474 i = obd_memtable_size;
477 INIT_HLIST_HEAD(head);
/*
 * lvfs_memdbg_cleanup - empty the memory-debug hash table.
 *
 * With obd_memlist_lock held, walks every bucket and unlinks each tracking
 * record using the removal-safe iterator.
 *
 * NOTE(review): what happens to an unlinked record, and to the table
 * itself after the lock is dropped, is on lines missing from this listing.
 */
485 static int lvfs_memdbg_cleanup(void)
487 struct hlist_node *node = NULL, *tmp = NULL;
488 struct hlist_head *head;
489 struct obd_mem_track *mt;
492 spin_lock(&obd_memlist_lock);
493 for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
494 hlist_for_each_safe(node, tmp, head) {
495 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
496 hlist_del_init(&mt->mt_hash);
500 spin_unlock(&obd_memlist_lock);
/*
 * hashfn - map a tracked pointer to a bucket index in obd_memtable.
 *
 * The index is the pointer value masked with (obd_memtable_size - 1).
 *
 * NOTE(review): the mask can reach every bucket only when
 * obd_memtable_size is a power of two -- verify the size passed to
 * lvfs_memdbg_init().  The `const` on the by-value return type has no
 * effect.
 */
505 static inline unsigned long const hashfn(void *ptr)
507 return (unsigned long)ptr &
508 (obd_memtable_size - 1);
/*
 * __lvfs_memdbg_insert - add tracking record @mt at the head of its bucket.
 * The public wrappers in this file call it with obd_memlist_lock held.
 *
 * NOTE(review): the bucket-offset expression that completes the `head`
 * initialiser is on a line missing from this listing; presumably it is
 * hashfn(mt->mt_ptr), matching __lvfs_memdbg_find() -- confirm.
 */
511 static void __lvfs_memdbg_insert(struct obd_mem_track *mt)
513 struct hlist_head *head = obd_memtable +
515 hlist_add_head(&mt->mt_hash, head);
/*
 * lvfs_memdbg_insert - locked wrapper: insert tracking record @mt into the
 * hash table while holding obd_memlist_lock.
 */
518 void lvfs_memdbg_insert(struct obd_mem_track *mt)
520 spin_lock(&obd_memlist_lock);
521 __lvfs_memdbg_insert(mt);
522 spin_unlock(&obd_memlist_lock);
524 EXPORT_SYMBOL(lvfs_memdbg_insert);
/*
 * __lvfs_memdbg_remove - unlink tracking record @mt from its bucket and
 * reinitialise its hash node.  The public wrappers in this file call it
 * with obd_memlist_lock held.
 */
526 static void __lvfs_memdbg_remove(struct obd_mem_track *mt)
528 hlist_del_init(&mt->mt_hash);
/*
 * lvfs_memdbg_remove - locked wrapper: unlink tracking record @mt from the
 * hash table while holding obd_memlist_lock.
 */
531 void lvfs_memdbg_remove(struct obd_mem_track *mt)
533 spin_lock(&obd_memlist_lock);
534 __lvfs_memdbg_remove(mt);
535 spin_unlock(&obd_memlist_lock);
537 EXPORT_SYMBOL(lvfs_memdbg_remove);
/*
 * __lvfs_memdbg_find - search the bucket selected by hashfn(@ptr) for a
 * tracking record whose mt_ptr equals @ptr.  The public wrappers in this
 * file call it with obd_memlist_lock held.
 *
 * NOTE(review): the statements returning the match (or the not-found
 * value) are on lines missing from this listing.
 */
539 static struct obd_mem_track *__lvfs_memdbg_find(void *ptr)
541 struct hlist_node *node = NULL;
542 struct obd_mem_track *mt = NULL;
543 struct hlist_head *head;
545 head = obd_memtable + hashfn(ptr);
547 hlist_for_each(node, head) {
548 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
549 if ((unsigned long)mt->mt_ptr == (unsigned long)ptr)
/*
 * lvfs_memdbg_find - locked wrapper: look up the tracking record for @ptr
 * while holding obd_memlist_lock.  The return statement is on a line not
 * visible in this listing.
 */
556 struct obd_mem_track *lvfs_memdbg_find(void *ptr)
558 struct obd_mem_track *mt;
560 spin_lock(&obd_memlist_lock);
561 mt = __lvfs_memdbg_find(ptr);
562 spin_unlock(&obd_memlist_lock);
566 EXPORT_SYMBOL(lvfs_memdbg_find);
/*
 * lvfs_memdbg_check_insert - look up mt->mt_ptr and insert @mt, with both
 * steps done under a single hold of obd_memlist_lock.
 *
 * There are two exit paths, each dropping the lock; only one of them
 * performs the insert.
 *
 * NOTE(review): the condition on the lookup result and both return values
 * are on lines missing from this listing; presumably the insert happens
 * only when no record for the pointer exists -- confirm.
 */
568 int lvfs_memdbg_check_insert(struct obd_mem_track *mt)
570 struct obd_mem_track *tmp;
572 spin_lock(&obd_memlist_lock);
573 tmp = __lvfs_memdbg_find(mt->mt_ptr);
575 __lvfs_memdbg_insert(mt);
576 spin_unlock(&obd_memlist_lock);
579 spin_unlock(&obd_memlist_lock);
582 EXPORT_SYMBOL(lvfs_memdbg_check_insert);
/*
 * lvfs_memdbg_check_remove - look up the tracking record for @ptr and
 * unlink it, with both steps done under a single hold of obd_memlist_lock.
 *
 * There are two exit paths, each dropping the lock; only one of them
 * performs the removal.
 *
 * NOTE(review): the condition on the lookup result and both return values
 * are on lines missing from this listing; presumably the removed record is
 * returned when found -- confirm.
 */
584 struct obd_mem_track *
585 lvfs_memdbg_check_remove(void *ptr)
587 struct obd_mem_track *mt;
589 spin_lock(&obd_memlist_lock);
590 mt = __lvfs_memdbg_find(ptr);
592 __lvfs_memdbg_remove(mt);
593 spin_unlock(&obd_memlist_lock);
596 spin_unlock(&obd_memlist_lock);
599 EXPORT_SYMBOL(lvfs_memdbg_check_remove);
/*
 * lvfs_memdbg_show - log every record still present in the memory-debug
 * hash table.
 *
 * The table walk is compiled only with CONFIG_DEBUG_MEMORY in the kernel.
 * With obd_memlist_lock held, each remaining record is printed with its
 * pointer, size and source location, labelled "wrong size" when
 * OBD_MT_WRONG_SIZE is set in its flags and "leaked memory" otherwise.
 *
 * NOTE(review): the statements between the conditional sections, and the
 * logic that decides when the "Abnormal memory activities" banner is
 * printed, are on lines missing from this listing.
 */
602 void lvfs_memdbg_show(void)
604 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
605 struct hlist_node *node = NULL;
606 struct hlist_head *head;
607 struct obd_mem_track *mt;
611 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
616 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
617 spin_lock(&obd_memlist_lock);
618 for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
619 hlist_for_each(node, head) {
621 CWARN("Abnormal memory activities:\n");
624 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
625 CWARN(" [%s] ptr: 0x%p, size: %d, src at %s\n",
626 ((mt->mt_flags & OBD_MT_WRONG_SIZE) ?
627 "wrong size" : "leaked memory"),
628 mt->mt_ptr, mt->mt_size, mt->mt_loc);
631 spin_unlock(&obd_memlist_lock);
634 EXPORT_SYMBOL(lvfs_memdbg_show);
/*
 * Read-only device support, built only when LUSTRE_KERNEL_VERSION is
 * defined.  The build is stopped with #error unless
 * HAVE_CLEAR_RDONLY_ON_PUT is defined.  dev_set_rdonly() and
 * dev_check_rdonly() are declared here and defined outside this file.
 */
636 #ifdef LUSTRE_KERNEL_VERSION
637 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
638 #error rdonly patchset must be updated [cfs bz11248]
640 void dev_set_rdonly(lvfs_sbdev_type dev);
641 int dev_check_rdonly(lvfs_sbdev_type dev);
/*
 * __lvfs_set_rdonly - sync device @dev and mark the journal device
 * read-only.
 *
 * @dev:  device to sync.
 * @jdev: journal device; it is marked read-only only when it is set and
 *        differs from @dev.
 *
 * NOTE(review): the call that follows the "set dev ... rdonly" message is
 * on a line missing from this listing (presumably dev_set_rdonly(dev)) --
 * confirm against the full source.
 */
643 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
645 lvfs_sbdev_sync(dev);
646 if (jdev && (jdev != dev)) {
647 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
649 dev_set_rdonly(jdev);
651 CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
/*
 * lvfs_check_rdonly - thin wrapper returning dev_check_rdonly(@dev).
 */
655 int lvfs_check_rdonly(lvfs_sbdev_type dev)
657 return dev_check_rdonly(dev);
660 EXPORT_SYMBOL(__lvfs_set_rdonly);
661 EXPORT_SYMBOL(lvfs_check_rdonly);
/*
 * lvfs_check_io_health - probe the backing store by writing one page.
 *
 * @obd:  device whose fsfilt layer performs the write.
 * @file: file to write into.
 *
 * Allocates a CFS_PAGE_SIZE buffer, writes it with fsfilt_write_record()
 * (last argument 1), frees the buffer and logs the result code.  The
 * allocation-failure path, the `offset` declaration and the return
 * statement are on lines not visible in this listing.
 */
663 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
665 char *write_page = NULL;
670 OBD_ALLOC(write_page, CFS_PAGE_SIZE);
674 rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
676 OBD_FREE(write_page, CFS_PAGE_SIZE);
678 CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
681 EXPORT_SYMBOL(lvfs_check_io_health);
682 #endif /* LUSTRE_KERNEL_VERSION */
/*
 * obd_update_maxusage - raise the recorded high-water marks if current
 * usage exceeds them.
 *
 * The current totals are sampled from obd_pages_sum() and obd_memory_sum()
 * before the lock is taken; obd_max_pages and obd_max_alloc are then
 * compared and updated under obd_updatemax_lock.
 */
684 void obd_update_maxusage()
688 max1 = obd_pages_sum();
689 max2 = obd_memory_sum();
691 spin_lock(&obd_updatemax_lock);
692 if (max1 > obd_max_pages)
693 obd_max_pages = max1;
694 if (max2 > obd_max_alloc)
695 obd_max_alloc = max2;
696 spin_unlock(&obd_updatemax_lock);
/*
 * obd_memory_max - accessor that takes obd_updatemax_lock around its read.
 *
 * NOTE(review): the statement executed under the lock and the return are
 * on lines missing from this listing; presumably obd_max_alloc is read and
 * returned -- confirm.
 */
700 __u64 obd_memory_max(void)
704 spin_lock(&obd_updatemax_lock);
706 spin_unlock(&obd_updatemax_lock);
/*
 * obd_pages_max - accessor that takes obd_updatemax_lock around its read.
 *
 * NOTE(review): the statement executed under the lock and the return are
 * on lines missing from this listing; presumably obd_max_pages is read and
 * returned -- confirm.
 */
711 __u64 obd_pages_max(void)
715 spin_lock(&obd_updatemax_lock);
717 spin_unlock(&obd_updatemax_lock);
/* Export the usage helpers and the obd_memory stats handle. */
722 EXPORT_SYMBOL(obd_update_maxusage);
723 EXPORT_SYMBOL(obd_pages_max);
724 EXPORT_SYMBOL(obd_memory_max);
725 EXPORT_SYMBOL(obd_memory);
/*
 * lprocfs_read_helper - read one field of counter @lc.
 *
 * @lc:    counter to read.
 * @field: which field to fetch (config, sum, min, max, avg, sum of squares
 *         or count).
 *
 * The entry counter lc_cntl.la_entry is sampled before the read; the read
 * is repeated while that sample differs from both the current la_entry and
 * the current la_exit.
 *
 * NOTE(review): the AVG case computes (lc_max - lc_min) / 2, which is half
 * of the min-to-max spread rather than a mean or midpoint -- confirm the
 * intent.
 * NOTE(review): the retry condition joins the two comparisons with `&&`;
 * a reader that must retry whenever either counter moved would use `||`
 * -- confirm against the writer side.
 * NOTE(review): the assignments for the CONFIG, SUM, MIN, MAX and COUNT
 * cases, the loop opener and the return are on lines missing from this
 * listing.
 */
728 __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
729 enum lprocfs_fields_flags field)
737 centry = atomic_read(&lc->lc_cntl.la_entry);
740 case LPROCFS_FIELDS_FLAGS_CONFIG:
743 case LPROCFS_FIELDS_FLAGS_SUM:
746 case LPROCFS_FIELDS_FLAGS_MIN:
749 case LPROCFS_FIELDS_FLAGS_MAX:
752 case LPROCFS_FIELDS_FLAGS_AVG:
753 ret = (lc->lc_max - lc->lc_min)/2;
755 case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
756 ret = lc->lc_sumsquare;
758 case LPROCFS_FIELDS_FLAGS_COUNT:
764 } while (centry != atomic_read(&lc->lc_cntl.la_entry) &&
765 centry != atomic_read(&lc->lc_cntl.la_exit));
769 EXPORT_SYMBOL(lprocfs_read_helper);
/*
 * lvfs_linux_init - module entry point (registered with module_init()).
 *
 * With CONFIG_DEBUG_MEMORY in the kernel it sets up the memory-debug hash
 * table, sized at one page.
 *
 * NOTE(review): the result of lvfs_memdbg_init() is not used on the
 * visible line; the rest of the function, including its return value, is
 * on lines missing from this listing.
 */
772 static int __init lvfs_linux_init(void)
775 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
776 lvfs_memdbg_init(PAGE_SIZE);
/*
 * lvfs_linux_exit - module exit point (registered with module_exit()).
 *
 * With CONFIG_DEBUG_MEMORY in the kernel it empties the memory-debug hash
 * table via lvfs_memdbg_cleanup().  Other statements of this function are
 * on lines not visible in this listing.
 */
781 static void __exit lvfs_linux_exit(void)
787 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
788 lvfs_memdbg_cleanup();
/* Module metadata and registration of the init/exit entry points. */
793 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
794 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
795 MODULE_LICENSE("GPL");
797 module_init(lvfs_linux_init);
798 module_exit(lvfs_linux_exit);