1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * lustre/lib/lvfs_linux.c
5 * Lustre filesystem abstraction routines
7 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8 * Author: Andreas Dilger <adilger@clusterfs.com>
10 * This file is part of Lustre, http://www.lustre.org.
12 * Lustre is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General Public
14 * License as published by the Free Software Foundation.
16 * Lustre is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with Lustre; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 # define EXPORT_SYMTAB
30 #define DEBUG_SUBSYSTEM S_FILTER
32 #include <linux/version.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <lustre_fsfilt.h>
42 #include <linux/module.h>
43 #include <linux/init.h>
44 #include <linux/lustre_compat25.h>
46 #include "lvfs_internal.h"
49 #include <lustre_lib.h>
50 #include <lustre_quota.h>
52 __u64 obd_max_pages = 0;
53 __u64 obd_max_alloc = 0;
54 struct lprocfs_stats *obd_memory = NULL;
55 spinlock_t obd_updatemax_lock = SPIN_LOCK_UNLOCKED;
56 /* refine later and change to seqlock or simlar from libcfs */
/* Debugging check only needed during development */
#ifdef OBD_CTXT_DEBUG
/* NOTE(review): the conditional skeleton was lost in extraction; the two
 * macro sets clearly form a debug/no-op pair — confirm the guard symbol. */
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
                                              msg)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#else
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#endif
70 static void push_group_info(struct lvfs_run_ctxt *save,
71 struct group_info *ginfo)
74 save->ngroups = current_ngroups;
78 save->group_info = current->group_info;
79 current->group_info = ginfo;
84 static void pop_group_info(struct lvfs_run_ctxt *save,
85 struct group_info *ginfo)
88 current_ngroups = save->ngroups;
91 current->group_info = save->group_info;
96 /* push / pop to root of obd store */
97 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
98 struct lvfs_ucred *uc)
100 //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
101 ASSERT_CTXT_MAGIC(new_ctx->magic);
102 OBD_SET_CTXT_MAGIC(save);
106 "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
107 save, current, current->fs, current->fs->pwd,
108 atomic_read(¤t->fs->pwd->d_count),
109 atomic_read(¤t->fs->pwd->d_inode->i_count),
110 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
112 atomic_read(¤t->fs->pwdmnt->mnt_count));
116 LASSERT(atomic_read(¤t->fs->pwd->d_count));
117 LASSERT(atomic_read(&new_ctx->pwd->d_count));
118 save->pwd = dget(current->fs->pwd);
119 save->pwdmnt = mntget(current->fs->pwdmnt);
120 save->luc.luc_umask = current->fs->umask;
121 save->ngroups = current->group_info->ngroups;
124 LASSERT(save->pwdmnt);
125 LASSERT(new_ctx->pwd);
126 LASSERT(new_ctx->pwdmnt);
129 save->luc.luc_uid = current->uid;
130 save->luc.luc_gid = current->gid;
131 save->luc.luc_fsuid = current->fsuid;
132 save->luc.luc_fsgid = current->fsgid;
133 save->luc.luc_cap = current->cap_effective;
135 current->uid = uc->luc_uid;
136 current->gid = uc->luc_gid;
137 current->fsuid = uc->luc_fsuid;
138 current->fsgid = uc->luc_fsgid;
139 current->cap_effective = uc->luc_cap;
141 push_group_info(save,
143 uc->luc_identity ? uc->luc_identity->mi_ginfo :
146 current->fs->umask = 0; /* umask already applied on client */
148 ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
152 "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
153 new_ctx, current, current->fs, current->fs->pwd,
154 atomic_read(¤t->fs->pwd->d_count),
155 atomic_read(¤t->fs->pwd->d_inode->i_count),
156 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
158 atomic_read(¤t->fs->pwdmnt->mnt_count));
161 EXPORT_SYMBOL(push_ctxt);
163 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
164 struct lvfs_ucred *uc)
167 ASSERT_CTXT_MAGIC(saved->magic);
169 ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
173 " = pop %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
174 new_ctx, current, current->fs, current->fs->pwd,
175 atomic_read(¤t->fs->pwd->d_count),
176 atomic_read(¤t->fs->pwd->d_inode->i_count),
177 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
179 atomic_read(¤t->fs->pwdmnt->mnt_count));
182 LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
183 current->fs->pwd, new_ctx->pwd);
184 LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
185 current->fs->pwdmnt, new_ctx->pwdmnt);
188 ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
191 mntput(saved->pwdmnt);
192 current->fs->umask = saved->luc.luc_umask;
194 current->uid = saved->luc.luc_uid;
195 current->gid = saved->luc.luc_gid;
196 current->fsuid = saved->luc.luc_fsuid;
197 current->fsgid = saved->luc.luc_fsgid;
198 current->cap_effective = saved->luc.luc_cap;
199 pop_group_info(saved,
201 uc->luc_identity ? uc->luc_identity->mi_ginfo :
207 "= pop %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
208 saved, current, current->fs, current->fs->pwd,
209 atomic_read(¤t->fs->pwd->d_count),
210 atomic_read(¤t->fs->pwd->d_inode->i_count),
211 current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
213 atomic_read(¤t->fs->pwdmnt->mnt_count));
216 EXPORT_SYMBOL(pop_ctxt);
218 /* utility to make a file */
219 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
221 struct dentry *dchild;
225 // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
226 CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
228 dchild = ll_lookup_one_len(name, dir, strlen(name));
230 GOTO(out_up, dchild);
232 if (dchild->d_inode) {
233 int old_mode = dchild->d_inode->i_mode;
234 if (!S_ISREG(old_mode))
235 GOTO(out_err, err = -EEXIST);
237 /* Fixup file permissions if necessary */
238 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
239 CWARN("fixing permissions on %s from %o to %o\n",
240 name, old_mode, mode);
241 dchild->d_inode->i_mode = (mode & S_IALLUGO) |
242 (old_mode & ~S_IALLUGO);
243 mark_inode_dirty(dchild->d_inode);
245 GOTO(out_up, dchild);
248 err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
257 dchild = ERR_PTR(err);
261 EXPORT_SYMBOL(simple_mknod);
263 /* utility to make a directory */
264 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
266 struct dentry *dchild;
270 // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
271 CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
272 dchild = ll_lookup_one_len(name, dir, strlen(name));
274 GOTO(out_up, dchild);
276 if (dchild->d_inode) {
277 int old_mode = dchild->d_inode->i_mode;
278 if (!S_ISDIR(old_mode)) {
279 CERROR("found %s (%lu/%u) is mode %o\n", name,
280 dchild->d_inode->i_ino,
281 dchild->d_inode->i_generation, old_mode);
282 GOTO(out_err, err = -ENOTDIR);
285 /* Fixup directory permissions if necessary */
286 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
288 "fixing permissions on %s from %o to %o\n",
289 name, old_mode, mode);
290 dchild->d_inode->i_mode = (mode & S_IALLUGO) |
291 (old_mode & ~S_IALLUGO);
292 mark_inode_dirty(dchild->d_inode);
294 GOTO(out_up, dchild);
297 err = vfs_mkdir(dir->d_inode, dchild, mode);
305 dchild = ERR_PTR(err);
309 EXPORT_SYMBOL(simple_mkdir);
311 /* utility to rename a file */
312 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
314 struct dentry *dchild_old, *dchild_new;
318 ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
319 CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
320 (int)strlen(oldname), oldname, (int)strlen(newname), newname);
322 dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
323 if (IS_ERR(dchild_old))
324 RETURN(PTR_ERR(dchild_old));
326 if (!dchild_old->d_inode)
327 GOTO(put_old, err = -ENOENT);
329 dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
330 if (IS_ERR(dchild_new))
331 GOTO(put_old, err = PTR_ERR(dchild_new));
333 err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
340 EXPORT_SYMBOL(lustre_rename);
343 * Read a file from within kernel context. Prior to calling this
344 * function we should already have done a push_ctxt().
346 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
348 ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
349 if (!file || !file->f_op || !file->f_op->read || !off)
352 return file->f_op->read(file, buf, len, off);
354 EXPORT_SYMBOL(lustre_fread);
357 * Write a file from within kernel context. Prior to calling this
358 * function we should already have done a push_ctxt().
360 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
363 ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
371 if (!file->f_op->write)
374 RETURN(file->f_op->write(file, buf, len, off));
376 EXPORT_SYMBOL(lustre_fwrite);
379 * Sync a file from within kernel context. Prior to calling this
380 * function we should already have done a push_ctxt().
382 int lustre_fsync(struct file *file)
385 ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
386 if (!file || !file->f_op || !file->f_op->fsync)
389 RETURN(file->f_op->fsync(file, file->f_dentry, 0));
391 EXPORT_SYMBOL(lustre_fsync);
393 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
396 mntget(ctxt->pwdmnt);
397 return dentry_open(de, ctxt->pwdmnt, flags);
399 EXPORT_SYMBOL(l_dentry_open);
401 #ifdef HAVE_VFS_READDIR_U64_INO
402 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
403 u64 ino, unsigned int d_type)
405 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
406 ino_t ino, unsigned int d_type)
409 struct l_linux_dirent *dirent;
410 struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
412 dirent = buf->lrc_dirent;
414 dirent->lld_off = offset;
416 OBD_ALLOC(dirent, sizeof(*dirent));
421 list_add_tail(&dirent->lld_list, buf->lrc_list);
423 buf->lrc_dirent = dirent;
424 dirent->lld_ino = ino;
425 LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
426 memcpy(dirent->lld_name, name, namlen);
431 long l_readdir(struct file *file, struct list_head *dentry_list)
433 struct l_linux_dirent *lastdirent;
434 struct l_readdir_callback buf;
437 buf.lrc_dirent = NULL;
438 buf.lrc_list = dentry_list;
440 error = vfs_readdir(file, l_filldir, &buf);
444 lastdirent = buf.lrc_dirent;
446 lastdirent->lld_off = file->f_pos;
450 EXPORT_SYMBOL(l_readdir);
452 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
453 static spinlock_t obd_memlist_lock = SPIN_LOCK_UNLOCKED;
454 static struct hlist_head *obd_memtable = NULL;
455 static unsigned long obd_memtable_size = 0;
457 static int lvfs_memdbg_init(int size)
459 struct hlist_head *head;
462 LASSERT(size > sizeof(sizeof(struct hlist_head)));
463 obd_memtable_size = size / sizeof(struct hlist_head);
465 CWARN("Allocating %lu memdbg entries.\n",
466 (unsigned long)obd_memtable_size);
468 LASSERT(obd_memtable == NULL);
469 obd_memtable = kmalloc(size, GFP_KERNEL);
473 i = obd_memtable_size;
476 INIT_HLIST_HEAD(head);
484 static int lvfs_memdbg_cleanup(void)
486 struct hlist_node *node = NULL, *tmp = NULL;
487 struct hlist_head *head;
488 struct obd_mem_track *mt;
491 spin_lock(&obd_memlist_lock);
492 for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
493 hlist_for_each_safe(node, tmp, head) {
494 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
495 hlist_del_init(&mt->mt_hash);
499 spin_unlock(&obd_memlist_lock);
504 static inline unsigned long const hashfn(void *ptr)
506 return (unsigned long)ptr &
507 (obd_memtable_size - 1);
510 static void __lvfs_memdbg_insert(struct obd_mem_track *mt)
512 struct hlist_head *head = obd_memtable +
514 hlist_add_head(&mt->mt_hash, head);
517 void lvfs_memdbg_insert(struct obd_mem_track *mt)
519 spin_lock(&obd_memlist_lock);
520 __lvfs_memdbg_insert(mt);
521 spin_unlock(&obd_memlist_lock);
523 EXPORT_SYMBOL(lvfs_memdbg_insert);
525 static void __lvfs_memdbg_remove(struct obd_mem_track *mt)
527 hlist_del_init(&mt->mt_hash);
530 void lvfs_memdbg_remove(struct obd_mem_track *mt)
532 spin_lock(&obd_memlist_lock);
533 __lvfs_memdbg_remove(mt);
534 spin_unlock(&obd_memlist_lock);
536 EXPORT_SYMBOL(lvfs_memdbg_remove);
538 static struct obd_mem_track *__lvfs_memdbg_find(void *ptr)
540 struct hlist_node *node = NULL;
541 struct obd_mem_track *mt = NULL;
542 struct hlist_head *head;
544 head = obd_memtable + hashfn(ptr);
546 hlist_for_each(node, head) {
547 mt = hlist_entry(node, struct obd_mem_track, mt_hash);
548 if ((unsigned long)mt->mt_ptr == (unsigned long)ptr)
555 struct obd_mem_track *lvfs_memdbg_find(void *ptr)
557 struct obd_mem_track *mt;
559 spin_lock(&obd_memlist_lock);
560 mt = __lvfs_memdbg_find(ptr);
561 spin_unlock(&obd_memlist_lock);
565 EXPORT_SYMBOL(lvfs_memdbg_find);
567 int lvfs_memdbg_check_insert(struct obd_mem_track *mt)
569 struct obd_mem_track *tmp;
571 spin_lock(&obd_memlist_lock);
572 tmp = __lvfs_memdbg_find(mt->mt_ptr);
574 __lvfs_memdbg_insert(mt);
575 spin_unlock(&obd_memlist_lock);
578 spin_unlock(&obd_memlist_lock);
581 EXPORT_SYMBOL(lvfs_memdbg_check_insert);
583 struct obd_mem_track *
584 lvfs_memdbg_check_remove(void *ptr)
586 struct obd_mem_track *mt;
588 spin_lock(&obd_memlist_lock);
589 mt = __lvfs_memdbg_find(ptr);
591 __lvfs_memdbg_remove(mt);
592 spin_unlock(&obd_memlist_lock);
595 spin_unlock(&obd_memlist_lock);
598 EXPORT_SYMBOL(lvfs_memdbg_check_remove);
/* Dump every still-tracked allocation (leak or wrong-size free) via CWARN.
 * Compiled to an empty function when memory debugging is off; the locals
 * are guarded separately so the no-debug build has no unused variables.
 * NOTE(review): the "header" once-only banner logic was partly lost in
 * extraction and is reconstructed — verify. */
void lvfs_memdbg_show(void)
{
#if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
        struct hlist_node *node = NULL;
        struct hlist_head *head;
        struct obd_mem_track *mt;
        int header = 0;
        int i;
#endif

#if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
        spin_lock(&obd_memlist_lock);
        for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
                hlist_for_each(node, head) {
                        if (!header) {
                                CWARN("Abnormal memory activities:\n");
                                header = 1;
                        }
                        mt = hlist_entry(node, struct obd_mem_track, mt_hash);
                        CWARN("  [%s] ptr: 0x%p, size: %d, src at %s\n",
                              ((mt->mt_flags & OBD_MT_WRONG_SIZE) ?
                               "wrong size" : "leaked memory"),
                              mt->mt_ptr, mt->mt_size, mt->mt_loc);
                }
        }
        spin_unlock(&obd_memlist_lock);
#endif
}
EXPORT_SYMBOL(lvfs_memdbg_show);
635 #ifdef LUSTRE_KERNEL_VERSION
636 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
637 #error rdonly patchset must be updated [cfs bz11248]
639 void dev_set_rdonly(lvfs_sbdev_type dev);
640 int dev_check_rdonly(lvfs_sbdev_type dev);
642 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
644 lvfs_sbdev_sync(dev);
645 if (jdev && (jdev != dev)) {
646 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
648 dev_set_rdonly(jdev);
650 CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
654 int lvfs_check_rdonly(lvfs_sbdev_type dev)
656 return dev_check_rdonly(dev);
659 EXPORT_SYMBOL(__lvfs_set_rdonly);
660 EXPORT_SYMBOL(lvfs_check_rdonly);
662 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
664 char *write_page = NULL;
669 OBD_ALLOC(write_page, CFS_PAGE_SIZE);
673 rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
675 OBD_FREE(write_page, CFS_PAGE_SIZE);
677 CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
680 EXPORT_SYMBOL(lvfs_check_io_health);
681 #endif /* LUSTRE_KERNEL_VERSION */
683 void obd_update_maxusage()
687 max1 = obd_pages_sum();
688 max2 = obd_memory_sum();
690 spin_lock(&obd_updatemax_lock);
691 if (max1 > obd_max_pages)
692 obd_max_pages = max1;
693 if (max2 > obd_max_alloc)
694 obd_max_alloc = max2;
695 spin_unlock(&obd_updatemax_lock);
699 __u64 obd_memory_max(void)
703 spin_lock(&obd_updatemax_lock);
705 spin_unlock(&obd_updatemax_lock);
710 __u64 obd_pages_max(void)
714 spin_lock(&obd_updatemax_lock);
716 spin_unlock(&obd_updatemax_lock);
721 EXPORT_SYMBOL(obd_update_maxusage);
722 EXPORT_SYMBOL(obd_pages_max);
723 EXPORT_SYMBOL(obd_memory_max);
724 EXPORT_SYMBOL(obd_memory);
727 __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
728 enum lprocfs_fields_flags field)
736 centry = atomic_read(&lc->lc_cntl.la_entry);
739 case LPROCFS_FIELDS_FLAGS_CONFIG:
742 case LPROCFS_FIELDS_FLAGS_SUM:
745 case LPROCFS_FIELDS_FLAGS_MIN:
748 case LPROCFS_FIELDS_FLAGS_MAX:
751 case LPROCFS_FIELDS_FLAGS_AVG:
752 ret = (lc->lc_max - lc->lc_min)/2;
754 case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
755 ret = lc->lc_sumsquare;
757 case LPROCFS_FIELDS_FLAGS_COUNT:
763 } while (centry != atomic_read(&lc->lc_cntl.la_entry) &&
764 centry != atomic_read(&lc->lc_cntl.la_exit));
768 EXPORT_SYMBOL(lprocfs_read_helper);
771 static int __init lvfs_linux_init(void)
774 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
775 lvfs_memdbg_init(PAGE_SIZE);
780 static void __exit lvfs_linux_exit(void)
786 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
787 lvfs_memdbg_cleanup();
792 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
793 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
794 MODULE_LICENSE("GPL");
796 module_init(lvfs_linux_init);
797 module_exit(lvfs_linux_exit);