Whamcloud - gitweb
- dump log if memory leak observed
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  lustre/lvfs/lvfs_linux.c
5  *  Lustre filesystem abstraction routines
6  *
7  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8  *   Author: Andreas Dilger <adilger@clusterfs.com>
9  *
10  *   This file is part of Lustre, http://www.lustre.org.
11  *
12  *   Lustre is free software; you can redistribute it and/or
13  *   modify it under the terms of version 2 of the GNU General Public
14  *   License as published by the Free Software Foundation.
15  *
16  *   Lustre is distributed in the hope that it will be useful,
17  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
18  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  *   GNU General Public License for more details.
20  *
21  *   You should have received a copy of the GNU General Public License
22  *   along with Lustre; if not, write to the Free Software
23  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24  */
25
26 #ifndef EXPORT_SYMTAB
27 # define EXPORT_SYMTAB
28 #endif
29
30 #define DEBUG_SUBSYSTEM S_FILTER
31
32 #include <linux/version.h>
33 #include <linux/fs.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <linux/lustre_fsfilt.h>
41 #include <linux/obd.h>
42 #include <linux/obd_class.h>
43 #include <linux/module.h>
44 #include <linux/init.h>
45 #include <linux/lustre_compat25.h>
46 #include <linux/lvfs.h>
47 #include "lvfs_internal.h"
48
49 #include <linux/obd.h>
50 #include <linux/lustre_lib.h>
51 #include <linux/lustre_mds.h>   /* for mds_grp_hash_entry */
52
53 atomic_t obd_memory;
54 int obd_memmax;
55
/*
 * Context sanity checks.  They expand to real assertions only when
 * OBD_CTXT_DEBUG is defined (a development aid) and to empty statements
 * otherwise.
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic)       do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg)    do {} while(0)
# define ASSERT_KERNEL_CTXT(msg)        do {} while(0)
#else
# define ASSERT_CTXT_MAGIC(magic) \
        LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) \
        LASSERTF(!segment_eq(get_fs(), get_ds()), msg)
# define ASSERT_KERNEL_CTXT(msg) \
        LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#endif
68
69 static void push_group_info(struct lvfs_run_ctxt *save,
70                             struct group_info *ginfo)
71 {
72         if (!ginfo) {
73                 save->ngroups = current_ngroups;
74                 current_ngroups = 0;
75         } else {
76 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
77                 task_lock(current);
78                 save->group_info = current->group_info;
79                 current->group_info = ginfo;
80                 task_unlock(current);
81 #else
82                 LASSERT(ginfo->ngroups <= NGROUPS);
83                 /* save old */
84                 save->group_info.ngroups = current->ngroups;
85                 if (current->ngroups)
86                         memcpy(save->group_info.small_block, current->groups,
87                                current->ngroups);
88                 /* push new */
89                 current->ngroups = ginfo->ngroups;
90                 if (ginfo->ngroups)
91                         memcpy(current->groups, ginfo->small_block,
92                                current->ngroups);
93 #endif
94         }
95 }
96
97 static void pop_group_info(struct lvfs_run_ctxt *save,
98                            struct group_info *ginfo)
99 {
100         if (!ginfo) {
101                 current_ngroups = save->ngroups;
102         } else {
103 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
104                 task_lock(current);
105                 current->group_info = save->group_info;
106                 task_unlock(current);
107 #else
108                 current->ngroups = ginfo->ngroups;
109                 if (current->ngroups)
110                         memcpy(current->groups, save->group_info.small_block,
111                                current->ngroups);
112 #endif
113         }
114 }
115
116 /* push / pop to root of obd store */
117 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
118                struct lvfs_ucred *uc)
119 {
120         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
121         ASSERT_CTXT_MAGIC(new_ctx->magic);
122         LASSERT(save->magic != OBD_RUN_CTXT_MAGIC || save->pid != current->pid);
123         OBD_SET_CTXT_MAGIC(save);
124         save->pid = current->pid;
125
126         /*
127         CDEBUG(D_INFO,
128                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
129                save, current, current->fs, current->fs->pwd,
130                atomic_read(&current->fs->pwd->d_count),
131                atomic_read(&current->fs->pwd->d_inode->i_count),
132                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
133                current->fs->pwdmnt,
134                atomic_read(&current->fs->pwdmnt->mnt_count));
135         */
136
137         save->fs = get_fs();
138         LASSERT(atomic_read(&current->fs->pwd->d_count));
139         LASSERT(atomic_read(&new_ctx->pwd->d_count));
140         save->pwd = dget(current->fs->pwd);
141         save->pwdmnt = mntget(current->fs->pwdmnt);
142         save->ngroups = current_ngroups;
143         save->luc.luc_umask = current->fs->umask;
144
145         LASSERT(save->pwd);
146         LASSERT(save->pwdmnt);
147         LASSERT(new_ctx->pwd);
148         LASSERT(new_ctx->pwdmnt);
149
150         if (uc) {
151                 save->luc.luc_uid = current->uid;
152                 save->luc.luc_gid = current->gid;
153                 save->luc.luc_fsuid = current->fsuid;
154                 save->luc.luc_fsgid = current->fsgid;
155                 save->luc.luc_cap = current->cap_effective;
156
157                 current->uid = uc->luc_uid;
158                 current->gid = uc->luc_gid;
159                 current->fsuid = uc->luc_fsuid;
160                 current->fsgid = uc->luc_fsgid;
161                 current->cap_effective = uc->luc_cap;
162
163                 push_group_info(save, uc->luc_ginfo);
164         }
165         current->fs->umask = 0; /* umask already applied on client */
166         set_fs(new_ctx->fs);
167         set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
168
169         /*
170         CDEBUG(D_INFO,
171                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
172                new_ctx, current, current->fs, current->fs->pwd,
173                atomic_read(&current->fs->pwd->d_count),
174                atomic_read(&current->fs->pwd->d_inode->i_count),
175                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
176                current->fs->pwdmnt,
177                atomic_read(&current->fs->pwdmnt->mnt_count));
178         */
179 }
180 EXPORT_SYMBOL(push_ctxt);
181
182 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
183               struct lvfs_ucred *uc)
184 {
185         //printk("pc0");
186         ASSERT_CTXT_MAGIC(saved->magic);
187         LASSERT(saved->pid == current->pid);
188         saved->magic = 0;
189         saved->pid = 0;
190         //printk("pc1");
191         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
192
193         /*
194         CDEBUG(D_INFO,
195                " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
196                new_ctx, current, current->fs, current->fs->pwd,
197                atomic_read(&current->fs->pwd->d_count),
198                atomic_read(&current->fs->pwd->d_inode->i_count),
199                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
200                current->fs->pwdmnt,
201                atomic_read(&current->fs->pwdmnt->mnt_count));
202         */
203
204         LASSERT(current->fs->pwd == new_ctx->pwd);
205         LASSERT(current->fs->pwdmnt == new_ctx->pwdmnt);
206
207         set_fs(saved->fs);
208         set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
209
210         dput(saved->pwd);
211         mntput(saved->pwdmnt);
212         current->fs->umask = saved->luc.luc_umask;
213         if (uc) {
214                 current->uid = saved->luc.luc_uid;
215                 current->gid = saved->luc.luc_gid;
216                 current->fsuid = saved->luc.luc_fsuid;
217                 current->fsgid = saved->luc.luc_fsgid;
218                 current->cap_effective = saved->luc.luc_cap;
219
220                 pop_group_info(saved, uc->luc_ginfo);
221         }
222
223         /*
224         CDEBUG(D_INFO,
225                "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
226                saved, current, current->fs, current->fs->pwd,
227                atomic_read(&current->fs->pwd->d_count),
228                atomic_read(&current->fs->pwd->d_inode->i_count),
229                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
230                current->fs->pwdmnt,
231                atomic_read(&current->fs->pwdmnt->mnt_count));
232         */
233 }
234 EXPORT_SYMBOL(pop_ctxt);
235
236 /* utility to make a file */
237 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
238 {
239         struct dentry *dchild;
240         int err = 0;
241         ENTRY;
242
243         ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
244         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
245
246         dchild = ll_lookup_one_len(name, dir, strlen(name));
247         if (IS_ERR(dchild))
248                 GOTO(out_up, dchild);
249
250         if (dchild->d_inode) {
251                 int old_mode = dchild->d_inode->i_mode;
252                 if (!S_ISREG(old_mode))
253                         GOTO(out_err, err = -EEXIST);
254
255                 /* Fixup file permissions if necessary */
256                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
257                         CWARN("fixing permissions on %s from %o to %o\n",
258                               name, old_mode, mode);
259                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
260                                                   (old_mode & ~S_IALLUGO);
261                         mark_inode_dirty(dchild->d_inode);
262                 }
263                 GOTO(out_up, dchild);
264         }
265
266         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
267                             NULL);
268         if (err)
269                 GOTO(out_err, err);
270
271         RETURN(dchild);
272
273 out_err:
274         dput(dchild);
275         dchild = ERR_PTR(err);
276 out_up:
277         return dchild;
278 }
279 EXPORT_SYMBOL(simple_mknod);
280
281 /* utility to make a directory */
282 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
283 {
284         struct dentry *dchild;
285         int err = 0;
286         ENTRY;
287
288         ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
289         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
290         dchild = ll_lookup_one_len(name, dir, strlen(name));
291         if (IS_ERR(dchild))
292                 GOTO(out_up, dchild);
293
294         if (dchild->d_inode) {
295                 int old_mode = dchild->d_inode->i_mode;
296                 if (!S_ISDIR(old_mode)) {
297                         CERROR("found %s (%lu/%u) is mode %o\n", name,
298                                dchild->d_inode->i_ino,
299                                dchild->d_inode->i_generation, old_mode);
300                         GOTO(out_err, err = -ENOTDIR);
301                 }
302
303                 /* Fixup directory permissions if necessary */
304                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
305                         CWARN("fixing permissions on %s from %o to %o\n",
306                               name, old_mode, mode);
307                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
308                                                   (old_mode & ~S_IALLUGO);
309                         mark_inode_dirty(dchild->d_inode);
310                 }
311                 GOTO(out_up, dchild);
312         }
313
314         err = vfs_mkdir(dir->d_inode, dchild, mode);
315         if (err)
316                 GOTO(out_err, err);
317
318         RETURN(dchild);
319
320 out_err:
321         dput(dchild);
322         dchild = ERR_PTR(err);
323 out_up:
324         return dchild;
325 }
326 EXPORT_SYMBOL(simple_mkdir);
327
328 /*
329  * Read a file from within kernel context.  Prior to calling this
330  * function we should already have done a push_ctxt().
331  */
332 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
333 {
334         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
335         if (!file || !file->f_op || !file->f_op->read || !off)
336                 RETURN(-ENOSYS);
337
338         return file->f_op->read(file, buf, len, off);
339 }
340 EXPORT_SYMBOL(lustre_fread);
341
342 /*
343  * Write a file from within kernel context.  Prior to calling this
344  * function we should already have done a push_ctxt().
345  */
346 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
347 {
348         ENTRY;
349         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
350         if (!file)
351                 RETURN(-ENOENT);
352         if (!file->f_op)
353                 RETURN(-ENOSYS);
354         if (!off)
355                 RETURN(-EINVAL);
356
357         if (!file->f_op->write)
358                 RETURN(-EROFS);
359
360         RETURN(file->f_op->write(file, buf, len, off));
361 }
362 EXPORT_SYMBOL(lustre_fwrite);
363
364 /*
365  * Sync a file from within kernel context.  Prior to calling this
366  * function we should already have done a push_ctxt().
367  */
368 int lustre_fsync(struct file *file)
369 {
370         ENTRY;
371         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
372         if (!file || !file->f_op || !file->f_op->fsync)
373                 RETURN(-ENOSYS);
374
375         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
376 }
377 EXPORT_SYMBOL(lustre_fsync);
378
379 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
380                              int flags)
381 {
382         mntget(ctxt->pwdmnt);
383         return dentry_open(de, ctxt->pwdmnt, flags);
384 }
385 EXPORT_SYMBOL(l_dentry_open);
386
387 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
388                      ino_t ino, unsigned int d_type)
389 {
390         struct l_linux_dirent *dirent;
391         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
392         
393         dirent = buf->lrc_dirent;
394         if (dirent)
395                dirent->lld_off = offset; 
396
397         OBD_ALLOC(dirent, sizeof(*dirent));
398
399         list_add_tail(&dirent->lld_list, buf->lrc_list);
400
401         buf->lrc_dirent = dirent;
402         dirent->lld_ino = ino;
403         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
404         memcpy(dirent->lld_name, name, namlen);
405
406         return 0;
407 }
408
409 long l_readdir(struct file *file, struct list_head *dentry_list)
410 {
411         struct l_linux_dirent *lastdirent;
412         struct l_readdir_callback buf;
413         int error;
414
415         buf.lrc_dirent = NULL;
416         buf.lrc_list = dentry_list; 
417
418         error = vfs_readdir(file, l_filldir, &buf);
419         if (error < 0)
420                 return error;
421
422         lastdirent = buf.lrc_dirent;
423         if (lastdirent)
424                 lastdirent->lld_off = file->f_pos;
425
426         return 0; 
427 }
428 EXPORT_SYMBOL(l_readdir);
429 EXPORT_SYMBOL(obd_memory);
430 EXPORT_SYMBOL(obd_memmax);
431
432 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
433 static spinlock_t obd_memlist_lock = SPIN_LOCK_UNLOCKED;
434 static struct hlist_head *obd_memtable;
435 static unsigned long obd_memtable_size;
436
437 static int lvfs_memdbg_init(int size)
438 {
439         struct hlist_head *head;
440         int i;
441
442         LASSERT(size > sizeof(sizeof(struct hlist_head)));
443         obd_memtable_size = size / sizeof(struct hlist_head);
444
445         CWARN("allocating %lu malloc entries...\n",
446               (unsigned long)obd_memtable_size);
447
448         obd_memtable = kmalloc(size, GFP_KERNEL);
449         if (!obd_memtable)
450                 return -ENOMEM;
451
452         i = obd_memtable_size;
453         head = obd_memtable;
454         do {
455                 INIT_HLIST_HEAD(head);
456                 head++;
457                 i--;
458         } while(i);
459
460         return 0;
461 }
462
463 static int lvfs_memdbg_cleanup(void)
464 {
465         struct hlist_node *node = NULL, *tmp = NULL;
466         struct hlist_head *head;
467         struct mem_track *mt;
468         int i;
469
470         spin_lock(&obd_memlist_lock);
471         for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
472                 hlist_for_each_safe(node, tmp, head) {
473                         mt = hlist_entry(node, struct mem_track, m_hash);
474                         hlist_del_init(&mt->m_hash);
475                         kfree(mt);
476                 }
477         }
478         spin_unlock(&obd_memlist_lock);
479         kfree(obd_memtable);
480         return 0;
481 }
482
483 static inline unsigned long const hashfn(void *ptr)
484 {
485         return (unsigned long)ptr &
486                 (obd_memtable_size - 1);
487 }
488
489 static void __lvfs_memdbg_insert(struct mem_track *mt)
490 {
491         struct hlist_head *head = obd_memtable +
492                 hashfn(mt->m_ptr);
493         hlist_add_head(&mt->m_hash, head);
494 }
495
496 void lvfs_memdbg_insert(struct mem_track *mt)
497 {
498         spin_lock(&obd_memlist_lock);
499         __lvfs_memdbg_insert(mt);
500         spin_unlock(&obd_memlist_lock);
501 }
502 EXPORT_SYMBOL(lvfs_memdbg_insert);
503
504 static void __lvfs_memdbg_remove(struct mem_track *mt)
505 {
506         hlist_del_init(&mt->m_hash);
507 }
508
509 void lvfs_memdbg_remove(struct mem_track *mt)
510 {
511         spin_lock(&obd_memlist_lock);
512         __lvfs_memdbg_remove(mt);
513         spin_unlock(&obd_memlist_lock);
514 }
515 EXPORT_SYMBOL(lvfs_memdbg_remove);
516
517 static struct mem_track *__lvfs_memdbg_find(void *ptr)
518 {
519         struct hlist_node *node = NULL;
520         struct mem_track *mt = NULL;
521         struct hlist_head *head;
522
523         head = obd_memtable + hashfn(ptr);
524
525         hlist_for_each(node, head) {
526                 mt = hlist_entry(node, struct mem_track, m_hash);
527                 if ((unsigned long)mt->m_ptr == (unsigned long)ptr)
528                         break;
529                 mt = NULL;
530         }
531         return mt;
532 }
533
534 struct mem_track *lvfs_memdbg_find(void *ptr)
535 {
536         struct mem_track *mt;
537
538         spin_lock(&obd_memlist_lock);
539         mt = __lvfs_memdbg_find(ptr);
540         spin_unlock(&obd_memlist_lock);
541         
542         return mt;
543 }
544 EXPORT_SYMBOL(lvfs_memdbg_find);
545
546 int lvfs_memdbg_check_insert(struct mem_track *mt)
547 {
548         spin_lock(&obd_memlist_lock);
549         if (!__lvfs_memdbg_find(mt->m_ptr)) {
550                 __lvfs_memdbg_insert(mt);
551                 spin_unlock(&obd_memlist_lock);
552                 return 1;
553         }
554         spin_unlock(&obd_memlist_lock);
555         return 0;
556 }
557 EXPORT_SYMBOL(lvfs_memdbg_check_insert);
558
559 struct mem_track *
560 lvfs_memdbg_check_remove(void *ptr)
561 {
562         struct mem_track *mt;
563
564         spin_lock(&obd_memlist_lock);
565         mt = __lvfs_memdbg_find(ptr);
566         if (mt) {
567                 __lvfs_memdbg_remove(mt);
568                 spin_unlock(&obd_memlist_lock);
569                 return mt;
570         }
571         spin_unlock(&obd_memlist_lock);
572         return NULL;
573 }
574 EXPORT_SYMBOL(lvfs_memdbg_check_remove);
575 #endif
576
577 void lvfs_memdbg_show(void)
578 {
579 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
580         struct hlist_node *node = NULL;
581         struct hlist_head *head;
582         struct mem_track *mt;
583 #endif
584         int leaked;
585         
586 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
587         int i;
588 #endif
589
590         leaked = atomic_read(&obd_memory);
591
592         if (leaked > 0) {
593                 CWARN("memory leaks detected (max %d, leaked %d)\n",
594                       obd_memmax, leaked);
595
596 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
597                 spin_lock(&obd_memlist_lock);
598                 for (i = 0, head = obd_memtable; i < obd_memtable_size; i++, head++) {
599                         hlist_for_each(node, head) {
600                                 mt = hlist_entry(node, struct mem_track, m_hash);
601                                 CWARN("  ptr: 0x%p, size: %d, src at \"%s\"\n",
602                                       mt->m_ptr, mt->m_size, mt->m_loc);
603                         }
604                 }
605                 spin_unlock(&obd_memlist_lock);
606 #endif
607                 /* remove for production */
608                 portals_debug_dumplog();
609         }
610 }
611 EXPORT_SYMBOL(lvfs_memdbg_show);
612
613 static int __init lvfs_linux_init(void)
614 {
615         ENTRY;
616 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
617         lvfs_memdbg_init(PAGE_SIZE);
618 #endif
619         lvfs_mount_list_init();
620         RETURN(0);
621 }
622
623 static void __exit lvfs_linux_exit(void)
624 {
625         ENTRY;
626
627         lvfs_mount_list_cleanup();
628         lvfs_memdbg_show();
629
630 #if defined (CONFIG_DEBUG_MEMORY) && defined(__KERNEL__)
631         lvfs_memdbg_cleanup();
632 #endif
633         EXIT;
634         return;
635 }
636
637 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
638 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
639 MODULE_LICENSE("GPL");
640
641 module_init(lvfs_linux_init);
642 module_exit(lvfs_linux_exit);