Whamcloud - gitweb
b=16098
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see [sun.com URL with a
20  * copy of GPLv2].
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #ifndef EXPORT_SYMTAB
42 # define EXPORT_SYMTAB
43 #endif
44
45 #define DEBUG_SUBSYSTEM S_FILTER
46
47 #include <linux/version.h>
48 #include <linux/fs.h>
49 #include <asm/unistd.h>
50 #include <linux/slab.h>
51 #include <linux/pagemap.h>
52 #include <linux/quotaops.h>
53 #include <linux/version.h>
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/module.h>
58 #include <linux/init.h>
59 #include <linux/lustre_compat25.h>
60 #include <lvfs.h>
61 #include "lvfs_internal.h"
62
63 #include <obd.h>
64 #include <lustre_lib.h>
65 #include <lustre_quota.h>
66
67 __u64 obd_max_pages = 0;
68 __u64 obd_max_alloc = 0;
69 struct lprocfs_stats *obd_memory = NULL;
70 spinlock_t obd_updatemax_lock = SPIN_LOCK_UNLOCKED;
71 /* refine later and change to seqlock or simlar from libcfs */
72
/*
 * Context sanity checks, only needed during development.
 *
 * Without OBD_CTXT_DEBUG all three macros expand to empty statements and do
 * not evaluate their argument.  With OBD_CTXT_DEBUG they assert on the
 * run-context magic and on whether get_fs() matches get_ds().
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#else
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) \
        LASSERTF(!segment_eq(get_fs(), get_ds()), msg)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#endif
84
85 static void push_group_info(struct lvfs_run_ctxt *save,
86                             struct group_info *ginfo)
87 {
88         if (!ginfo) {
89                 save->ngroups = current_ngroups;
90                 current_ngroups = 0;
91         } else {
92                 task_lock(current);
93                 save->group_info = current->group_info;
94                 current->group_info = ginfo;
95                 task_unlock(current);
96         }
97 }
98
99 static void pop_group_info(struct lvfs_run_ctxt *save,
100                            struct group_info *ginfo)
101 {
102         if (!ginfo) {
103                 current_ngroups = save->ngroups;
104         } else {
105                 task_lock(current);
106                 current->group_info = save->group_info;
107                 task_unlock(current);
108         }
109 }
110
111 /* push / pop to root of obd store */
112 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
113                struct lvfs_ucred *uc)
114 {
115         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
116         ASSERT_CTXT_MAGIC(new_ctx->magic);
117         OBD_SET_CTXT_MAGIC(save);
118
119         /*
120         CDEBUG(D_INFO,
121                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
122                save, current, current->fs, current->fs->pwd,
123                atomic_read(&current->fs->pwd->d_count),
124                atomic_read(&current->fs->pwd->d_inode->i_count),
125                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
126                current->fs->pwdmnt,
127                atomic_read(&current->fs->pwdmnt->mnt_count));
128         */
129
130         save->fs = get_fs();
131         LASSERT(atomic_read(&current->fs->pwd->d_count));
132         LASSERT(atomic_read(&new_ctx->pwd->d_count));
133         save->pwd = dget(current->fs->pwd);
134         save->pwdmnt = mntget(current->fs->pwdmnt);
135         save->luc.luc_umask = current->fs->umask;
136         save->ngroups = current->group_info->ngroups;
137
138         LASSERT(save->pwd);
139         LASSERT(save->pwdmnt);
140         LASSERT(new_ctx->pwd);
141         LASSERT(new_ctx->pwdmnt);
142
143         if (uc) {
144                 save->luc.luc_uid = current->uid;
145                 save->luc.luc_gid = current->gid;
146                 save->luc.luc_fsuid = current->fsuid;
147                 save->luc.luc_fsgid = current->fsgid;
148                 save->luc.luc_cap = current->cap_effective;
149
150                 current->uid = uc->luc_uid;
151                 current->gid = uc->luc_gid;
152                 current->fsuid = uc->luc_fsuid;
153                 current->fsgid = uc->luc_fsgid;
154                 current->cap_effective = uc->luc_cap;
155
156                 push_group_info(save,
157                                 uc->luc_ginfo ?:
158                                 uc->luc_identity ? uc->luc_identity->mi_ginfo :
159                                                    NULL);
160         }
161         current->fs->umask = 0; /* umask already applied on client */
162         set_fs(new_ctx->fs);
163         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
164
165         /*
166         CDEBUG(D_INFO,
167                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
168                new_ctx, current, current->fs, current->fs->pwd,
169                atomic_read(&current->fs->pwd->d_count),
170                atomic_read(&current->fs->pwd->d_inode->i_count),
171                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
172                current->fs->pwdmnt,
173                atomic_read(&current->fs->pwdmnt->mnt_count));
174         */
175 }
176 EXPORT_SYMBOL(push_ctxt);
177
178 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
179               struct lvfs_ucred *uc)
180 {
181         //printk("pc0");
182         ASSERT_CTXT_MAGIC(saved->magic);
183         //printk("pc1");
184         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
185
186         /*
187         CDEBUG(D_INFO,
188                " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
189                new_ctx, current, current->fs, current->fs->pwd,
190                atomic_read(&current->fs->pwd->d_count),
191                atomic_read(&current->fs->pwd->d_inode->i_count),
192                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
193                current->fs->pwdmnt,
194                atomic_read(&current->fs->pwdmnt->mnt_count));
195         */
196
197         LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
198                  current->fs->pwd, new_ctx->pwd);
199         LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
200                  current->fs->pwdmnt, new_ctx->pwdmnt);
201
202         set_fs(saved->fs);
203         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
204
205         dput(saved->pwd);
206         mntput(saved->pwdmnt);
207         current->fs->umask = saved->luc.luc_umask;
208         if (uc) {
209                 current->uid = saved->luc.luc_uid;
210                 current->gid = saved->luc.luc_gid;
211                 current->fsuid = saved->luc.luc_fsuid;
212                 current->fsgid = saved->luc.luc_fsgid;
213                 current->cap_effective = saved->luc.luc_cap;
214                 pop_group_info(saved,
215                                uc->luc_ginfo ?:
216                                uc->luc_identity ? uc->luc_identity->mi_ginfo :
217                                                   NULL);
218         }
219
220         /*
221         CDEBUG(D_INFO,
222                "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
223                saved, current, current->fs, current->fs->pwd,
224                atomic_read(&current->fs->pwd->d_count),
225                atomic_read(&current->fs->pwd->d_inode->i_count),
226                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
227                current->fs->pwdmnt,
228                atomic_read(&current->fs->pwdmnt->mnt_count));
229         */
230 }
231 EXPORT_SYMBOL(pop_ctxt);
232
233 /* utility to make a file */
234 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
235 {
236         struct dentry *dchild;
237         int err = 0;
238         ENTRY;
239
240         // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
241         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
242
243         dchild = ll_lookup_one_len(name, dir, strlen(name));
244         if (IS_ERR(dchild))
245                 GOTO(out_up, dchild);
246
247         if (dchild->d_inode) {
248                 int old_mode = dchild->d_inode->i_mode;
249                 if (!S_ISREG(old_mode))
250                         GOTO(out_err, err = -EEXIST);
251
252                 /* Fixup file permissions if necessary */
253                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
254                         CWARN("fixing permissions on %s from %o to %o\n",
255                               name, old_mode, mode);
256                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
257                                                   (old_mode & ~S_IALLUGO);
258                         mark_inode_dirty(dchild->d_inode);
259                 }
260                 GOTO(out_up, dchild);
261         }
262
263         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
264                             NULL);
265         if (err)
266                 GOTO(out_err, err);
267
268         RETURN(dchild);
269
270 out_err:
271         dput(dchild);
272         dchild = ERR_PTR(err);
273 out_up:
274         return dchild;
275 }
276 EXPORT_SYMBOL(simple_mknod);
277
278 /* utility to make a directory */
279 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
280 {
281         struct dentry *dchild;
282         int err = 0;
283         ENTRY;
284
285         // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
286         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
287         dchild = ll_lookup_one_len(name, dir, strlen(name));
288         if (IS_ERR(dchild))
289                 GOTO(out_up, dchild);
290
291         if (dchild->d_inode) {
292                 int old_mode = dchild->d_inode->i_mode;
293                 if (!S_ISDIR(old_mode)) {
294                         CERROR("found %s (%lu/%u) is mode %o\n", name,
295                                dchild->d_inode->i_ino,
296                                dchild->d_inode->i_generation, old_mode);
297                         GOTO(out_err, err = -ENOTDIR);
298                 }
299
300                 /* Fixup directory permissions if necessary */
301                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
302                         CDEBUG(D_CONFIG, 
303                                "fixing permissions on %s from %o to %o\n",
304                                name, old_mode, mode);
305                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
306                                                   (old_mode & ~S_IALLUGO);
307                         mark_inode_dirty(dchild->d_inode);
308                 }
309                 GOTO(out_up, dchild);
310         }
311
312         err = vfs_mkdir(dir->d_inode, dchild, mode);
313         if (err)
314                 GOTO(out_err, err);
315
316         RETURN(dchild);
317
318 out_err:
319         dput(dchild);
320         dchild = ERR_PTR(err);
321 out_up:
322         return dchild;
323 }
324 EXPORT_SYMBOL(simple_mkdir);
325
326 /* utility to rename a file */
327 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
328 {
329         struct dentry *dchild_old, *dchild_new;
330         int err = 0;
331         ENTRY;
332
333         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
334         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
335                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
336
337         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
338         if (IS_ERR(dchild_old))
339                 RETURN(PTR_ERR(dchild_old));
340
341         if (!dchild_old->d_inode) 
342                 GOTO(put_old, err = -ENOENT);
343
344         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
345         if (IS_ERR(dchild_new))
346                 GOTO(put_old, err = PTR_ERR(dchild_new));
347
348         err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
349
350         dput(dchild_new);
351 put_old:
352         dput(dchild_old);
353         RETURN(err);
354 }
355 EXPORT_SYMBOL(lustre_rename);
356
357 /*
358  * Read a file from within kernel context.  Prior to calling this
359  * function we should already have done a push_ctxt().
360  */
361 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
362 {
363         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
364         if (!file || !file->f_op || !file->f_op->read || !off)
365                 RETURN(-ENOSYS);
366
367         return file->f_op->read(file, buf, len, off);
368 }
369 EXPORT_SYMBOL(lustre_fread);
370
371 /*
372  * Write a file from within kernel context.  Prior to calling this
373  * function we should already have done a push_ctxt().
374  */
375 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
376 {
377         ENTRY;
378         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
379         if (!file)
380                 RETURN(-ENOENT);
381         if (!file->f_op)
382                 RETURN(-ENOSYS);
383         if (!off)
384                 RETURN(-EINVAL);
385
386         if (!file->f_op->write)
387                 RETURN(-EROFS);
388
389         RETURN(file->f_op->write(file, buf, len, off));
390 }
391 EXPORT_SYMBOL(lustre_fwrite);
392
393 /*
394  * Sync a file from within kernel context.  Prior to calling this
395  * function we should already have done a push_ctxt().
396  */
397 int lustre_fsync(struct file *file)
398 {
399         ENTRY;
400         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
401         if (!file || !file->f_op || !file->f_op->fsync)
402                 RETURN(-ENOSYS);
403
404         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
405 }
406 EXPORT_SYMBOL(lustre_fsync);
407
408 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
409                              int flags)
410 {
411         mntget(ctxt->pwdmnt);
412         return dentry_open(de, ctxt->pwdmnt, flags);
413 }
414 EXPORT_SYMBOL(l_dentry_open);
415
416 #ifdef HAVE_VFS_READDIR_U64_INO
417 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
418                      u64 ino, unsigned int d_type)
419 #else
420 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
421                      ino_t ino, unsigned int d_type)
422 #endif
423 {
424         struct l_linux_dirent *dirent;
425         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
426
427         dirent = buf->lrc_dirent;
428         if (dirent)
429                dirent->lld_off = offset;
430
431         OBD_ALLOC(dirent, sizeof(*dirent));
432
433         if (!dirent)
434                 return -ENOMEM;
435
436         list_add_tail(&dirent->lld_list, buf->lrc_list);
437
438         buf->lrc_dirent = dirent;
439         dirent->lld_ino = ino;
440         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
441         memcpy(dirent->lld_name, name, namlen);
442
443         return 0;
444 }
445
446 long l_readdir(struct file *file, struct list_head *dentry_list)
447 {
448         struct l_linux_dirent *lastdirent;
449         struct l_readdir_callback buf;
450         int error;
451
452         buf.lrc_dirent = NULL;
453         buf.lrc_list = dentry_list;
454
455         error = vfs_readdir(file, l_filldir, &buf);
456         if (error < 0)
457                 return error;
458
459         lastdirent = buf.lrc_dirent;
460         if (lastdirent)
461                 lastdirent->lld_off = file->f_pos;
462
463         return 0;
464 }
465 EXPORT_SYMBOL(l_readdir);
466
467 #ifdef LUSTRE_KERNEL_VERSION
468 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
469 #error rdonly patchset must be updated [cfs bz11248]
470 #endif
471 void dev_set_rdonly(lvfs_sbdev_type dev);
472 int dev_check_rdonly(lvfs_sbdev_type dev);
473
474 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
475 {
476         lvfs_sbdev_sync(dev);
477         if (jdev && (jdev != dev)) {
478                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
479                        (long)jdev);
480                 dev_set_rdonly(jdev);
481         }
482         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
483         dev_set_rdonly(dev);
484 }
485
486 int lvfs_check_rdonly(lvfs_sbdev_type dev)
487 {
488         return dev_check_rdonly(dev);
489 }
490
491 EXPORT_SYMBOL(__lvfs_set_rdonly);
492 EXPORT_SYMBOL(lvfs_check_rdonly);
493
494 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
495 {
496         char *write_page = NULL;
497         loff_t offset = 0;
498         int rc = 0;
499         ENTRY;
500
501         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
502         if (!write_page)
503                 RETURN(-ENOMEM);
504
505         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
506
507         OBD_FREE(write_page, CFS_PAGE_SIZE);
508
509         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
510         RETURN(rc);
511 }
512 EXPORT_SYMBOL(lvfs_check_io_health);
513 #endif /* LUSTRE_KERNEL_VERSION */
514
515 void obd_update_maxusage()
516 {
517         __u64 max1, max2;
518
519         max1 = obd_pages_sum();
520         max2 = obd_memory_sum();
521
522         spin_lock(&obd_updatemax_lock);
523         if (max1 > obd_max_pages)
524                 obd_max_pages = max1;
525         if (max2 > obd_max_alloc)
526                 obd_max_alloc = max2;
527         spin_unlock(&obd_updatemax_lock);
528         
529 }
530
531 __u64 obd_memory_max(void)
532 {
533         __u64 ret;
534
535         spin_lock(&obd_updatemax_lock);
536         ret = obd_max_alloc;
537         spin_unlock(&obd_updatemax_lock);
538
539         return ret;
540 }
541
542 __u64 obd_pages_max(void)
543 {
544         __u64 ret;
545
546         spin_lock(&obd_updatemax_lock);
547         ret = obd_max_pages;
548         spin_unlock(&obd_updatemax_lock);
549
550         return ret;
551 }
552
553 EXPORT_SYMBOL(obd_update_maxusage);
554 EXPORT_SYMBOL(obd_pages_max);
555 EXPORT_SYMBOL(obd_memory_max);
556 EXPORT_SYMBOL(obd_memory);
557
#ifdef LPROCFS
/*
 * Read one field of the stats counter @lc without taking a lock.
 *
 * The la_entry counter is sampled before the field is read; the read is
 * repeated while that sample differs from both the current la_entry and the
 * current la_exit value.
 *
 * Returns the selected field, or 0 when @lc is NULL or @field is not one of
 * the handled LPROCFS_FIELDS_FLAGS_* values.
 *
 * NOTE(review): LPROCFS_FIELDS_FLAGS_AVG yields half the spread between
 * lc_max and lc_min, not lc_sum / lc_count -- confirm this is intended.
 * NOTE(review): the retry condition joins the two comparisons with &&, so
 * the loop stops as soon as either one matches -- confirm against the
 * writer side that this is the intended consistency check.
 */
__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
                          enum lprocfs_fields_flags field)
{
        __s64 value = 0;
        int seen;

        if (lc == NULL)
                RETURN(0);

        do {
                seen = atomic_read(&lc->lc_cntl.la_entry);

                switch (field) {
                case LPROCFS_FIELDS_FLAGS_CONFIG:
                        value = lc->lc_config;
                        break;
                case LPROCFS_FIELDS_FLAGS_SUM:
                        value = lc->lc_sum;
                        break;
                case LPROCFS_FIELDS_FLAGS_MIN:
                        value = lc->lc_min;
                        break;
                case LPROCFS_FIELDS_FLAGS_MAX:
                        value = lc->lc_max;
                        break;
                case LPROCFS_FIELDS_FLAGS_AVG:
                        value = (lc->lc_max - lc->lc_min)/2;
                        break;
                case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
                        value = lc->lc_sumsquare;
                        break;
                case LPROCFS_FIELDS_FLAGS_COUNT:
                        value = lc->lc_count;
                        break;
                default:
                        /* unknown field: keep whatever value holds */
                        break;
                }
        } while (seen != atomic_read(&lc->lc_cntl.la_entry) &&
                 seen != atomic_read(&lc->lc_cntl.la_exit));

        RETURN(value);
}
EXPORT_SYMBOL(lprocfs_read_helper);
#endif /* LPROCFS */
602
603 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
604 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
605 MODULE_LICENSE("GPL");