Whamcloud - gitweb
Land b1_8_gate onto b1_8 (20081218_1708)
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #ifndef EXPORT_SYMTAB
42 # define EXPORT_SYMTAB
43 #endif
44
45 #define DEBUG_SUBSYSTEM S_FILTER
46
47 #include <linux/version.h>
48 #include <linux/fs.h>
49 #include <asm/unistd.h>
50 #include <linux/slab.h>
51 #include <linux/pagemap.h>
52 #include <linux/quotaops.h>
53 #include <linux/version.h>
54 #include <libcfs/kp30.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/module.h>
58 #include <linux/init.h>
59 #include <linux/lustre_compat25.h>
60 #include <lvfs.h>
61 #include "lvfs_internal.h"
62
63 #include <obd.h>
64 #include <lustre_lib.h>
65 #include <lustre_quota.h>
66
/*
 * Context sanity checks, intended as a development aid: they expand to
 * real assertions only when OBD_CTXT_DEBUG is defined and to empty
 * statements otherwise.
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#else
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
                                              msg)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#endif
78
79 static void push_group_info(struct lvfs_run_ctxt *save,
80                             struct upcall_cache_entry *uce)
81 {
82         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
83
84         if (!ginfo) {
85                 save->ngroups = current_ngroups;
86                 current_ngroups = 0;
87         } else {
88 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
89                 task_lock(current);
90                 save->group_info = current->group_info;
91                 current->group_info = ginfo;
92                 task_unlock(current);
93 #else
94                 LASSERT(ginfo->ngroups <= NGROUPS);
95                 LASSERT(current->ngroups <= NGROUPS_SMALL);
96                 /* save old */
97                 save->group_info.ngroups = current->ngroups;
98                 if (current->ngroups)
99                         memcpy(save->group_info.small_block, current->groups,
100                                current->ngroups * sizeof(gid_t));
101                 /* push new */
102                 current->ngroups = ginfo->ngroups;
103                 if (ginfo->ngroups)
104                         memcpy(current->groups, ginfo->small_block,
105                                current->ngroups * sizeof(gid_t));
106 #endif
107         }
108 }
109
110 static void pop_group_info(struct lvfs_run_ctxt *save,
111                            struct upcall_cache_entry *uce)
112 {
113         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
114
115         if (!ginfo) {
116                 current_ngroups = save->ngroups;
117         } else {
118 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
119                 task_lock(current);
120                 current->group_info = save->group_info;
121                 task_unlock(current);
122 #else
123                 current->ngroups = save->group_info.ngroups;
124                 if (current->ngroups)
125                         memcpy(current->groups, save->group_info.small_block,
126                                current->ngroups * sizeof(gid_t));
127 #endif
128         }
129 }
130
131 /* push / pop to root of obd store */
132 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
133                struct lvfs_ucred *uc)
134 {
135         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
136         ASSERT_CTXT_MAGIC(new_ctx->magic);
137         OBD_SET_CTXT_MAGIC(save);
138
139         /*
140         CDEBUG(D_INFO,
141                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
142                save, current, current->fs, current->fs->pwd,
143                atomic_read(&current->fs->pwd->d_count),
144                atomic_read(&current->fs->pwd->d_inode->i_count),
145                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
146                current->fs->pwdmnt,
147                atomic_read(&current->fs->pwdmnt->mnt_count));
148         */
149
150         save->fs = get_fs();
151         LASSERT(atomic_read(&current->fs->pwd->d_count));
152         LASSERT(atomic_read(&new_ctx->pwd->d_count));
153         save->pwd = dget(current->fs->pwd);
154         save->pwdmnt = mntget(current->fs->pwdmnt);
155         save->luc.luc_umask = current->fs->umask;
156
157         LASSERT(save->pwd);
158         LASSERT(save->pwdmnt);
159         LASSERT(new_ctx->pwd);
160         LASSERT(new_ctx->pwdmnt);
161
162         if (uc) {
163                 save->luc.luc_fsuid = current->fsuid;
164                 save->luc.luc_fsgid = current->fsgid;
165                 save->luc.luc_cap = current->cap_effective;
166
167                 current->fsuid = uc->luc_fsuid;
168                 current->fsgid = uc->luc_fsgid;
169                 current->cap_effective = uc->luc_cap;
170                 push_group_info(save, uc->luc_uce);
171         }
172         current->fs->umask = 0; /* umask already applied on client */
173         set_fs(new_ctx->fs);
174         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
175
176         /*
177         CDEBUG(D_INFO,
178                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
179                new_ctx, current, current->fs, current->fs->pwd,
180                atomic_read(&current->fs->pwd->d_count),
181                atomic_read(&current->fs->pwd->d_inode->i_count),
182                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
183                current->fs->pwdmnt,
184                atomic_read(&current->fs->pwdmnt->mnt_count));
185         */
186 }
187 EXPORT_SYMBOL(push_ctxt);
188
189 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
190               struct lvfs_ucred *uc)
191 {
192         //printk("pc0");
193         ASSERT_CTXT_MAGIC(saved->magic);
194         //printk("pc1");
195         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
196
197         /*
198         CDEBUG(D_INFO,
199                " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
200                new_ctx, current, current->fs, current->fs->pwd,
201                atomic_read(&current->fs->pwd->d_count),
202                atomic_read(&current->fs->pwd->d_inode->i_count),
203                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
204                current->fs->pwdmnt,
205                atomic_read(&current->fs->pwdmnt->mnt_count));
206         */
207
208         LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
209                  current->fs->pwd, new_ctx->pwd);
210         LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
211                  current->fs->pwdmnt, new_ctx->pwdmnt);
212
213         set_fs(saved->fs);
214         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
215
216         dput(saved->pwd);
217         mntput(saved->pwdmnt);
218         current->fs->umask = saved->luc.luc_umask;
219         if (uc) {
220                 current->fsuid = saved->luc.luc_fsuid;
221                 current->fsgid = saved->luc.luc_fsgid;
222                 current->cap_effective = saved->luc.luc_cap;
223                 pop_group_info(saved, uc->luc_uce);
224         }
225
226         /*
227         CDEBUG(D_INFO,
228                "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
229                saved, current, current->fs, current->fs->pwd,
230                atomic_read(&current->fs->pwd->d_count),
231                atomic_read(&current->fs->pwd->d_inode->i_count),
232                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
233                current->fs->pwdmnt,
234                atomic_read(&current->fs->pwdmnt->mnt_count));
235         */
236 }
237 EXPORT_SYMBOL(pop_ctxt);
238
239 /* utility to make a file */
240 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
241 {
242         struct dentry *dchild;
243         int err = 0;
244         ENTRY;
245
246         ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
247         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
248
249         dchild = ll_lookup_one_len(name, dir, strlen(name));
250         if (IS_ERR(dchild))
251                 GOTO(out_up, dchild);
252
253         if (dchild->d_inode) {
254                 int old_mode = dchild->d_inode->i_mode;
255                 if (!S_ISREG(old_mode))
256                         GOTO(out_err, err = -EEXIST);
257
258                 /* Fixup file permissions if necessary */
259                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
260                         CWARN("fixing permissions on %s from %o to %o\n",
261                               name, old_mode, mode);
262                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
263                                                   (old_mode & ~S_IALLUGO);
264                         mark_inode_dirty(dchild->d_inode);
265                 }
266                 GOTO(out_up, dchild);
267         }
268
269         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
270                             NULL);
271         if (err)
272                 GOTO(out_err, err);
273
274         RETURN(dchild);
275
276 out_err:
277         dput(dchild);
278         dchild = ERR_PTR(err);
279 out_up:
280         return dchild;
281 }
282 EXPORT_SYMBOL(simple_mknod);
283
284 /* utility to make a directory */
285 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
286                             char *name, int mode, int fix)
287 {
288         struct dentry *dchild;
289         int err = 0;
290         ENTRY;
291
292         ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
293         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
294         dchild = ll_lookup_one_len(name, dir, strlen(name));
295         if (IS_ERR(dchild))
296                 GOTO(out_up, dchild);
297
298         if (dchild->d_inode) {
299                 int old_mode = dchild->d_inode->i_mode;
300                 if (!S_ISDIR(old_mode)) {
301                         CERROR("found %s (%lu/%u) is mode %o\n", name,
302                                dchild->d_inode->i_ino,
303                                dchild->d_inode->i_generation, old_mode);
304                         GOTO(out_err, err = -ENOTDIR);
305                 }
306
307                 /* Fixup directory permissions if necessary */
308                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
309                         CDEBUG(D_CONFIG, 
310                                "fixing permissions on %s from %o to %o\n",
311                                name, old_mode, mode);
312                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
313                                                   (old_mode & ~S_IALLUGO);
314                         mark_inode_dirty(dchild->d_inode);
315                 }
316                 GOTO(out_up, dchild);
317         }
318
319         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
320         if (err)
321                 GOTO(out_err, err);
322
323         RETURN(dchild);
324
325 out_err:
326         dput(dchild);
327         dchild = ERR_PTR(err);
328 out_up:
329         return dchild;
330 }
331 EXPORT_SYMBOL(simple_mkdir);
332
333 /* utility to rename a file */
334 int lustre_rename(struct dentry *dir, struct vfsmount *mnt, 
335                   char *oldname, char *newname)
336 {
337         struct dentry *dchild_old, *dchild_new;
338         int err = 0;
339         ENTRY;
340
341         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
342         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
343                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
344
345         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
346         if (IS_ERR(dchild_old))
347                 RETURN(PTR_ERR(dchild_old));
348
349         if (!dchild_old->d_inode) 
350                 GOTO(put_old, err = -ENOENT);
351
352         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
353         if (IS_ERR(dchild_new))
354                 GOTO(put_old, err = PTR_ERR(dchild_new));
355
356         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, 
357                             dir->d_inode, dchild_new, mnt);
358
359         dput(dchild_new);
360 put_old:
361         dput(dchild_old);
362         RETURN(err);
363 }
364 EXPORT_SYMBOL(lustre_rename);
365
366 /*
367  * Read a file from within kernel context.  Prior to calling this
368  * function we should already have done a push_ctxt().
369  */
370 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
371 {
372         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
373         if (!file || !file->f_op || !file->f_op->read || !off)
374                 RETURN(-ENOSYS);
375
376         return file->f_op->read(file, buf, len, off);
377 }
378 EXPORT_SYMBOL(lustre_fread);
379
380 /*
381  * Write a file from within kernel context.  Prior to calling this
382  * function we should already have done a push_ctxt().
383  */
384 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
385 {
386         ENTRY;
387         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
388         if (!file)
389                 RETURN(-ENOENT);
390         if (!file->f_op)
391                 RETURN(-ENOSYS);
392         if (!off)
393                 RETURN(-EINVAL);
394
395         if (!file->f_op->write)
396                 RETURN(-EROFS);
397
398         RETURN(file->f_op->write(file, buf, len, off));
399 }
400 EXPORT_SYMBOL(lustre_fwrite);
401
402 /*
403  * Sync a file from within kernel context.  Prior to calling this
404  * function we should already have done a push_ctxt().
405  */
406 int lustre_fsync(struct file *file)
407 {
408         ENTRY;
409         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
410         if (!file || !file->f_op || !file->f_op->fsync)
411                 RETURN(-ENOSYS);
412
413         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
414 }
415 EXPORT_SYMBOL(lustre_fsync);
416
417 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
418                              int flags)
419 {
420         mntget(ctxt->pwdmnt);
421         return dentry_open(de, ctxt->pwdmnt, flags);
422 }
423 EXPORT_SYMBOL(l_dentry_open);
424
425 #ifdef HAVE_VFS_READDIR_U64_INO
426 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
427                      u64 ino, unsigned int d_type)
428 #else
429 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
430                      ino_t ino, unsigned int d_type)
431 #endif
432 {
433         struct l_linux_dirent *dirent;
434         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
435
436         dirent = buf->lrc_dirent;
437         if (dirent)
438                dirent->lld_off = offset;
439
440         OBD_ALLOC(dirent, sizeof(*dirent));
441
442         if (!dirent)
443                 return -ENOMEM;
444
445         list_add_tail(&dirent->lld_list, buf->lrc_list);
446
447         buf->lrc_dirent = dirent;
448         dirent->lld_ino = ino;
449         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
450         memcpy(dirent->lld_name, name, namlen);
451
452         return 0;
453 }
454
455 long l_readdir(struct file *file, struct list_head *dentry_list)
456 {
457         struct l_linux_dirent *lastdirent;
458         struct l_readdir_callback buf;
459         int error;
460
461         buf.lrc_dirent = NULL;
462         buf.lrc_list = dentry_list; 
463
464         error = vfs_readdir(file, l_filldir, &buf);
465         if (error < 0)
466                 return error;
467
468         lastdirent = buf.lrc_dirent;
469         if (lastdirent)
470                 lastdirent->lld_off = file->f_pos;
471
472         return 0; 
473 }
474 EXPORT_SYMBOL(l_readdir);
475
476 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
477                     struct iattr *newattrs)
478 {
479         int rc;
480
481         LOCK_INODE_MUTEX(dchild->d_inode);
482 #ifdef HAVE_SECURITY_PLUG
483         rc = notify_change(dchild, mnt, newattrs);
484 #else
485         rc = notify_change(dchild, newattrs);
486 #endif
487         UNLOCK_INODE_MUTEX(dchild->d_inode);
488         return rc;
489 }
490 EXPORT_SYMBOL(l_notify_change);
491
492 /* utility to truncate a file */
493 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
494                     char *name, loff_t length)
495 {
496         struct dentry *dchild;
497         struct iattr newattrs;
498         int err = 0;
499         ENTRY;
500
501         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
502                name, (long long)length);
503         dchild = ll_lookup_one_len(name, dir, strlen(name));
504         if (IS_ERR(dchild))
505                 GOTO(out, err = PTR_ERR(dchild));
506
507         if (dchild->d_inode) {
508                 int old_mode = dchild->d_inode->i_mode;
509                 if (S_ISDIR(old_mode)) {
510                         CERROR("found %s (%lu/%u) is mode %o\n", name,
511                                dchild->d_inode->i_ino,
512                                dchild->d_inode->i_generation, old_mode);
513                         GOTO(out_dput, err = -EISDIR);
514                 }
515
516                 newattrs.ia_size = length;
517                 newattrs.ia_valid = ATTR_SIZE;
518                 err = l_notify_change(mnt, dchild, &newattrs);
519         }
520         EXIT;
521 out_dput:
522         dput(dchild);
523 out:
524         return err;
525 }
526 EXPORT_SYMBOL(simple_truncate);
527
528 #ifdef LUSTRE_KERNEL_VERSION
529 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
530 #error rdonly patchset must be updated [cfs bz11248]
531 #endif
532
533 void dev_set_rdonly(lvfs_sbdev_type dev);
534 int dev_check_rdonly(lvfs_sbdev_type dev);
535
536 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
537 {
538         lvfs_sbdev_sync(dev);
539         if (jdev && (jdev != dev)) {
540                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
541                        (long)jdev);
542                 dev_set_rdonly(jdev);
543         }
544         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
545         dev_set_rdonly(dev);
546 }
547
548 int lvfs_check_rdonly(lvfs_sbdev_type dev)
549 {
550         return dev_check_rdonly(dev);
551 }
552
553 EXPORT_SYMBOL(__lvfs_set_rdonly);
554 EXPORT_SYMBOL(lvfs_check_rdonly);
555 #endif /* LUSTRE_KERNEL_VERSION */
556
557 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
558 {
559         char *write_page = NULL;
560         loff_t offset = 0;
561         int rc = 0;
562         ENTRY;
563
564         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
565         if (!write_page)
566                 RETURN(-ENOMEM);
567         
568         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
569        
570         OBD_FREE(write_page, CFS_PAGE_SIZE);
571
572         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
573         RETURN(rc); 
574 }
575 EXPORT_SYMBOL(lvfs_check_io_health);
576
577 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
578 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
579 MODULE_LICENSE("GPL");