Whamcloud - gitweb
beea8baa9186a63db3f1f2a5188738a827033dfd
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
 *  lustre/lvfs/lvfs_linux.c
5  *  Lustre filesystem abstraction routines
6  *
7  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
8  *   Author: Andreas Dilger <adilger@clusterfs.com>
9  *
10  *   This file is part of Lustre, http://www.lustre.org.
11  *
12  *   Lustre is free software; you can redistribute it and/or
13  *   modify it under the terms of version 2 of the GNU General Public
14  *   License as published by the Free Software Foundation.
15  *
16  *   Lustre is distributed in the hope that it will be useful,
17  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
18  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  *   GNU General Public License for more details.
20  *
21  *   You should have received a copy of the GNU General Public License
22  *   along with Lustre; if not, write to the Free Software
23  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24  */
25
26 #ifndef EXPORT_SYMTAB
27 # define EXPORT_SYMTAB
28 #endif
29
30 #define DEBUG_SUBSYSTEM S_FILTER
31
32 #include <linux/version.h>
33 #include <linux/fs.h>
34 #include <asm/unistd.h>
35 #include <linux/slab.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/version.h>
39 #include <libcfs/kp30.h>
40 #include <lustre_fsfilt.h>
41 #include <obd.h>
42 #include <linux/module.h>
43 #include <linux/init.h>
44 #include <linux/lustre_compat25.h>
45 #include <lvfs.h>
46 #include "lvfs_internal.h"
47
48 #include <obd.h>
49 #include <lustre_lib.h>
50 #include <lustre_quota.h>
51
/*
 * Debugging checks only needed during development.
 *
 * When OBD_CTXT_DEBUG is defined:
 *  - ASSERT_CTXT_MAGIC(magic) asserts that \a magic equals OBD_RUN_CTXT_MAGIC;
 *  - ASSERT_NOT_KERNEL_CTXT(msg) asserts that get_fs() differs from get_ds();
 *  - ASSERT_KERNEL_CTXT(msg) asserts that get_fs() equals get_ds().
 * \a msg is passed to LASSERTF() as the format string for a failed assertion.
 *
 * Without OBD_CTXT_DEBUG all three macros expand to empty statements.
 */
#ifdef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
                                              msg)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#else
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#endif
63
64 static void push_group_info(struct lvfs_run_ctxt *save,
65                             struct upcall_cache_entry *uce)
66 {
67         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
68
69         if (!ginfo) {
70                 save->ngroups = current_ngroups;
71                 current_ngroups = 0;
72         } else {
73 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
74                 task_lock(current);
75                 save->group_info = current->group_info;
76                 current->group_info = ginfo;
77                 task_unlock(current);
78 #else
79                 LASSERT(ginfo->ngroups <= NGROUPS);
80                 LASSERT(current->ngroups <= NGROUPS_SMALL);
81                 /* save old */
82                 save->group_info.ngroups = current->ngroups;
83                 if (current->ngroups)
84                         memcpy(save->group_info.small_block, current->groups,
85                                current->ngroups * sizeof(gid_t));
86                 /* push new */
87                 current->ngroups = ginfo->ngroups;
88                 if (ginfo->ngroups)
89                         memcpy(current->groups, ginfo->small_block,
90                                current->ngroups * sizeof(gid_t));
91 #endif
92         }
93 }
94
95 static void pop_group_info(struct lvfs_run_ctxt *save,
96                            struct upcall_cache_entry *uce)
97 {
98         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
99
100         if (!ginfo) {
101                 current_ngroups = save->ngroups;
102         } else {
103 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
104                 task_lock(current);
105                 current->group_info = save->group_info;
106                 task_unlock(current);
107 #else
108                 current->ngroups = save->group_info.ngroups;
109                 if (current->ngroups)
110                         memcpy(current->groups, save->group_info.small_block,
111                                current->ngroups * sizeof(gid_t));
112 #endif
113         }
114 }
115
116 /* push / pop to root of obd store */
117 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
118                struct lvfs_ucred *uc)
119 {
120         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
121         ASSERT_CTXT_MAGIC(new_ctx->magic);
122         OBD_SET_CTXT_MAGIC(save);
123
124         /*
125         CDEBUG(D_INFO,
126                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
127                save, current, current->fs, current->fs->pwd,
128                atomic_read(&current->fs->pwd->d_count),
129                atomic_read(&current->fs->pwd->d_inode->i_count),
130                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
131                current->fs->pwdmnt,
132                atomic_read(&current->fs->pwdmnt->mnt_count));
133         */
134
135         save->fs = get_fs();
136         LASSERT(atomic_read(&current->fs->pwd->d_count));
137         LASSERT(atomic_read(&new_ctx->pwd->d_count));
138         save->pwd = dget(current->fs->pwd);
139         save->pwdmnt = mntget(current->fs->pwdmnt);
140         save->luc.luc_umask = current->fs->umask;
141
142         LASSERT(save->pwd);
143         LASSERT(save->pwdmnt);
144         LASSERT(new_ctx->pwd);
145         LASSERT(new_ctx->pwdmnt);
146
147         if (uc) {
148                 save->luc.luc_fsuid = current->fsuid;
149                 save->luc.luc_fsgid = current->fsgid;
150                 save->luc.luc_cap = current->cap_effective;
151
152                 current->fsuid = uc->luc_fsuid;
153                 current->fsgid = uc->luc_fsgid;
154                 current->cap_effective = uc->luc_cap;
155                 push_group_info(save, uc->luc_uce);
156         }
157         current->fs->umask = 0; /* umask already applied on client */
158         set_fs(new_ctx->fs);
159         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
160
161         /*
162         CDEBUG(D_INFO,
163                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
164                new_ctx, current, current->fs, current->fs->pwd,
165                atomic_read(&current->fs->pwd->d_count),
166                atomic_read(&current->fs->pwd->d_inode->i_count),
167                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
168                current->fs->pwdmnt,
169                atomic_read(&current->fs->pwdmnt->mnt_count));
170         */
171 }
172 EXPORT_SYMBOL(push_ctxt);
173
174 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
175               struct lvfs_ucred *uc)
176 {
177         //printk("pc0");
178         ASSERT_CTXT_MAGIC(saved->magic);
179         //printk("pc1");
180         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
181
182         /*
183         CDEBUG(D_INFO,
184                " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
185                new_ctx, current, current->fs, current->fs->pwd,
186                atomic_read(&current->fs->pwd->d_count),
187                atomic_read(&current->fs->pwd->d_inode->i_count),
188                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
189                current->fs->pwdmnt,
190                atomic_read(&current->fs->pwdmnt->mnt_count));
191         */
192
193         LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
194                  current->fs->pwd, new_ctx->pwd);
195         LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
196                  current->fs->pwdmnt, new_ctx->pwdmnt);
197
198         set_fs(saved->fs);
199         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
200
201         dput(saved->pwd);
202         mntput(saved->pwdmnt);
203         current->fs->umask = saved->luc.luc_umask;
204         if (uc) {
205                 current->fsuid = saved->luc.luc_fsuid;
206                 current->fsgid = saved->luc.luc_fsgid;
207                 current->cap_effective = saved->luc.luc_cap;
208                 pop_group_info(saved, uc->luc_uce);
209         }
210
211         /*
212         CDEBUG(D_INFO,
213                "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
214                saved, current, current->fs, current->fs->pwd,
215                atomic_read(&current->fs->pwd->d_count),
216                atomic_read(&current->fs->pwd->d_inode->i_count),
217                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
218                current->fs->pwdmnt,
219                atomic_read(&current->fs->pwdmnt->mnt_count));
220         */
221 }
222 EXPORT_SYMBOL(pop_ctxt);
223
224 /* utility to make a file */
225 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
226 {
227         struct dentry *dchild;
228         int err = 0;
229         ENTRY;
230
231         ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
232         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
233
234         dchild = ll_lookup_one_len(name, dir, strlen(name));
235         if (IS_ERR(dchild))
236                 GOTO(out_up, dchild);
237
238         if (dchild->d_inode) {
239                 int old_mode = dchild->d_inode->i_mode;
240                 if (!S_ISREG(old_mode))
241                         GOTO(out_err, err = -EEXIST);
242
243                 /* Fixup file permissions if necessary */
244                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
245                         CWARN("fixing permissions on %s from %o to %o\n",
246                               name, old_mode, mode);
247                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
248                                                   (old_mode & ~S_IALLUGO);
249                         mark_inode_dirty(dchild->d_inode);
250                 }
251                 GOTO(out_up, dchild);
252         }
253
254         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
255                             NULL);
256         if (err)
257                 GOTO(out_err, err);
258
259         RETURN(dchild);
260
261 out_err:
262         dput(dchild);
263         dchild = ERR_PTR(err);
264 out_up:
265         return dchild;
266 }
267 EXPORT_SYMBOL(simple_mknod);
268
269 /* utility to make a directory */
270 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode, int fix)
271 {
272         struct dentry *dchild;
273         int err = 0;
274         ENTRY;
275
276         ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
277         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
278         dchild = ll_lookup_one_len(name, dir, strlen(name));
279         if (IS_ERR(dchild))
280                 GOTO(out_up, dchild);
281
282         if (dchild->d_inode) {
283                 int old_mode = dchild->d_inode->i_mode;
284                 if (!S_ISDIR(old_mode)) {
285                         CERROR("found %s (%lu/%u) is mode %o\n", name,
286                                dchild->d_inode->i_ino,
287                                dchild->d_inode->i_generation, old_mode);
288                         GOTO(out_err, err = -ENOTDIR);
289                 }
290
291                 /* Fixup directory permissions if necessary */
292                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
293                         CDEBUG(D_CONFIG, 
294                                "fixing permissions on %s from %o to %o\n",
295                                name, old_mode, mode);
296                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
297                                                   (old_mode & ~S_IALLUGO);
298                         mark_inode_dirty(dchild->d_inode);
299                 }
300                 GOTO(out_up, dchild);
301         }
302
303         err = vfs_mkdir(dir->d_inode, dchild, mode);
304         if (err)
305                 GOTO(out_err, err);
306
307         RETURN(dchild);
308
309 out_err:
310         dput(dchild);
311         dchild = ERR_PTR(err);
312 out_up:
313         return dchild;
314 }
315 EXPORT_SYMBOL(simple_mkdir);
316
317 /* utility to rename a file */
318 int lustre_rename(struct dentry *dir, char *oldname, char *newname)
319 {
320         struct dentry *dchild_old, *dchild_new;
321         int err = 0;
322         ENTRY;
323
324         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
325         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
326                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
327
328         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
329         if (IS_ERR(dchild_old))
330                 RETURN(PTR_ERR(dchild_old));
331
332         if (!dchild_old->d_inode) 
333                 GOTO(put_old, err = -ENOENT);
334
335         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
336         if (IS_ERR(dchild_new))
337                 GOTO(put_old, err = PTR_ERR(dchild_new));
338
339         err = vfs_rename(dir->d_inode, dchild_old, dir->d_inode, dchild_new);
340
341         dput(dchild_new);
342 put_old:
343         dput(dchild_old);
344         RETURN(err);
345 }
346 EXPORT_SYMBOL(lustre_rename);
347
348 /*
349  * Read a file from within kernel context.  Prior to calling this
350  * function we should already have done a push_ctxt().
351  */
352 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
353 {
354         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
355         if (!file || !file->f_op || !file->f_op->read || !off)
356                 RETURN(-ENOSYS);
357
358         return file->f_op->read(file, buf, len, off);
359 }
360 EXPORT_SYMBOL(lustre_fread);
361
362 /*
363  * Write a file from within kernel context.  Prior to calling this
364  * function we should already have done a push_ctxt().
365  */
366 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
367 {
368         ENTRY;
369         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
370         if (!file)
371                 RETURN(-ENOENT);
372         if (!file->f_op)
373                 RETURN(-ENOSYS);
374         if (!off)
375                 RETURN(-EINVAL);
376
377         if (!file->f_op->write)
378                 RETURN(-EROFS);
379
380         RETURN(file->f_op->write(file, buf, len, off));
381 }
382 EXPORT_SYMBOL(lustre_fwrite);
383
384 /*
385  * Sync a file from within kernel context.  Prior to calling this
386  * function we should already have done a push_ctxt().
387  */
388 int lustre_fsync(struct file *file)
389 {
390         ENTRY;
391         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
392         if (!file || !file->f_op || !file->f_op->fsync)
393                 RETURN(-ENOSYS);
394
395         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
396 }
397 EXPORT_SYMBOL(lustre_fsync);
398
399 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
400                              int flags)
401 {
402         mntget(ctxt->pwdmnt);
403         return dentry_open(de, ctxt->pwdmnt, flags);
404 }
405 EXPORT_SYMBOL(l_dentry_open);
406
407 #ifdef HAVE_VFS_READDIR_U64_INO
408 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
409                      u64 ino, unsigned int d_type)
410 #else
411 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
412                      ino_t ino, unsigned int d_type)
413 #endif
414 {
415         struct l_linux_dirent *dirent;
416         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
417
418         dirent = buf->lrc_dirent;
419         if (dirent)
420                dirent->lld_off = offset;
421
422         OBD_ALLOC(dirent, sizeof(*dirent));
423
424         if (!dirent)
425                 return -ENOMEM;
426
427         list_add_tail(&dirent->lld_list, buf->lrc_list);
428
429         buf->lrc_dirent = dirent;
430         dirent->lld_ino = ino;
431         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
432         memcpy(dirent->lld_name, name, namlen);
433
434         return 0;
435 }
436
437 long l_readdir(struct file *file, struct list_head *dentry_list)
438 {
439         struct l_linux_dirent *lastdirent;
440         struct l_readdir_callback buf;
441         int error;
442
443         buf.lrc_dirent = NULL;
444         buf.lrc_list = dentry_list; 
445
446         error = vfs_readdir(file, l_filldir, &buf);
447         if (error < 0)
448                 return error;
449
450         lastdirent = buf.lrc_dirent;
451         if (lastdirent)
452                 lastdirent->lld_off = file->f_pos;
453
454         return 0; 
455 }
456 EXPORT_SYMBOL(l_readdir);
457
458
459 #ifdef LUSTRE_KERNEL_VERSION
460 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
461 #error rdonly patchset must be updated [cfs bz11248]
462 #endif
463
464 void dev_set_rdonly(lvfs_sbdev_type dev);
465 int dev_check_rdonly(lvfs_sbdev_type dev);
466
467 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
468 {
469         lvfs_sbdev_sync(dev);
470         if (jdev && (jdev != dev)) {
471                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
472                        (long)jdev);
473                 dev_set_rdonly(jdev);
474         }
475         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
476         dev_set_rdonly(dev);
477 }
478
479 int lvfs_check_rdonly(lvfs_sbdev_type dev)
480 {
481         return dev_check_rdonly(dev);
482 }
483
484 EXPORT_SYMBOL(__lvfs_set_rdonly);
485 EXPORT_SYMBOL(lvfs_check_rdonly);
486 #endif /* LUSTRE_KERNEL_VERSION */
487
488 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
489 {
490         char *write_page = NULL;
491         loff_t offset = 0;
492         int rc = 0;
493         ENTRY;
494
495         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
496         if (!write_page)
497                 RETURN(-ENOMEM);
498         
499         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
500        
501         OBD_FREE(write_page, CFS_PAGE_SIZE);
502
503         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
504         RETURN(rc); 
505 }
506 EXPORT_SYMBOL(lvfs_check_io_health);
507
/* Kernel module metadata: author, description and license declarations. */
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
MODULE_LICENSE("GPL");