Whamcloud - gitweb
b=15253 add failover nidlist to proc import
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #ifndef EXPORT_SYMTAB
42 # define EXPORT_SYMTAB
43 #endif
44
45 #define DEBUG_SUBSYSTEM S_FILTER
46
47 #include <linux/version.h>
48 #include <linux/fs.h>
49 #include <asm/unistd.h>
50 #include <linux/slab.h>
51 #include <linux/pagemap.h>
52 #include <linux/quotaops.h>
53 #include <linux/version.h>
54 #include <libcfs/kp30.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/module.h>
58 #include <linux/init.h>
59 #include <linux/lustre_compat25.h>
60 #include <lvfs.h>
61 #include "lvfs_internal.h"
62
63 #include <obd.h>
64 #include <lustre_lib.h>
65 #include <lustre_quota.h>
66
/*
 * Context sanity assertions.  They expand to real LASSERT/LASSERTF checks
 * only when OBD_CTXT_DEBUG is defined (intended for development builds);
 * otherwise they compile down to empty statements.
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic)       do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg)    do {} while(0)
# define ASSERT_KERNEL_CTXT(msg)        do {} while(0)
#else
# define ASSERT_CTXT_MAGIC(magic) \
        LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) \
        LASSERTF(!segment_eq(get_fs(), get_ds()), msg)
# define ASSERT_KERNEL_CTXT(msg) \
        LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#endif
78
79 static void push_group_info(struct lvfs_run_ctxt *save,
80                             struct upcall_cache_entry *uce)
81 {
82         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
83
84         if (!ginfo) {
85                 save->ngroups = current_ngroups;
86                 current_ngroups = 0;
87         } else {
88 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
89                 struct cred *cred;
90                 task_lock(current);
91                 save->group_info = current_cred()->group_info;
92                 if ((cred = prepare_creds())) {
93                         cred->group_info = ginfo;
94                         commit_creds(cred);
95                 }
96                 task_unlock(current);
97 #else
98                 LASSERT(ginfo->ngroups <= NGROUPS);
99                 LASSERT(current->ngroups <= NGROUPS_SMALL);
100                 /* save old */
101                 save->group_info.ngroups = current->ngroups;
102                 if (current->ngroups)
103                         memcpy(save->group_info.small_block, current->groups,
104                                current->ngroups * sizeof(gid_t));
105                 /* push new */
106                 current->ngroups = ginfo->ngroups;
107                 if (ginfo->ngroups)
108                         memcpy(current->groups, ginfo->small_block,
109                                current->ngroups * sizeof(gid_t));
110 #endif
111         }
112 }
113
114 static void pop_group_info(struct lvfs_run_ctxt *save,
115                            struct upcall_cache_entry *uce)
116 {
117         struct group_info *ginfo = uce ? uce->ue_group_info : NULL;
118
119         if (!ginfo) {
120                 current_ngroups = save->ngroups;
121         } else {
122 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
123                 struct cred *cred;
124                 task_lock(current);
125                 if ((cred = prepare_creds())) {
126                         cred->group_info = save->group_info;
127                         commit_creds(cred);
128                 }
129                 task_unlock(current);
130 #else
131                 current->ngroups = save->group_info.ngroups;
132                 if (current->ngroups)
133                         memcpy(current->groups, save->group_info.small_block,
134                                current->ngroups * sizeof(gid_t));
135 #endif
136         }
137 }
138
139 /* push / pop to root of obd store */
140 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
141                struct lvfs_ucred *uc)
142 {
143         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
144         ASSERT_CTXT_MAGIC(new_ctx->magic);
145         OBD_SET_CTXT_MAGIC(save);
146
147         /*
148         CDEBUG(D_INFO,
149                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
150                save, current, current->fs, current->fs->pwd,
151                atomic_read(&current->fs->pwd->d_count),
152                atomic_read(&current->fs->pwd->d_inode->i_count),
153                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
154                current->fs->pwdmnt,
155                atomic_read(&current->fs->pwdmnt->mnt_count));
156         */
157
158         save->fs = get_fs();
159         LASSERT(atomic_read(&cfs_fs_pwd(current->fs)->d_count));
160         LASSERT(atomic_read(&new_ctx->pwd->d_count));
161         save->pwd = dget(cfs_fs_pwd(current->fs));
162         save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
163         save->luc.luc_umask = current->fs->umask;
164
165         LASSERT(save->pwd);
166         LASSERT(save->pwdmnt);
167         LASSERT(new_ctx->pwd);
168         LASSERT(new_ctx->pwdmnt);
169
170         if (uc) {
171                 struct cred *cred;
172                 save->luc.luc_fsuid = current_fsuid();
173                 save->luc.luc_fsgid = current_fsgid();
174                 save->luc.luc_cap = current_cap();
175
176                 if ((cred = prepare_creds())) {
177                         cred->fsuid = uc->luc_fsuid;
178                         cred->fsgid = uc->luc_fsgid;
179                         cred->cap_effective = uc->luc_cap;
180                         commit_creds(cred);
181                 }
182
183                 push_group_info(save, uc->luc_uce);
184         }
185         current->fs->umask = 0; /* umask already applied on client */
186         set_fs(new_ctx->fs);
187         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
188
189         /*
190         CDEBUG(D_INFO,
191                "= push %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
192                new_ctx, current, current->fs, current->fs->pwd,
193                atomic_read(&current->fs->pwd->d_count),
194                atomic_read(&current->fs->pwd->d_inode->i_count),
195                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
196                current->fs->pwdmnt,
197                atomic_read(&current->fs->pwdmnt->mnt_count));
198         */
199 }
200 EXPORT_SYMBOL(push_ctxt);
201
202 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
203               struct lvfs_ucred *uc)
204 {
205         //printk("pc0");
206         ASSERT_CTXT_MAGIC(saved->magic);
207         //printk("pc1");
208         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
209
210         /*
211         CDEBUG(D_INFO,
212                " = pop  %p==%p = cur %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
213                new_ctx, current, current->fs, current->fs->pwd,
214                atomic_read(&current->fs->pwd->d_count),
215                atomic_read(&current->fs->pwd->d_inode->i_count),
216                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
217                current->fs->pwdmnt,
218                atomic_read(&current->fs->pwdmnt->mnt_count));
219         */
220
221         LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
222                  cfs_fs_pwd(current->fs), new_ctx->pwd);
223         LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
224                  cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
225
226         set_fs(saved->fs);
227         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
228
229         dput(saved->pwd);
230         mntput(saved->pwdmnt);
231         current->fs->umask = saved->luc.luc_umask;
232         if (uc) {
233                 struct cred *cred;
234                 if ((cred = prepare_creds())) {
235                         cred->fsuid = saved->luc.luc_fsuid;
236                         cred->fsgid = saved->luc.luc_fsgid;
237                         cred->cap_effective = saved->luc.luc_cap;
238                         commit_creds(cred);
239                 }
240
241                 pop_group_info(saved, uc->luc_uce);
242         }
243
244         /*
245         CDEBUG(D_INFO,
246                "= pop  %p->%p = cur fs %p pwd %p:d%d:i%d (%.*s), pwdmnt %p:%d\n",
247                saved, current, current->fs, current->fs->pwd,
248                atomic_read(&current->fs->pwd->d_count),
249                atomic_read(&current->fs->pwd->d_inode->i_count),
250                current->fs->pwd->d_name.len, current->fs->pwd->d_name.name,
251                current->fs->pwdmnt,
252                atomic_read(&current->fs->pwdmnt->mnt_count));
253         */
254 }
255 EXPORT_SYMBOL(pop_ctxt);
256
257 /* utility to make a file */
258 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
259 {
260         struct dentry *dchild;
261         int err = 0;
262         ENTRY;
263
264         ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
265         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
266
267         dchild = ll_lookup_one_len(name, dir, strlen(name));
268         if (IS_ERR(dchild))
269                 GOTO(out_up, dchild);
270
271         if (dchild->d_inode) {
272                 int old_mode = dchild->d_inode->i_mode;
273                 if (!S_ISREG(old_mode))
274                         GOTO(out_err, err = -EEXIST);
275
276                 /* Fixup file permissions if necessary */
277                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
278                         CWARN("fixing permissions on %s from %o to %o\n",
279                               name, old_mode, mode);
280                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
281                                                   (old_mode & ~S_IALLUGO);
282                         mark_inode_dirty(dchild->d_inode);
283                 }
284                 GOTO(out_up, dchild);
285         }
286
287         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
288                             NULL);
289         if (err)
290                 GOTO(out_err, err);
291
292         RETURN(dchild);
293
294 out_err:
295         dput(dchild);
296         dchild = ERR_PTR(err);
297 out_up:
298         return dchild;
299 }
300 EXPORT_SYMBOL(simple_mknod);
301
302 /* utility to make a directory */
303 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
304                             char *name, int mode, int fix)
305 {
306         struct dentry *dchild;
307         int err = 0;
308         ENTRY;
309
310         ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
311         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
312         dchild = ll_lookup_one_len(name, dir, strlen(name));
313         if (IS_ERR(dchild))
314                 GOTO(out_up, dchild);
315
316         if (dchild->d_inode) {
317                 int old_mode = dchild->d_inode->i_mode;
318                 if (!S_ISDIR(old_mode)) {
319                         CERROR("found %s (%lu/%u) is mode %o\n", name,
320                                dchild->d_inode->i_ino,
321                                dchild->d_inode->i_generation, old_mode);
322                         GOTO(out_err, err = -ENOTDIR);
323                 }
324
325                 /* Fixup directory permissions if necessary */
326                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
327                         CDEBUG(D_CONFIG, 
328                                "fixing permissions on %s from %o to %o\n",
329                                name, old_mode, mode);
330                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
331                                                   (old_mode & ~S_IALLUGO);
332                         mark_inode_dirty(dchild->d_inode);
333                 }
334                 GOTO(out_up, dchild);
335         }
336
337         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
338         if (err)
339                 GOTO(out_err, err);
340
341         RETURN(dchild);
342
343 out_err:
344         dput(dchild);
345         dchild = ERR_PTR(err);
346 out_up:
347         return dchild;
348 }
349 EXPORT_SYMBOL(simple_mkdir);
350
351 /* utility to rename a file */
352 int lustre_rename(struct dentry *dir, struct vfsmount *mnt, 
353                   char *oldname, char *newname)
354 {
355         struct dentry *dchild_old, *dchild_new;
356         int err = 0;
357         ENTRY;
358
359         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
360         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
361                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
362
363         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
364         if (IS_ERR(dchild_old))
365                 RETURN(PTR_ERR(dchild_old));
366
367         if (!dchild_old->d_inode) 
368                 GOTO(put_old, err = -ENOENT);
369
370         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
371         if (IS_ERR(dchild_new))
372                 GOTO(put_old, err = PTR_ERR(dchild_new));
373
374         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, 
375                             dir->d_inode, dchild_new, mnt);
376
377         dput(dchild_new);
378 put_old:
379         dput(dchild_old);
380         RETURN(err);
381 }
382 EXPORT_SYMBOL(lustre_rename);
383
384 /*
385  * Read a file from within kernel context.  Prior to calling this
386  * function we should already have done a push_ctxt().
387  */
388 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
389 {
390         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
391         if (!file || !file->f_op || !file->f_op->read || !off)
392                 RETURN(-ENOSYS);
393
394         return file->f_op->read(file, buf, len, off);
395 }
396 EXPORT_SYMBOL(lustre_fread);
397
398 /*
399  * Write a file from within kernel context.  Prior to calling this
400  * function we should already have done a push_ctxt().
401  */
402 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
403 {
404         ENTRY;
405         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
406         if (!file)
407                 RETURN(-ENOENT);
408         if (!file->f_op)
409                 RETURN(-ENOSYS);
410         if (!off)
411                 RETURN(-EINVAL);
412
413         if (!file->f_op->write)
414                 RETURN(-EROFS);
415
416         RETURN(file->f_op->write(file, buf, len, off));
417 }
418 EXPORT_SYMBOL(lustre_fwrite);
419
420 /*
421  * Sync a file from within kernel context.  Prior to calling this
422  * function we should already have done a push_ctxt().
423  */
424 int lustre_fsync(struct file *file)
425 {
426         ENTRY;
427         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
428         if (!file || !file->f_op || !file->f_op->fsync)
429                 RETURN(-ENOSYS);
430
431         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
432 }
433 EXPORT_SYMBOL(lustre_fsync);
434
435 /* Note:  dput(dchild) will be called if there is an error */
436 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
437                              int flags)
438 {
439         mntget(ctxt->pwdmnt);
440         return ll_dentry_open(de, ctxt->pwdmnt, flags, current_cred());
441 }
442 EXPORT_SYMBOL(l_dentry_open);
443
444 #ifdef HAVE_VFS_READDIR_U64_INO
445 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
446                      u64 ino, unsigned int d_type)
447 #else
448 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
449                      ino_t ino, unsigned int d_type)
450 #endif
451 {
452         struct l_linux_dirent *dirent;
453         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
454
455         dirent = buf->lrc_dirent;
456         if (dirent)
457                dirent->lld_off = offset;
458
459         OBD_ALLOC(dirent, sizeof(*dirent));
460
461         if (!dirent)
462                 return -ENOMEM;
463
464         list_add_tail(&dirent->lld_list, buf->lrc_list);
465
466         buf->lrc_dirent = dirent;
467         dirent->lld_ino = ino;
468         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
469         memcpy(dirent->lld_name, name, namlen);
470
471         return 0;
472 }
473
474 long l_readdir(struct file *file, struct list_head *dentry_list)
475 {
476         struct l_linux_dirent *lastdirent;
477         struct l_readdir_callback buf;
478         int error;
479
480         buf.lrc_dirent = NULL;
481         buf.lrc_list = dentry_list; 
482
483         error = vfs_readdir(file, l_filldir, &buf);
484         if (error < 0)
485                 return error;
486
487         lastdirent = buf.lrc_dirent;
488         if (lastdirent)
489                 lastdirent->lld_off = file->f_pos;
490
491         return 0; 
492 }
493 EXPORT_SYMBOL(l_readdir);
494
495 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
496                     struct iattr *newattrs)
497 {
498         int rc;
499
500         LOCK_INODE_MUTEX(dchild->d_inode);
501 #ifdef HAVE_SECURITY_PLUG
502         rc = notify_change(dchild, mnt, newattrs);
503 #else
504         rc = notify_change(dchild, newattrs);
505 #endif
506         UNLOCK_INODE_MUTEX(dchild->d_inode);
507         return rc;
508 }
509 EXPORT_SYMBOL(l_notify_change);
510
511 /* utility to truncate a file */
512 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
513                     char *name, loff_t length)
514 {
515         struct dentry *dchild;
516         struct iattr newattrs;
517         int err = 0;
518         ENTRY;
519
520         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
521                name, (long long)length);
522         dchild = ll_lookup_one_len(name, dir, strlen(name));
523         if (IS_ERR(dchild))
524                 GOTO(out, err = PTR_ERR(dchild));
525
526         if (dchild->d_inode) {
527                 int old_mode = dchild->d_inode->i_mode;
528                 if (S_ISDIR(old_mode)) {
529                         CERROR("found %s (%lu/%u) is mode %o\n", name,
530                                dchild->d_inode->i_ino,
531                                dchild->d_inode->i_generation, old_mode);
532                         GOTO(out_dput, err = -EISDIR);
533                 }
534
535                 newattrs.ia_size = length;
536                 newattrs.ia_valid = ATTR_SIZE;
537                 err = l_notify_change(mnt, dchild, &newattrs);
538         }
539         EXIT;
540 out_dput:
541         dput(dchild);
542 out:
543         return err;
544 }
545 EXPORT_SYMBOL(simple_truncate);
546
547 #ifdef LUSTRE_KERNEL_VERSION
548 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
549 #error rdonly patchset must be updated [cfs bz11248]
550 #endif
551
552 void dev_set_rdonly(lvfs_sbdev_type dev);
553 int dev_check_rdonly(lvfs_sbdev_type dev);
554
555 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
556 {
557         lvfs_sbdev_sync(dev);
558         if (jdev && (jdev != dev)) {
559                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
560                        (long)jdev);
561                 dev_set_rdonly(jdev);
562         }
563         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
564         dev_set_rdonly(dev);
565 }
566
567 int lvfs_check_rdonly(lvfs_sbdev_type dev)
568 {
569         return dev_check_rdonly(dev);
570 }
571
572 EXPORT_SYMBOL(__lvfs_set_rdonly);
573 EXPORT_SYMBOL(lvfs_check_rdonly);
574 #endif /* LUSTRE_KERNEL_VERSION */
575
576 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
577 {
578         char *write_page = NULL;
579         loff_t offset = 0;
580         int rc = 0;
581         ENTRY;
582
583         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
584         if (!write_page)
585                 RETURN(-ENOMEM);
586         
587         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
588        
589         OBD_FREE(write_page, CFS_PAGE_SIZE);
590
591         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
592         RETURN(rc); 
593 }
594 EXPORT_SYMBOL(lvfs_check_io_health);
595
596 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
597 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
598 MODULE_LICENSE("GPL");