Whamcloud - gitweb
63d29926846d7f3893a9bce34f00d99c1dbee54b
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
30  * Use is subject to license terms.
31  *
32  * Copyright (c) 2011, 2012, Whamcloud, Inc.
33  */
34 /*
35  * This file is part of Lustre, http://www.lustre.org/
36  * Lustre is a trademark of Sun Microsystems, Inc.
37  *
38  * lustre/lvfs/lvfs_linux.c
39  *
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #ifndef EXPORT_SYMTAB
44 # define EXPORT_SYMTAB
45 #endif
46
47 #define DEBUG_SUBSYSTEM S_FILTER
48
49 #include <linux/version.h>
50 #include <linux/fs.h>
51 #include <asm/unistd.h>
52 #include <linux/slab.h>
53 #include <linux/pagemap.h>
54 #include <linux/quotaops.h>
55 #include <linux/version.h>
56 #include <libcfs/libcfs.h>
57 #include <lustre_fsfilt.h>
58 #include <obd.h>
59 #include <linux/module.h>
60 #include <linux/init.h>
61 #include <linux/lustre_compat25.h>
62 #include <lvfs.h>
63 #include "lvfs_internal.h"
64
65 #include <obd.h>
66 #include <lustre_lib.h>
67 #include <lustre_quota.h>
68
69 __u64 obd_max_pages = 0;
70 __u64 obd_max_alloc = 0;
71 struct lprocfs_stats *obd_memory = NULL;
72 cfs_spinlock_t obd_updatemax_lock = CFS_SPIN_LOCK_UNLOCKED;
73 /* refine later and change to seqlock or simlar from libcfs */
74
/*
 * Context sanity checks, only needed during development.  They expand to
 * real assertions when OBD_CTXT_DEBUG is defined and to empty statements
 * otherwise.
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) do {} while (0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while (0)
# define ASSERT_KERNEL_CTXT(msg) do {} while (0)
#else
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) \
        LASSERTF(!segment_eq(get_fs(), get_ds()), msg)
# define ASSERT_KERNEL_CTXT(msg) \
        LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#endif
86
87 static void push_group_info(struct lvfs_run_ctxt *save,
88                             struct group_info *ginfo)
89 {
90         if (!ginfo) {
91                 save->ngroups = current_ngroups;
92                 current_ngroups = 0;
93         } else {
94                 struct cred *cred;
95                 task_lock(current);
96                 save->group_info = current_cred()->group_info;
97                 if ((cred = prepare_creds())) {
98                         cred->group_info = ginfo;
99                         commit_creds(cred);
100                 }
101                 task_unlock(current);
102         }
103 }
104
105 static void pop_group_info(struct lvfs_run_ctxt *save,
106                            struct group_info *ginfo)
107 {
108         if (!ginfo) {
109                 current_ngroups = save->ngroups;
110         } else {
111                 struct cred *cred;
112                 task_lock(current);
113                 if ((cred = prepare_creds())) {
114                         cred->group_info = save->group_info;
115                         commit_creds(cred);
116                 }
117                 task_unlock(current);
118         }
119 }
120
121 /* push / pop to root of obd store */
122 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
123                struct lvfs_ucred *uc)
124 {
125         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
126         ASSERT_CTXT_MAGIC(new_ctx->magic);
127         OBD_SET_CTXT_MAGIC(save);
128
129         save->fs = get_fs();
130         LASSERT(cfs_atomic_read(&cfs_fs_pwd(current->fs)->d_count));
131         LASSERT(cfs_atomic_read(&new_ctx->pwd->d_count));
132         save->pwd = dget(cfs_fs_pwd(current->fs));
133         save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
134         save->luc.luc_umask = cfs_curproc_umask();
135         save->ngroups = current_cred()->group_info->ngroups;
136
137         LASSERT(save->pwd);
138         LASSERT(save->pwdmnt);
139         LASSERT(new_ctx->pwd);
140         LASSERT(new_ctx->pwdmnt);
141
142         if (uc) {
143                 struct cred *cred;
144                 save->luc.luc_uid = current_uid();
145                 save->luc.luc_gid = current_gid();
146                 save->luc.luc_fsuid = current_fsuid();
147                 save->luc.luc_fsgid = current_fsgid();
148                 save->luc.luc_cap = current_cap();
149
150                 if ((cred = prepare_creds())) {
151                         cred->uid = uc->luc_uid;
152                         cred->gid = uc->luc_gid;
153                         cred->fsuid = uc->luc_fsuid;
154                         cred->fsgid = uc->luc_fsgid;
155                         cred->cap_effective = uc->luc_cap;
156                         commit_creds(cred);
157                 }
158
159                 push_group_info(save,
160                                 uc->luc_ginfo ?:
161                                 uc->luc_identity ? uc->luc_identity->mi_ginfo :
162                                                    NULL);
163         }
164         current->fs->umask = 0; /* umask already applied on client */
165         set_fs(new_ctx->fs);
166         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
167 }
168 EXPORT_SYMBOL(push_ctxt);
169
170 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
171               struct lvfs_ucred *uc)
172 {
173         ASSERT_CTXT_MAGIC(saved->magic);
174         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
175
176         LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
177                  cfs_fs_pwd(current->fs), new_ctx->pwd);
178         LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
179                  cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
180
181         set_fs(saved->fs);
182         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
183
184         dput(saved->pwd);
185         mntput(saved->pwdmnt);
186         current->fs->umask = saved->luc.luc_umask;
187         if (uc) {
188                 struct cred *cred;
189                 if ((cred = prepare_creds())) {
190                         cred->uid = saved->luc.luc_uid;
191                         cred->gid = saved->luc.luc_gid;
192                         cred->fsuid = saved->luc.luc_fsuid;
193                         cred->fsgid = saved->luc.luc_fsgid;
194                         cred->cap_effective = saved->luc.luc_cap;
195                         commit_creds(cred);
196                 }
197
198                 pop_group_info(saved,
199                                uc->luc_ginfo ?:
200                                uc->luc_identity ? uc->luc_identity->mi_ginfo :
201                                                   NULL);
202         }
203 }
204 EXPORT_SYMBOL(pop_ctxt);
205
206 /* utility to make a file */
207 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
208 {
209         struct dentry *dchild;
210         int err = 0;
211         ENTRY;
212
213         // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
214         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
215
216         dchild = ll_lookup_one_len(name, dir, strlen(name));
217         if (IS_ERR(dchild))
218                 GOTO(out_up, dchild);
219
220         if (dchild->d_inode) {
221                 int old_mode = dchild->d_inode->i_mode;
222                 if (!S_ISREG(old_mode))
223                         GOTO(out_err, err = -EEXIST);
224
225                 /* Fixup file permissions if necessary */
226                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
227                         CWARN("fixing permissions on %s from %o to %o\n",
228                               name, old_mode, mode);
229                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
230                                                   (old_mode & ~S_IALLUGO);
231                         mark_inode_dirty(dchild->d_inode);
232                 }
233                 GOTO(out_up, dchild);
234         }
235
236         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
237                             NULL);
238         if (err)
239                 GOTO(out_err, err);
240
241         RETURN(dchild);
242
243 out_err:
244         dput(dchild);
245         dchild = ERR_PTR(err);
246 out_up:
247         return dchild;
248 }
249 EXPORT_SYMBOL(simple_mknod);
250
251 /* utility to make a directory */
252 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
253                             const char *name, int mode, int fix)
254 {
255         struct dentry *dchild;
256         int err = 0;
257         ENTRY;
258
259         // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
260         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
261         dchild = ll_lookup_one_len(name, dir, strlen(name));
262         if (IS_ERR(dchild))
263                 GOTO(out_up, dchild);
264
265         if (dchild->d_inode) {
266                 int old_mode = dchild->d_inode->i_mode;
267                 if (!S_ISDIR(old_mode)) {
268                         CERROR("found %s (%lu/%u) is mode %o\n", name,
269                                dchild->d_inode->i_ino,
270                                dchild->d_inode->i_generation, old_mode);
271                         GOTO(out_err, err = -ENOTDIR);
272                 }
273
274                 /* Fixup directory permissions if necessary */
275                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
276                         CDEBUG(D_CONFIG,
277                                "fixing permissions on %s from %o to %o\n",
278                                name, old_mode, mode);
279                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
280                                                   (old_mode & ~S_IALLUGO);
281                         mark_inode_dirty(dchild->d_inode);
282                 }
283                 GOTO(out_up, dchild);
284         }
285
286         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
287         if (err)
288                 GOTO(out_err, err);
289
290         RETURN(dchild);
291
292 out_err:
293         dput(dchild);
294         dchild = ERR_PTR(err);
295 out_up:
296         return dchild;
297 }
298 EXPORT_SYMBOL(simple_mkdir);
299
300 /* utility to rename a file */
301 int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
302                   char *oldname, char *newname)
303 {
304         struct dentry *dchild_old, *dchild_new;
305         int err = 0;
306         ENTRY;
307
308         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
309         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
310                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
311
312         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
313         if (IS_ERR(dchild_old))
314                 RETURN(PTR_ERR(dchild_old));
315
316         if (!dchild_old->d_inode)
317                 GOTO(put_old, err = -ENOENT);
318
319         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
320         if (IS_ERR(dchild_new))
321                 GOTO(put_old, err = PTR_ERR(dchild_new));
322
323         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
324                             dir->d_inode, dchild_new, mnt);
325
326         dput(dchild_new);
327 put_old:
328         dput(dchild_old);
329         RETURN(err);
330 }
331 EXPORT_SYMBOL(lustre_rename);
332
333 /*
334  * Read a file from within kernel context.  Prior to calling this
335  * function we should already have done a push_ctxt().
336  */
337 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
338 {
339         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
340         if (!file || !file->f_op || !file->f_op->read || !off)
341                 RETURN(-ENOSYS);
342
343         return file->f_op->read(file, buf, len, off);
344 }
345 EXPORT_SYMBOL(lustre_fread);
346
347 /*
348  * Write a file from within kernel context.  Prior to calling this
349  * function we should already have done a push_ctxt().
350  */
351 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
352 {
353         ENTRY;
354         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
355         if (!file)
356                 RETURN(-ENOENT);
357         if (!file->f_op)
358                 RETURN(-ENOSYS);
359         if (!off)
360                 RETURN(-EINVAL);
361
362         if (!file->f_op->write)
363                 RETURN(-EROFS);
364
365         RETURN(file->f_op->write(file, buf, len, off));
366 }
367 EXPORT_SYMBOL(lustre_fwrite);
368
369 /*
370  * Sync a file from within kernel context.  Prior to calling this
371  * function we should already have done a push_ctxt().
372  */
373 int lustre_fsync(struct file *file)
374 {
375         ENTRY;
376         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
377         if (!file || !file->f_op || !file->f_op->fsync)
378                 RETURN(-ENOSYS);
379
380         RETURN(cfs_do_fsync(file, 0));
381 }
382 EXPORT_SYMBOL(lustre_fsync);
383
384 /* Note: dput(dchild) will be called if there is an error */
385 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
386                              int flags)
387 {
388         mntget(ctxt->pwdmnt);
389         return ll_dentry_open(de, ctxt->pwdmnt, flags, current_cred());
390 }
391 EXPORT_SYMBOL(l_dentry_open);
392
393 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
394                      u64 ino, unsigned int d_type)
395 {
396         struct l_linux_dirent *dirent;
397         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
398
399         dirent = buf->lrc_dirent;
400         if (dirent)
401                dirent->lld_off = offset;
402
403         OBD_ALLOC(dirent, sizeof(*dirent));
404
405         if (!dirent)
406                 return -ENOMEM;
407
408         cfs_list_add_tail(&dirent->lld_list, buf->lrc_list);
409
410         buf->lrc_dirent = dirent;
411         dirent->lld_ino = ino;
412         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
413         memcpy(dirent->lld_name, name, namlen);
414
415         return 0;
416 }
417
418 long l_readdir(struct file *file, cfs_list_t *dentry_list)
419 {
420         struct l_linux_dirent *lastdirent;
421         struct l_readdir_callback buf;
422         int error;
423
424         buf.lrc_dirent = NULL;
425         buf.lrc_list = dentry_list;
426
427         error = vfs_readdir(file, l_filldir, &buf);
428         if (error < 0)
429                 return error;
430
431         lastdirent = buf.lrc_dirent;
432         if (lastdirent)
433                 lastdirent->lld_off = file->f_pos;
434
435         return 0;
436 }
437 EXPORT_SYMBOL(l_readdir);
438
439 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
440                  struct iattr *newattrs)
441 {
442         int rc;
443
444         LOCK_INODE_MUTEX(dchild->d_inode);
445 #ifdef HAVE_SECURITY_PLUG
446         rc = notify_change(dchild, mnt, newattrs);
447 #else
448         rc = notify_change(dchild, newattrs);
449 #endif
450         UNLOCK_INODE_MUTEX(dchild->d_inode);
451         return rc;
452 }
453 EXPORT_SYMBOL(l_notify_change);
454
455 /* utility to truncate a file */
456 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
457                  char *name, loff_t length)
458 {
459         struct dentry *dchild;
460         struct iattr newattrs;
461         int err = 0;
462         ENTRY;
463
464         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
465                name, (long long)length);
466         dchild = ll_lookup_one_len(name, dir, strlen(name));
467         if (IS_ERR(dchild))
468                 GOTO(out, err = PTR_ERR(dchild));
469
470         if (dchild->d_inode) {
471                 int old_mode = dchild->d_inode->i_mode;
472                 if (S_ISDIR(old_mode)) {
473                         CERROR("found %s (%lu/%u) is mode %o\n", name,
474                                dchild->d_inode->i_ino,
475                                dchild->d_inode->i_generation, old_mode);
476                         GOTO(out_dput, err = -EISDIR);
477                 }
478
479                 newattrs.ia_size = length;
480                 newattrs.ia_valid = ATTR_SIZE;
481                 err = l_notify_change(mnt, dchild, &newattrs);
482         }
483         EXIT;
484 out_dput:
485         dput(dchild);
486 out:
487         return err;
488 }
489 EXPORT_SYMBOL(simple_truncate);
490
491 #ifdef LUSTRE_KERNEL_VERSION
492 int __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
493 {
494 #ifdef HAVE_DEV_SET_RDONLY
495         if (jdev && (jdev != dev)) {
496                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
497                        (long)jdev);
498                 dev_set_rdonly(jdev);
499         }
500         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
501         dev_set_rdonly(dev);
502
503         return 0;
504 #else
505         CERROR("DEV %lx CANNOT BE SET READONLY\n", (long)dev);
506
507         return -EOPNOTSUPP;
508 #endif
509 }
510 EXPORT_SYMBOL(__lvfs_set_rdonly);
511
512 int lvfs_check_rdonly(lvfs_sbdev_type dev)
513 {
514 #ifdef HAVE_DEV_SET_RDONLY
515         return dev_check_rdonly(dev);
516 #else
517         return 0;
518 #endif
519 }
520 EXPORT_SYMBOL(lvfs_check_rdonly);
521
522 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
523 {
524         char *write_page = NULL;
525         loff_t offset = 0;
526         int rc = 0;
527         ENTRY;
528
529         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
530         if (!write_page)
531                 RETURN(-ENOMEM);
532
533         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
534
535         OBD_FREE(write_page, CFS_PAGE_SIZE);
536
537         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
538         RETURN(rc);
539 }
540 EXPORT_SYMBOL(lvfs_check_io_health);
541 #endif /* LUSTRE_KERNEL_VERSION */
542
543 void obd_update_maxusage()
544 {
545         __u64 max1, max2;
546
547         max1 = obd_pages_sum();
548         max2 = obd_memory_sum();
549
550         cfs_spin_lock(&obd_updatemax_lock);
551         if (max1 > obd_max_pages)
552                 obd_max_pages = max1;
553         if (max2 > obd_max_alloc)
554                 obd_max_alloc = max2;
555         cfs_spin_unlock(&obd_updatemax_lock);
556
557 }
558
559 __u64 obd_memory_max(void)
560 {
561         __u64 ret;
562
563         cfs_spin_lock(&obd_updatemax_lock);
564         ret = obd_max_alloc;
565         cfs_spin_unlock(&obd_updatemax_lock);
566
567         return ret;
568 }
569
570 __u64 obd_pages_max(void)
571 {
572         __u64 ret;
573
574         cfs_spin_lock(&obd_updatemax_lock);
575         ret = obd_max_pages;
576         cfs_spin_unlock(&obd_updatemax_lock);
577
578         return ret;
579 }
580
581 EXPORT_SYMBOL(obd_update_maxusage);
582 EXPORT_SYMBOL(obd_pages_max);
583 EXPORT_SYMBOL(obd_memory_max);
584 EXPORT_SYMBOL(obd_memory);
585
586 #ifdef LPROCFS
587 __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
588                           enum lprocfs_fields_flags field)
589 {
590         __s64 ret = 0;
591         int centry;
592
593         if (!lc)
594                 RETURN(0);
595         do {
596                 centry = cfs_atomic_read(&lc->lc_cntl.la_entry);
597
598                 switch (field) {
599                         case LPROCFS_FIELDS_FLAGS_CONFIG:
600                                 ret = lc->lc_config;
601                                 break;
602                         case LPROCFS_FIELDS_FLAGS_SUM:
603                                 ret = lc->lc_sum + lc->lc_sum_irq;
604                                 break;
605                         case LPROCFS_FIELDS_FLAGS_MIN:
606                                 ret = lc->lc_min;
607                                 break;
608                         case LPROCFS_FIELDS_FLAGS_MAX:
609                                 ret = lc->lc_max;
610                                 break;
611                         case LPROCFS_FIELDS_FLAGS_AVG:
612                                 ret = (lc->lc_max - lc->lc_min)/2;
613                                 break;
614                         case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
615                                 ret = lc->lc_sumsquare;
616                                 break;
617                         case LPROCFS_FIELDS_FLAGS_COUNT:
618                                 ret = lc->lc_count;
619                                 break;
620                         default:
621                                 break;
622                 };
623         } while (centry != cfs_atomic_read(&lc->lc_cntl.la_entry) &&
624                  centry != cfs_atomic_read(&lc->lc_cntl.la_exit));
625
626         RETURN(ret);
627 }
628 EXPORT_SYMBOL(lprocfs_read_helper);
629 #endif /* LPROCFS */
630
631 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
632 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
633 MODULE_LICENSE("GPL");