Whamcloud - gitweb
5f1274c9b889aa51f04e2e2c69130276079f48e5
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_FILTER
42
43 #include <linux/version.h>
44 #include <linux/fs.h>
45 #include <asm/unistd.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <linux/quotaops.h>
49 #include <linux/version.h>
50 #include <libcfs/libcfs.h>
51 #include <lustre_fsfilt.h>
52 #include <obd.h>
53 #include <linux/module.h>
54 #include <linux/init.h>
55 #include <linux/lustre_compat25.h>
56 #include <lvfs.h>
57 #include "lvfs_internal.h"
58
59 #include <obd.h>
60 #include <lustre_lib.h>
61
62 __u64 obd_max_pages = 0;
63 __u64 obd_max_alloc = 0;
64 struct lprocfs_stats *obd_memory = NULL;
65 EXPORT_SYMBOL(obd_memory);
66 DEFINE_SPINLOCK(obd_updatemax_lock);
/* refine later and change to seqlock or similar from libcfs */
68
/*
 * Context sanity checks.  They expand to real assertions only when
 * OBD_CTXT_DEBUG is defined (development builds); otherwise each one is an
 * empty statement.
 */
#ifdef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic)                                            \
        LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg)                                         \
        LASSERTF(!segment_eq(get_fs(), get_ds()), msg)
# define ASSERT_KERNEL_CTXT(msg)                                             \
        LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#else
# define ASSERT_CTXT_MAGIC(magic)       do {} while (0)
# define ASSERT_NOT_KERNEL_CTXT(msg)    do {} while (0)
# define ASSERT_KERNEL_CTXT(msg)        do {} while (0)
#endif
80
81 static void push_group_info(struct lvfs_run_ctxt *save,
82                             struct group_info *ginfo)
83 {
84         if (!ginfo) {
85                 save->ngroups = current_ngroups;
86                 current_ngroups = 0;
87         } else {
88                 struct cred *cred;
89                 task_lock(current);
90                 save->group_info = current_cred()->group_info;
91                 if ((cred = prepare_creds())) {
92                         cred->group_info = ginfo;
93                         commit_creds(cred);
94                 }
95                 task_unlock(current);
96         }
97 }
98
99 static void pop_group_info(struct lvfs_run_ctxt *save,
100                            struct group_info *ginfo)
101 {
102         if (!ginfo) {
103                 current_ngroups = save->ngroups;
104         } else {
105                 struct cred *cred;
106                 task_lock(current);
107                 if ((cred = prepare_creds())) {
108                         cred->group_info = save->group_info;
109                         commit_creds(cred);
110                 }
111                 task_unlock(current);
112         }
113 }
114
115 /* push / pop to root of obd store */
116 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
117                struct lvfs_ucred *uc)
118 {
119         /* if there is underlaying dt_device then push_ctxt is not needed */
120         if (new_ctx->dt != NULL)
121                 return;
122
123         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
124         ASSERT_CTXT_MAGIC(new_ctx->magic);
125         OBD_SET_CTXT_MAGIC(save);
126
127         save->fs = get_fs();
128         LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
129         LASSERT(d_refcount(new_ctx->pwd));
130         save->pwd = dget(cfs_fs_pwd(current->fs));
131         save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
132         save->luc.luc_umask = cfs_curproc_umask();
133         save->ngroups = current_cred()->group_info->ngroups;
134
135         LASSERT(save->pwd);
136         LASSERT(save->pwdmnt);
137         LASSERT(new_ctx->pwd);
138         LASSERT(new_ctx->pwdmnt);
139
140         if (uc) {
141                 struct cred *cred;
142                 save->luc.luc_uid = current_uid();
143                 save->luc.luc_gid = current_gid();
144                 save->luc.luc_fsuid = current_fsuid();
145                 save->luc.luc_fsgid = current_fsgid();
146                 save->luc.luc_cap = current_cap();
147
148                 if ((cred = prepare_creds())) {
149                         cred->uid = uc->luc_uid;
150                         cred->gid = uc->luc_gid;
151                         cred->fsuid = uc->luc_fsuid;
152                         cred->fsgid = uc->luc_fsgid;
153                         cred->cap_effective = uc->luc_cap;
154                         commit_creds(cred);
155                 }
156
157                 push_group_info(save,
158                                 uc->luc_ginfo ?:
159                                 uc->luc_identity ? uc->luc_identity->mi_ginfo :
160                                                    NULL);
161         }
162         current->fs->umask = 0; /* umask already applied on client */
163         set_fs(new_ctx->fs);
164         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
165 }
166 EXPORT_SYMBOL(push_ctxt);
167
168 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
169               struct lvfs_ucred *uc)
170 {
171         /* if there is underlaying dt_device then pop_ctxt is not needed */
172         if (new_ctx->dt != NULL)
173                 return;
174
175         ASSERT_CTXT_MAGIC(saved->magic);
176         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
177
178         LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
179                  cfs_fs_pwd(current->fs), new_ctx->pwd);
180         LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
181                  cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
182
183         set_fs(saved->fs);
184         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
185
186         dput(saved->pwd);
187         mntput(saved->pwdmnt);
188         current->fs->umask = saved->luc.luc_umask;
189         if (uc) {
190                 struct cred *cred;
191                 if ((cred = prepare_creds())) {
192                         cred->uid = saved->luc.luc_uid;
193                         cred->gid = saved->luc.luc_gid;
194                         cred->fsuid = saved->luc.luc_fsuid;
195                         cred->fsgid = saved->luc.luc_fsgid;
196                         cred->cap_effective = saved->luc.luc_cap;
197                         commit_creds(cred);
198                 }
199
200                 pop_group_info(saved,
201                                uc->luc_ginfo ?:
202                                uc->luc_identity ? uc->luc_identity->mi_ginfo :
203                                                   NULL);
204         }
205 }
206 EXPORT_SYMBOL(pop_ctxt);
207
208 /* utility to make a file */
209 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
210 {
211         struct dentry *dchild;
212         int err = 0;
213         ENTRY;
214
215         // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
216         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
217
218         dchild = ll_lookup_one_len(name, dir, strlen(name));
219         if (IS_ERR(dchild))
220                 GOTO(out_up, dchild);
221
222         if (dchild->d_inode) {
223                 int old_mode = dchild->d_inode->i_mode;
224                 if (!S_ISREG(old_mode))
225                         GOTO(out_err, err = -EEXIST);
226
227                 /* Fixup file permissions if necessary */
228                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
229                         CWARN("fixing permissions on %s from %o to %o\n",
230                               name, old_mode, mode);
231                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
232                                                   (old_mode & ~S_IALLUGO);
233                         mark_inode_dirty(dchild->d_inode);
234                 }
235                 GOTO(out_up, dchild);
236         }
237
238         err = vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
239                             NULL);
240         if (err)
241                 GOTO(out_err, err);
242
243         RETURN(dchild);
244
245 out_err:
246         dput(dchild);
247         dchild = ERR_PTR(err);
248 out_up:
249         return dchild;
250 }
251 EXPORT_SYMBOL(simple_mknod);
252
253 /* utility to make a directory */
254 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
255                             const char *name, int mode, int fix)
256 {
257         struct dentry *dchild;
258         int err = 0;
259         ENTRY;
260
261         // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
262         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
263         dchild = ll_lookup_one_len(name, dir, strlen(name));
264         if (IS_ERR(dchild))
265                 GOTO(out_up, dchild);
266
267         if (dchild->d_inode) {
268                 int old_mode = dchild->d_inode->i_mode;
269                 if (!S_ISDIR(old_mode)) {
270                         CERROR("found %s (%lu/%u) is mode %o\n", name,
271                                dchild->d_inode->i_ino,
272                                dchild->d_inode->i_generation, old_mode);
273                         GOTO(out_err, err = -ENOTDIR);
274                 }
275
276                 /* Fixup directory permissions if necessary */
277                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
278                         CDEBUG(D_CONFIG,
279                                "fixing permissions on %s from %o to %o\n",
280                                name, old_mode, mode);
281                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
282                                                   (old_mode & ~S_IALLUGO);
283                         mark_inode_dirty(dchild->d_inode);
284                 }
285                 GOTO(out_up, dchild);
286         }
287
288         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
289         if (err)
290                 GOTO(out_err, err);
291
292         RETURN(dchild);
293
294 out_err:
295         dput(dchild);
296         dchild = ERR_PTR(err);
297 out_up:
298         return dchild;
299 }
300 EXPORT_SYMBOL(simple_mkdir);
301
302 /* utility to rename a file */
303 int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
304                   char *oldname, char *newname)
305 {
306         struct dentry *dchild_old, *dchild_new;
307         int err = 0;
308         ENTRY;
309
310         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
311         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
312                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
313
314         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
315         if (IS_ERR(dchild_old))
316                 RETURN(PTR_ERR(dchild_old));
317
318         if (!dchild_old->d_inode)
319                 GOTO(put_old, err = -ENOENT);
320
321         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
322         if (IS_ERR(dchild_new))
323                 GOTO(put_old, err = PTR_ERR(dchild_new));
324
325         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
326                             dir->d_inode, dchild_new, mnt);
327
328         dput(dchild_new);
329 put_old:
330         dput(dchild_old);
331         RETURN(err);
332 }
333 EXPORT_SYMBOL(lustre_rename);
334
335 /*
336  * Read a file from within kernel context.  Prior to calling this
337  * function we should already have done a push_ctxt().
338  */
339 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
340 {
341         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
342         if (!file || !file->f_op || !file->f_op->read || !off)
343                 RETURN(-ENOSYS);
344
345         return file->f_op->read(file, buf, len, off);
346 }
347 EXPORT_SYMBOL(lustre_fread);
348
349 /*
350  * Write a file from within kernel context.  Prior to calling this
351  * function we should already have done a push_ctxt().
352  */
353 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
354 {
355         ENTRY;
356         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
357         if (!file)
358                 RETURN(-ENOENT);
359         if (!file->f_op)
360                 RETURN(-ENOSYS);
361         if (!off)
362                 RETURN(-EINVAL);
363
364         if (!file->f_op->write)
365                 RETURN(-EROFS);
366
367         RETURN(file->f_op->write(file, buf, len, off));
368 }
369 EXPORT_SYMBOL(lustre_fwrite);
370
371 /*
372  * Sync a file from within kernel context.  Prior to calling this
373  * function we should already have done a push_ctxt().
374  */
375 int lustre_fsync(struct file *file)
376 {
377         ENTRY;
378         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
379         if (!file || !file->f_op || !file->f_op->fsync)
380                 RETURN(-ENOSYS);
381
382         RETURN(cfs_do_fsync(file, 0));
383 }
384 EXPORT_SYMBOL(lustre_fsync);
385
386 /* Note: dput(dchild) will be called if there is an error */
387 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
388                              int flags)
389 {
390         mntget(ctxt->pwdmnt);
391         return ll_dentry_open(de, ctxt->pwdmnt, flags, current_cred());
392 }
393 EXPORT_SYMBOL(l_dentry_open);
394
395 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
396                      u64 ino, unsigned int d_type)
397 {
398         struct l_linux_dirent *dirent;
399         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
400
401         dirent = buf->lrc_dirent;
402         if (dirent)
403                dirent->lld_off = offset;
404
405         OBD_ALLOC(dirent, sizeof(*dirent));
406
407         if (!dirent)
408                 return -ENOMEM;
409
410         cfs_list_add_tail(&dirent->lld_list, buf->lrc_list);
411
412         buf->lrc_dirent = dirent;
413         dirent->lld_ino = ino;
414         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
415         memcpy(dirent->lld_name, name, namlen);
416
417         return 0;
418 }
419
420 long l_readdir(struct file *file, cfs_list_t *dentry_list)
421 {
422         struct l_linux_dirent *lastdirent;
423         struct l_readdir_callback buf;
424         int error;
425
426         buf.lrc_dirent = NULL;
427         buf.lrc_list = dentry_list;
428
429         error = vfs_readdir(file, l_filldir, &buf);
430         if (error < 0)
431                 return error;
432
433         lastdirent = buf.lrc_dirent;
434         if (lastdirent)
435                 lastdirent->lld_off = file->f_pos;
436
437         return 0;
438 }
439 EXPORT_SYMBOL(l_readdir);
440
441 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
442                     struct iattr *newattrs)
443 {
444         int rc;
445
446         mutex_lock(&dchild->d_inode->i_mutex);
447 #ifdef HAVE_SECURITY_PLUG
448         rc = notify_change(dchild, mnt, newattrs);
449 #else
450         rc = notify_change(dchild, newattrs);
451 #endif
452         mutex_unlock(&dchild->d_inode->i_mutex);
453         return rc;
454 }
455 EXPORT_SYMBOL(l_notify_change);
456
457 /* utility to truncate a file */
458 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
459                  char *name, loff_t length)
460 {
461         struct dentry *dchild;
462         struct iattr newattrs;
463         int err = 0;
464         ENTRY;
465
466         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
467                name, (long long)length);
468         dchild = ll_lookup_one_len(name, dir, strlen(name));
469         if (IS_ERR(dchild))
470                 GOTO(out, err = PTR_ERR(dchild));
471
472         if (dchild->d_inode) {
473                 int old_mode = dchild->d_inode->i_mode;
474                 if (S_ISDIR(old_mode)) {
475                         CERROR("found %s (%lu/%u) is mode %o\n", name,
476                                dchild->d_inode->i_ino,
477                                dchild->d_inode->i_generation, old_mode);
478                         GOTO(out_dput, err = -EISDIR);
479                 }
480
481                 newattrs.ia_size = length;
482                 newattrs.ia_valid = ATTR_SIZE;
483                 err = l_notify_change(mnt, dchild, &newattrs);
484         }
485         EXIT;
486 out_dput:
487         dput(dchild);
488 out:
489         return err;
490 }
491 EXPORT_SYMBOL(simple_truncate);
492
493 int __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
494 {
495 #ifdef HAVE_DEV_SET_RDONLY
496         if (jdev && (jdev != dev)) {
497                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
498                        (long)jdev);
499                 dev_set_rdonly(jdev);
500         }
501         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
502         dev_set_rdonly(dev);
503
504         return 0;
505 #else
506         CERROR("DEV %lx CANNOT BE SET READONLY\n", (long)dev);
507
508         return -EOPNOTSUPP;
509 #endif
510 }
511 EXPORT_SYMBOL(__lvfs_set_rdonly);
512
513 int lvfs_check_rdonly(lvfs_sbdev_type dev)
514 {
515 #ifdef HAVE_DEV_SET_RDONLY
516         return dev_check_rdonly(dev);
517 #else
518         return 0;
519 #endif
520 }
521 EXPORT_SYMBOL(lvfs_check_rdonly);
522
523 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
524 {
525         char *write_page = NULL;
526         loff_t offset = 0;
527         int rc = 0;
528         ENTRY;
529
530         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
531         if (!write_page)
532                 RETURN(-ENOMEM);
533
534         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
535
536         OBD_FREE(write_page, CFS_PAGE_SIZE);
537
538         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
539         RETURN(rc);
540 }
541 EXPORT_SYMBOL(lvfs_check_io_health);
542
543 void obd_update_maxusage()
544 {
545         __u64 max1, max2;
546
547         max1 = obd_pages_sum();
548         max2 = obd_memory_sum();
549
550         spin_lock(&obd_updatemax_lock);
551         if (max1 > obd_max_pages)
552                 obd_max_pages = max1;
553         if (max2 > obd_max_alloc)
554                 obd_max_alloc = max2;
555         spin_unlock(&obd_updatemax_lock);
556
557 }
558 EXPORT_SYMBOL(obd_update_maxusage);
559
560 __u64 obd_memory_max(void)
561 {
562         __u64 ret;
563
564         spin_lock(&obd_updatemax_lock);
565         ret = obd_max_alloc;
566         spin_unlock(&obd_updatemax_lock);
567
568         return ret;
569 }
570 EXPORT_SYMBOL(obd_memory_max);
571
572 __u64 obd_pages_max(void)
573 {
574         __u64 ret;
575
576         spin_lock(&obd_updatemax_lock);
577         ret = obd_max_pages;
578         spin_unlock(&obd_updatemax_lock);
579
580         return ret;
581 }
582 EXPORT_SYMBOL(obd_pages_max);
583
#ifdef LPROCFS
/*
 * Read one field of counter lc, selected by field.
 *
 * The field is read between two samples of lc->lc_cntl.la_entry; the read
 * is repeated while la_entry has changed since the first sample and that
 * first sample also differs from la_exit.
 *
 * Returns the selected value; 0 when lc is NULL or field is not one of the
 * handled flags.
 *
 * NOTE(review): the AVG case yields (lc_max - lc_min) / 2, i.e. half the
 * range rather than a mean - confirm this is what callers expect.
 */
__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
                          enum lprocfs_fields_flags field)
{
        __s64 value = 0;
        int seen;

        if (lc == NULL)
                RETURN(0);

        do {
                seen = cfs_atomic_read(&lc->lc_cntl.la_entry);

                switch (field) {
                case LPROCFS_FIELDS_FLAGS_CONFIG:
                        value = lc->lc_config;
                        break;
                case LPROCFS_FIELDS_FLAGS_SUM:
                        value = lc->lc_sum + lc->lc_sum_irq;
                        break;
                case LPROCFS_FIELDS_FLAGS_MIN:
                        value = lc->lc_min;
                        break;
                case LPROCFS_FIELDS_FLAGS_MAX:
                        value = lc->lc_max;
                        break;
                case LPROCFS_FIELDS_FLAGS_AVG:
                        value = (lc->lc_max - lc->lc_min)/2;
                        break;
                case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
                        value = lc->lc_sumsquare;
                        break;
                case LPROCFS_FIELDS_FLAGS_COUNT:
                        value = lc->lc_count;
                        break;
                default:
                        break;
                }
        } while (seen != cfs_atomic_read(&lc->lc_cntl.la_entry) &&
                 seen != cfs_atomic_read(&lc->lc_cntl.la_exit));

        RETURN(value);
}
EXPORT_SYMBOL(lprocfs_read_helper);
#endif /* LPROCFS */
628
629 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
630 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
631 MODULE_LICENSE("GPL");