Whamcloud - gitweb
eac9f9cba35f0c5dce5d31c38ad5fd19263fb395
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #ifndef EXPORT_SYMTAB
42 # define EXPORT_SYMTAB
43 #endif
44
45 #define DEBUG_SUBSYSTEM S_FILTER
46
47 #include <linux/version.h>
48 #include <linux/fs.h>
49 #include <asm/unistd.h>
50 #include <linux/slab.h>
51 #include <linux/pagemap.h>
52 #include <linux/quotaops.h>
53 #include <linux/version.h>
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/module.h>
58 #include <linux/init.h>
59 #include <linux/lustre_compat25.h>
60 #include <lvfs.h>
61 #include "lvfs_internal.h"
62
63 #include <obd.h>
64 #include <lustre_lib.h>
65 #include <lustre_quota.h>
66
67 __u64 obd_max_pages = 0; /* largest obd_pages_sum() value recorded by obd_update_maxusage() */
68 __u64 obd_max_alloc = 0; /* largest obd_memory_sum() value recorded by obd_update_maxusage() */
69 struct lprocfs_stats *obd_memory = NULL; /* exported below; starts out NULL, presumably set up elsewhere */
70 cfs_spinlock_t obd_updatemax_lock = CFS_SPIN_LOCK_UNLOCKED; /* taken around reads and updates of the two maxima above */
71 /* refine later and change to seqlock or similar from libcfs */
72
73 /* Debugging checks only needed during development.  With OBD_CTXT_DEBUG
 * defined, ASSERT_CTXT_MAGIC() asserts that the given magic equals
 * OBD_RUN_CTXT_MAGIC, and the two *_KERNEL_CTXT() macros assert on whether
 * get_fs() and get_ds() compare equal.  Without it all three expand to
 * empty statements. */
74 #ifdef OBD_CTXT_DEBUG
75 # define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
76 # define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
77                                               msg)
78 # define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
79 #else /* !OBD_CTXT_DEBUG: checks compile to nothing */
80 # define ASSERT_CTXT_MAGIC(magic) do {} while(0)
81 # define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
82 # define ASSERT_KERNEL_CTXT(msg) do {} while(0)
83 #endif /* OBD_CTXT_DEBUG */
84
85 static void push_group_info(struct lvfs_run_ctxt *save,
86                             struct group_info *ginfo)
87 {
88         if (!ginfo) {
89                 save->ngroups = current_ngroups;
90                 current_ngroups = 0;
91         } else {
92                 struct cred *cred;
93                 task_lock(current);
94                 save->group_info = current_cred()->group_info;
95                 if ((cred = prepare_creds())) {
96                         cred->group_info = ginfo;
97                         commit_creds(cred);
98                 }
99                 task_unlock(current);
100         }
101 }
102
103 static void pop_group_info(struct lvfs_run_ctxt *save,
104                            struct group_info *ginfo)
105 {
106         if (!ginfo) {
107                 current_ngroups = save->ngroups;
108         } else {
109                 struct cred *cred;
110                 task_lock(current);
111                 if ((cred = prepare_creds())) {
112                         cred->group_info = save->group_info;
113                         commit_creds(cred);
114                 }
115                 task_unlock(current);
116         }
117 }
118
119 /* push / pop to root of obd store */
120 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
121                struct lvfs_ucred *uc)
122 {
123         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
124         ASSERT_CTXT_MAGIC(new_ctx->magic);
125         OBD_SET_CTXT_MAGIC(save);
126
127         save->fs = get_fs();
128         LASSERT(cfs_atomic_read(&cfs_fs_pwd(current->fs)->d_count));
129         LASSERT(cfs_atomic_read(&new_ctx->pwd->d_count));
130         save->pwd = dget(cfs_fs_pwd(current->fs));
131         save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
132         save->luc.luc_umask = cfs_curproc_umask();
133         save->ngroups = current_cred()->group_info->ngroups;
134
135         LASSERT(save->pwd);
136         LASSERT(save->pwdmnt);
137         LASSERT(new_ctx->pwd);
138         LASSERT(new_ctx->pwdmnt);
139
140         if (uc) {
141                 struct cred *cred;
142                 save->luc.luc_uid = current_uid();
143                 save->luc.luc_gid = current_gid();
144                 save->luc.luc_fsuid = current_fsuid();
145                 save->luc.luc_fsgid = current_fsgid();
146                 save->luc.luc_cap = current_cap();
147
148                 if ((cred = prepare_creds())) {
149                         cred->uid = uc->luc_uid;
150                         cred->gid = uc->luc_gid;
151                         cred->fsuid = uc->luc_fsuid;
152                         cred->fsgid = uc->luc_fsgid;
153                         cred->cap_effective = uc->luc_cap;
154                         commit_creds(cred);
155                 }
156
157                 push_group_info(save,
158                                 uc->luc_ginfo ?:
159                                 uc->luc_identity ? uc->luc_identity->mi_ginfo :
160                                                    NULL);
161         }
162         current->fs->umask = 0; /* umask already applied on client */
163         set_fs(new_ctx->fs);
164         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
165 }
166 EXPORT_SYMBOL(push_ctxt);
167
168 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
169               struct lvfs_ucred *uc)
170 {
171         ASSERT_CTXT_MAGIC(saved->magic);
172         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
173
174         LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
175                  cfs_fs_pwd(current->fs), new_ctx->pwd);
176         LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
177                  cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
178
179         set_fs(saved->fs);
180         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
181
182         dput(saved->pwd);
183         mntput(saved->pwdmnt);
184         current->fs->umask = saved->luc.luc_umask;
185         if (uc) {
186                 struct cred *cred;
187                 if ((cred = prepare_creds())) {
188                         cred->uid = saved->luc.luc_uid;
189                         cred->gid = saved->luc.luc_gid;
190                         cred->fsuid = saved->luc.luc_fsuid;
191                         cred->fsgid = saved->luc.luc_fsgid;
192                         cred->cap_effective = saved->luc.luc_cap;
193                         commit_creds(cred);
194                 }
195
196                 pop_group_info(saved,
197                                uc->luc_ginfo ?:
198                                uc->luc_identity ? uc->luc_identity->mi_ginfo :
199                                                   NULL);
200         }
201 }
202 EXPORT_SYMBOL(pop_ctxt);
203
204 /* utility to make a file */
205 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
206 {
207         struct dentry *dchild;
208         int err = 0;
209         ENTRY;
210
211         // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
212         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
213
214         dchild = ll_lookup_one_len(name, dir, strlen(name));
215         if (IS_ERR(dchild))
216                 GOTO(out_up, dchild);
217
218         if (dchild->d_inode) {
219                 int old_mode = dchild->d_inode->i_mode;
220                 if (!S_ISREG(old_mode))
221                         GOTO(out_err, err = -EEXIST);
222
223                 /* Fixup file permissions if necessary */
224                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
225                         CWARN("fixing permissions on %s from %o to %o\n",
226                               name, old_mode, mode);
227                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
228                                                   (old_mode & ~S_IALLUGO);
229                         mark_inode_dirty(dchild->d_inode);
230                 }
231                 GOTO(out_up, dchild);
232         }
233
234         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
235                             NULL);
236         if (err)
237                 GOTO(out_err, err);
238
239         RETURN(dchild);
240
241 out_err:
242         dput(dchild);
243         dchild = ERR_PTR(err);
244 out_up:
245         return dchild;
246 }
247 EXPORT_SYMBOL(simple_mknod);
248
249 /* utility to make a directory */
250 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
251                             const char *name, int mode, int fix)
252 {
253         struct dentry *dchild;
254         int err = 0;
255         ENTRY;
256
257         // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
258         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
259         dchild = ll_lookup_one_len(name, dir, strlen(name));
260         if (IS_ERR(dchild))
261                 GOTO(out_up, dchild);
262
263         if (dchild->d_inode) {
264                 int old_mode = dchild->d_inode->i_mode;
265                 if (!S_ISDIR(old_mode)) {
266                         CERROR("found %s (%lu/%u) is mode %o\n", name,
267                                dchild->d_inode->i_ino,
268                                dchild->d_inode->i_generation, old_mode);
269                         GOTO(out_err, err = -ENOTDIR);
270                 }
271
272                 /* Fixup directory permissions if necessary */
273                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
274                         CDEBUG(D_CONFIG,
275                                "fixing permissions on %s from %o to %o\n",
276                                name, old_mode, mode);
277                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
278                                                   (old_mode & ~S_IALLUGO);
279                         mark_inode_dirty(dchild->d_inode);
280                 }
281                 GOTO(out_up, dchild);
282         }
283
284         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
285         if (err)
286                 GOTO(out_err, err);
287
288         RETURN(dchild);
289
290 out_err:
291         dput(dchild);
292         dchild = ERR_PTR(err);
293 out_up:
294         return dchild;
295 }
296 EXPORT_SYMBOL(simple_mkdir);
297
298 /* utility to rename a file */
299 int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
300                   char *oldname, char *newname)
301 {
302         struct dentry *dchild_old, *dchild_new;
303         int err = 0;
304         ENTRY;
305
306         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
307         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
308                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
309
310         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
311         if (IS_ERR(dchild_old))
312                 RETURN(PTR_ERR(dchild_old));
313
314         if (!dchild_old->d_inode)
315                 GOTO(put_old, err = -ENOENT);
316
317         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
318         if (IS_ERR(dchild_new))
319                 GOTO(put_old, err = PTR_ERR(dchild_new));
320
321         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
322                             dir->d_inode, dchild_new, mnt);
323
324         dput(dchild_new);
325 put_old:
326         dput(dchild_old);
327         RETURN(err);
328 }
329 EXPORT_SYMBOL(lustre_rename);
330
331 /*
332  * Read a file from within kernel context.  Prior to calling this
333  * function we should already have done a push_ctxt().
334  */
335 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
336 {
337         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
338         if (!file || !file->f_op || !file->f_op->read || !off)
339                 RETURN(-ENOSYS);
340
341         return file->f_op->read(file, buf, len, off);
342 }
343 EXPORT_SYMBOL(lustre_fread);
344
345 /*
346  * Write a file from within kernel context.  Prior to calling this
347  * function we should already have done a push_ctxt().
348  */
349 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
350 {
351         ENTRY;
352         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
353         if (!file)
354                 RETURN(-ENOENT);
355         if (!file->f_op)
356                 RETURN(-ENOSYS);
357         if (!off)
358                 RETURN(-EINVAL);
359
360         if (!file->f_op->write)
361                 RETURN(-EROFS);
362
363         RETURN(file->f_op->write(file, buf, len, off));
364 }
365 EXPORT_SYMBOL(lustre_fwrite);
366
367 /*
368  * Sync a file from within kernel context.  Prior to calling this
369  * function we should already have done a push_ctxt().
370  */
371 int lustre_fsync(struct file *file)
372 {
373         ENTRY;
374         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
375         if (!file || !file->f_op || !file->f_op->fsync)
376                 RETURN(-ENOSYS);
377
378         RETURN(cfs_do_fsync(file, 0));
379 }
380 EXPORT_SYMBOL(lustre_fsync);
381
382 /* Note: dput(dchild) will be called if there is an error */
383 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
384                              int flags)
385 {
386         mntget(ctxt->pwdmnt);
387         return ll_dentry_open(de, ctxt->pwdmnt, flags, current_cred());
388 }
389 EXPORT_SYMBOL(l_dentry_open);
390
391 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
392                      u64 ino, unsigned int d_type)
393 {
394         struct l_linux_dirent *dirent;
395         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
396
397         dirent = buf->lrc_dirent;
398         if (dirent)
399                dirent->lld_off = offset;
400
401         OBD_ALLOC(dirent, sizeof(*dirent));
402
403         if (!dirent)
404                 return -ENOMEM;
405
406         cfs_list_add_tail(&dirent->lld_list, buf->lrc_list);
407
408         buf->lrc_dirent = dirent;
409         dirent->lld_ino = ino;
410         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
411         memcpy(dirent->lld_name, name, namlen);
412
413         return 0;
414 }
415
416 long l_readdir(struct file *file, cfs_list_t *dentry_list)
417 {
418         struct l_linux_dirent *lastdirent;
419         struct l_readdir_callback buf;
420         int error;
421
422         buf.lrc_dirent = NULL;
423         buf.lrc_list = dentry_list;
424
425         error = vfs_readdir(file, l_filldir, &buf);
426         if (error < 0)
427                 return error;
428
429         lastdirent = buf.lrc_dirent;
430         if (lastdirent)
431                 lastdirent->lld_off = file->f_pos;
432
433         return 0;
434 }
435 EXPORT_SYMBOL(l_readdir);
436
437 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
438                  struct iattr *newattrs)
439 {
440         int rc;
441
442         LOCK_INODE_MUTEX(dchild->d_inode);
443 #ifdef HAVE_SECURITY_PLUG
444         rc = notify_change(dchild, mnt, newattrs);
445 #else
446         rc = notify_change(dchild, newattrs);
447 #endif
448         UNLOCK_INODE_MUTEX(dchild->d_inode);
449         return rc;
450 }
451 EXPORT_SYMBOL(l_notify_change);
452
453 /* utility to truncate a file */
454 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
455                  char *name, loff_t length)
456 {
457         struct dentry *dchild;
458         struct iattr newattrs;
459         int err = 0;
460         ENTRY;
461
462         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
463                name, (long long)length);
464         dchild = ll_lookup_one_len(name, dir, strlen(name));
465         if (IS_ERR(dchild))
466                 GOTO(out, err = PTR_ERR(dchild));
467
468         if (dchild->d_inode) {
469                 int old_mode = dchild->d_inode->i_mode;
470                 if (S_ISDIR(old_mode)) {
471                         CERROR("found %s (%lu/%u) is mode %o\n", name,
472                                dchild->d_inode->i_ino,
473                                dchild->d_inode->i_generation, old_mode);
474                         GOTO(out_dput, err = -EISDIR);
475                 }
476
477                 newattrs.ia_size = length;
478                 newattrs.ia_valid = ATTR_SIZE;
479                 err = l_notify_change(mnt, dchild, &newattrs);
480         }
481         EXIT;
482 out_dput:
483         dput(dchild);
484 out:
485         return err;
486 }
487 EXPORT_SYMBOL(simple_truncate);
488
489 #ifdef LUSTRE_KERNEL_VERSION
490 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
491 #error rdonly patchset must be updated [cfs bz11248]
492 #endif
493 void dev_set_rdonly(lvfs_sbdev_type dev);
494 int dev_check_rdonly(lvfs_sbdev_type dev);
495
496 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
497 {
498         if (jdev && (jdev != dev)) {
499                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
500                        (long)jdev);
501                 dev_set_rdonly(jdev);
502         }
503         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
504         dev_set_rdonly(dev);
505 }
506
507 int lvfs_check_rdonly(lvfs_sbdev_type dev)
508 {
509         return dev_check_rdonly(dev);
510 }
511
512 EXPORT_SYMBOL(__lvfs_set_rdonly);
513 EXPORT_SYMBOL(lvfs_check_rdonly);
514
515 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
516 {
517         char *write_page = NULL;
518         loff_t offset = 0;
519         int rc = 0;
520         ENTRY;
521
522         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
523         if (!write_page)
524                 RETURN(-ENOMEM);
525
526         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
527
528         OBD_FREE(write_page, CFS_PAGE_SIZE);
529
530         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
531         RETURN(rc);
532 }
533 EXPORT_SYMBOL(lvfs_check_io_health);
534 #endif /* LUSTRE_KERNEL_VERSION */
535
536 void obd_update_maxusage()
537 {
538         __u64 max1, max2;
539
540         max1 = obd_pages_sum();
541         max2 = obd_memory_sum();
542
543         cfs_spin_lock(&obd_updatemax_lock);
544         if (max1 > obd_max_pages)
545                 obd_max_pages = max1;
546         if (max2 > obd_max_alloc)
547                 obd_max_alloc = max2;
548         cfs_spin_unlock(&obd_updatemax_lock);
549
550 }
551
552 __u64 obd_memory_max(void)
553 {
554         __u64 ret;
555
556         cfs_spin_lock(&obd_updatemax_lock);
557         ret = obd_max_alloc;
558         cfs_spin_unlock(&obd_updatemax_lock);
559
560         return ret;
561 }
562
563 __u64 obd_pages_max(void)
564 {
565         __u64 ret;
566
567         cfs_spin_lock(&obd_updatemax_lock);
568         ret = obd_max_pages;
569         cfs_spin_unlock(&obd_updatemax_lock);
570
571         return ret;
572 }
573
574 EXPORT_SYMBOL(obd_update_maxusage);
575 EXPORT_SYMBOL(obd_pages_max);
576 EXPORT_SYMBOL(obd_memory_max);
577 EXPORT_SYMBOL(obd_memory);
578
579 #ifdef LPROCFS
580 __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
581                           enum lprocfs_fields_flags field)
582 {
583         __s64 ret = 0;
584         int centry;
585
586         if (!lc)
587                 RETURN(0);
588         do {
589                 centry = cfs_atomic_read(&lc->lc_cntl.la_entry);
590
591                 switch (field) {
592                         case LPROCFS_FIELDS_FLAGS_CONFIG:
593                                 ret = lc->lc_config;
594                                 break;
595                         case LPROCFS_FIELDS_FLAGS_SUM:
596                                 ret = lc->lc_sum + lc->lc_sum_irq;
597                                 break;
598                         case LPROCFS_FIELDS_FLAGS_MIN:
599                                 ret = lc->lc_min;
600                                 break;
601                         case LPROCFS_FIELDS_FLAGS_MAX:
602                                 ret = lc->lc_max;
603                                 break;
604                         case LPROCFS_FIELDS_FLAGS_AVG:
605                                 ret = (lc->lc_max - lc->lc_min)/2;
606                                 break;
607                         case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
608                                 ret = lc->lc_sumsquare;
609                                 break;
610                         case LPROCFS_FIELDS_FLAGS_COUNT:
611                                 ret = lc->lc_count;
612                                 break;
613                         default:
614                                 break;
615                 };
616         } while (centry != cfs_atomic_read(&lc->lc_cntl.la_entry) &&
617                  centry != cfs_atomic_read(&lc->lc_cntl.la_exit));
618
619         RETURN(ret);
620 }
621 EXPORT_SYMBOL(lprocfs_read_helper);
622 #endif /* LPROCFS */
623
624 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
625 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
626 MODULE_LICENSE("GPL");