Whamcloud - gitweb
b=21571 stacksize and locking fixes for loadgen patch from umka
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #ifndef EXPORT_SYMTAB
42 # define EXPORT_SYMTAB
43 #endif
44
45 #define DEBUG_SUBSYSTEM S_FILTER
46
47 #include <linux/version.h>
48 #include <linux/fs.h>
49 #include <asm/unistd.h>
50 #include <linux/slab.h>
51 #include <linux/pagemap.h>
52 #include <linux/quotaops.h>
53 #include <linux/version.h>
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/module.h>
58 #include <linux/init.h>
59 #include <linux/lustre_compat25.h>
60 #include <lvfs.h>
61 #include "lvfs_internal.h"
62
63 #include <obd.h>
64 #include <lustre_lib.h>
65 #include <lustre_quota.h>
66
/* Largest obd_pages_sum() value seen so far; see obd_update_maxusage() */
__u64 obd_max_pages = 0;
/* Largest obd_memory_sum() value seen so far; see obd_update_maxusage() */
__u64 obd_max_alloc = 0;
/* Statistics handle exported to other modules; presumably the memory-usage
 * counters -- it is only defined here, not initialized or used */
struct lprocfs_stats *obd_memory = NULL;
/* Protects reads and updates of obd_max_pages and obd_max_alloc */
spinlock_t obd_updatemax_lock = SPIN_LOCK_UNLOCKED;
/* refine later and change to seqlock or similar from libcfs */
72
/*
 * Debugging check only needed during development.
 *
 * With OBD_CTXT_DEBUG defined:
 *  - ASSERT_CTXT_MAGIC() asserts that a run context carries
 *    OBD_RUN_CTXT_MAGIC;
 *  - ASSERT_NOT_KERNEL_CTXT() asserts that get_fs() differs from get_ds();
 *  - ASSERT_KERNEL_CTXT() asserts that get_fs() equals get_ds().
 * Without it, all three expand to empty statements.
 */
#ifdef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
                                              msg)
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#else
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#endif
84
85 static void push_group_info(struct lvfs_run_ctxt *save,
86                             struct group_info *ginfo)
87 {
88         if (!ginfo) {
89                 save->ngroups = current_ngroups;
90                 current_ngroups = 0;
91         } else {
92                 task_lock(current);
93                 save->group_info = current->group_info;
94                 current->group_info = ginfo;
95                 task_unlock(current);
96         }
97 }
98
99 static void pop_group_info(struct lvfs_run_ctxt *save,
100                            struct group_info *ginfo)
101 {
102         if (!ginfo) {
103                 current_ngroups = save->ngroups;
104         } else {
105                 task_lock(current);
106                 current->group_info = save->group_info;
107                 task_unlock(current);
108         }
109 }
110
/*
 * push / pop to root of obd store
 *
 * push_ctxt() records the state of the current task in save and then
 * switches the task over to new_ctx:
 *  - the get_fs() value, the working directory dentry and mount (one extra
 *    reference is taken on each), the umask and the group count are saved;
 *  - if uc is non-NULL, uid/gid/fsuid/fsgid and the effective capabilities
 *    are saved and replaced by the values in uc, and the group info is
 *    switched through push_group_info();
 *  - the umask is cleared, set_fs(new_ctx->fs) is applied and the working
 *    directory becomes new_ctx->pwdmnt / new_ctx->pwd.
 *
 * The saved state is put back by pop_ctxt(), which should be given the same
 * save, new_ctx and uc.
 */
void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
               struct lvfs_ucred *uc)
{
        //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
        ASSERT_CTXT_MAGIC(new_ctx->magic);
        OBD_SET_CTXT_MAGIC(save);

        save->fs = get_fs();
        /* both the current and the new working directory must be in use */
        LASSERT(atomic_read(&cfs_fs_pwd(current->fs)->d_count));
        LASSERT(atomic_read(&new_ctx->pwd->d_count));
        /* references taken here are dropped again in pop_ctxt() */
        save->pwd = dget(cfs_fs_pwd(current->fs));
        save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
        save->luc.luc_umask = current->fs->umask;
        /* saved unconditionally; push_group_info() stores it again when no
         * replacement group info is supplied */
        save->ngroups = current->group_info->ngroups;

        LASSERT(save->pwd);
        LASSERT(save->pwdmnt);
        LASSERT(new_ctx->pwd);
        LASSERT(new_ctx->pwdmnt);

        if (uc) {
                /* remember the caller's credentials ... */
                save->luc.luc_uid = current->uid;
                save->luc.luc_gid = current->gid;
                save->luc.luc_fsuid = current->fsuid;
                save->luc.luc_fsgid = current->fsgid;
                save->luc.luc_cap = current->cap_effective;

                /* ... and run with the ones supplied in uc */
                current->uid = uc->luc_uid;
                current->gid = uc->luc_gid;
                current->fsuid = uc->luc_fsuid;
                current->fsgid = uc->luc_fsgid;
                current->cap_effective = uc->luc_cap;

                /* group info: prefer uc->luc_ginfo, then the identity's
                 * mi_ginfo, otherwise NULL */
                push_group_info(save,
                                uc->luc_ginfo ?:
                                uc->luc_identity ? uc->luc_identity->mi_ginfo :
                                                   NULL);
        }
        current->fs->umask = 0; /* umask already applied on client */
        set_fs(new_ctx->fs);
        ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
}
EXPORT_SYMBOL(push_ctxt);
155
/*
 * Restore the task state that push_ctxt() stored in saved.
 *
 * Asserts that the task's working directory and mount are still the ones
 * of new_ctx, then restores the saved get_fs() value and working
 * directory, drops the dentry/mount references held in saved, and restores
 * the umask.  If uc is non-NULL the saved uid/gid/fsuid/fsgid, effective
 * capabilities and group info are restored as well.
 */
void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
              struct lvfs_ucred *uc)
{
        ASSERT_CTXT_MAGIC(saved->magic);
        ASSERT_KERNEL_CTXT("popping non-kernel context!\n");

        /* the working directory must not have changed since push_ctxt() */
        LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
                 cfs_fs_pwd(current->fs), new_ctx->pwd);
        LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
                 cfs_fs_mnt(current->fs), new_ctx->pwdmnt);

        set_fs(saved->fs);
        ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);

        /* release the references held in saved; only after the working
         * directory has been switched back */
        dput(saved->pwd);
        mntput(saved->pwdmnt);
        current->fs->umask = saved->luc.luc_umask;
        if (uc) {
                current->uid = saved->luc.luc_uid;
                current->gid = saved->luc.luc_gid;
                current->fsuid = saved->luc.luc_fsuid;
                current->fsgid = saved->luc.luc_fsgid;
                current->cap_effective = saved->luc.luc_cap;
                /* same selection expression as in push_ctxt(), so that
                 * pop_group_info() takes the matching branch */
                pop_group_info(saved,
                               uc->luc_ginfo ?:
                               uc->luc_identity ? uc->luc_identity->mi_ginfo :
                                                  NULL);
        }
}
EXPORT_SYMBOL(pop_ctxt);
186
187 /* utility to make a file */
188 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
189 {
190         struct dentry *dchild;
191         int err = 0;
192         ENTRY;
193
194         // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
195         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
196
197         dchild = ll_lookup_one_len(name, dir, strlen(name));
198         if (IS_ERR(dchild))
199                 GOTO(out_up, dchild);
200
201         if (dchild->d_inode) {
202                 int old_mode = dchild->d_inode->i_mode;
203                 if (!S_ISREG(old_mode))
204                         GOTO(out_err, err = -EEXIST);
205
206                 /* Fixup file permissions if necessary */
207                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
208                         CWARN("fixing permissions on %s from %o to %o\n",
209                               name, old_mode, mode);
210                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
211                                                   (old_mode & ~S_IALLUGO);
212                         mark_inode_dirty(dchild->d_inode);
213                 }
214                 GOTO(out_up, dchild);
215         }
216
217         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
218                             NULL);
219         if (err)
220                 GOTO(out_err, err);
221
222         RETURN(dchild);
223
224 out_err:
225         dput(dchild);
226         dchild = ERR_PTR(err);
227 out_up:
228         return dchild;
229 }
230 EXPORT_SYMBOL(simple_mknod);
231
232 /* utility to make a directory */
233 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
234                             const char *name, int mode, int fix)
235 {
236         struct dentry *dchild;
237         int err = 0;
238         ENTRY;
239
240         // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
241         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
242         dchild = ll_lookup_one_len(name, dir, strlen(name));
243         if (IS_ERR(dchild))
244                 GOTO(out_up, dchild);
245
246         if (dchild->d_inode) {
247                 int old_mode = dchild->d_inode->i_mode;
248                 if (!S_ISDIR(old_mode)) {
249                         CERROR("found %s (%lu/%u) is mode %o\n", name,
250                                dchild->d_inode->i_ino,
251                                dchild->d_inode->i_generation, old_mode);
252                         GOTO(out_err, err = -ENOTDIR);
253                 }
254
255                 /* Fixup directory permissions if necessary */
256                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
257                         CDEBUG(D_CONFIG,
258                                "fixing permissions on %s from %o to %o\n",
259                                name, old_mode, mode);
260                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
261                                                   (old_mode & ~S_IALLUGO);
262                         mark_inode_dirty(dchild->d_inode);
263                 }
264                 GOTO(out_up, dchild);
265         }
266
267         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
268         if (err)
269                 GOTO(out_err, err);
270
271         RETURN(dchild);
272
273 out_err:
274         dput(dchild);
275         dchild = ERR_PTR(err);
276 out_up:
277         return dchild;
278 }
279 EXPORT_SYMBOL(simple_mkdir);
280
281 /* utility to rename a file */
282 int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
283                   char *oldname, char *newname)
284 {
285         struct dentry *dchild_old, *dchild_new;
286         int err = 0;
287         ENTRY;
288
289         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
290         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
291                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
292
293         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
294         if (IS_ERR(dchild_old))
295                 RETURN(PTR_ERR(dchild_old));
296
297         if (!dchild_old->d_inode)
298                 GOTO(put_old, err = -ENOENT);
299
300         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
301         if (IS_ERR(dchild_new))
302                 GOTO(put_old, err = PTR_ERR(dchild_new));
303
304         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
305                             dir->d_inode, dchild_new, mnt);
306
307         dput(dchild_new);
308 put_old:
309         dput(dchild_old);
310         RETURN(err);
311 }
312 EXPORT_SYMBOL(lustre_rename);
313
314 /*
315  * Read a file from within kernel context.  Prior to calling this
316  * function we should already have done a push_ctxt().
317  */
318 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
319 {
320         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
321         if (!file || !file->f_op || !file->f_op->read || !off)
322                 RETURN(-ENOSYS);
323
324         return file->f_op->read(file, buf, len, off);
325 }
326 EXPORT_SYMBOL(lustre_fread);
327
328 /*
329  * Write a file from within kernel context.  Prior to calling this
330  * function we should already have done a push_ctxt().
331  */
332 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
333 {
334         ENTRY;
335         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
336         if (!file)
337                 RETURN(-ENOENT);
338         if (!file->f_op)
339                 RETURN(-ENOSYS);
340         if (!off)
341                 RETURN(-EINVAL);
342
343         if (!file->f_op->write)
344                 RETURN(-EROFS);
345
346         RETURN(file->f_op->write(file, buf, len, off));
347 }
348 EXPORT_SYMBOL(lustre_fwrite);
349
350 /*
351  * Sync a file from within kernel context.  Prior to calling this
352  * function we should already have done a push_ctxt().
353  */
354 int lustre_fsync(struct file *file)
355 {
356         ENTRY;
357         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
358         if (!file || !file->f_op || !file->f_op->fsync)
359                 RETURN(-ENOSYS);
360
361         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
362 }
363 EXPORT_SYMBOL(lustre_fsync);
364
/*
 * Open dentry de on the mount ctxt->pwdmnt with the given open flags.
 * A reference on the mount is taken before it is handed to dentry_open().
 *
 * Note: dput(de) will be called if there is an error.
 * NOTE(review): that dput() is not done in this function; it is presumably
 * performed inside dentry_open() -- confirm against the target kernel.
 */
struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
                             int flags)
{
        mntget(ctxt->pwdmnt);
        return dentry_open(de, ctxt->pwdmnt, flags);
}
EXPORT_SYMBOL(l_dentry_open);
373
374 #ifdef HAVE_VFS_READDIR_U64_INO
375 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
376                      u64 ino, unsigned int d_type)
377 #else
378 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
379                      ino_t ino, unsigned int d_type)
380 #endif
381 {
382         struct l_linux_dirent *dirent;
383         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
384
385         dirent = buf->lrc_dirent;
386         if (dirent)
387                dirent->lld_off = offset;
388
389         OBD_ALLOC(dirent, sizeof(*dirent));
390
391         if (!dirent)
392                 return -ENOMEM;
393
394         list_add_tail(&dirent->lld_list, buf->lrc_list);
395
396         buf->lrc_dirent = dirent;
397         dirent->lld_ino = ino;
398         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
399         memcpy(dirent->lld_name, name, namlen);
400
401         return 0;
402 }
403
404 long l_readdir(struct file *file, struct list_head *dentry_list)
405 {
406         struct l_linux_dirent *lastdirent;
407         struct l_readdir_callback buf;
408         int error;
409
410         buf.lrc_dirent = NULL;
411         buf.lrc_list = dentry_list;
412
413         error = vfs_readdir(file, l_filldir, &buf);
414         if (error < 0)
415                 return error;
416
417         lastdirent = buf.lrc_dirent;
418         if (lastdirent)
419                 lastdirent->lld_off = file->f_pos;
420
421         return 0;
422 }
423 EXPORT_SYMBOL(l_readdir);
424
425 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
426                  struct iattr *newattrs)
427 {
428         int rc;
429
430         LOCK_INODE_MUTEX(dchild->d_inode);
431 #ifdef HAVE_SECURITY_PLUG
432         rc = notify_change(dchild, mnt, newattrs);
433 #else
434         rc = notify_change(dchild, newattrs);
435 #endif
436         UNLOCK_INODE_MUTEX(dchild->d_inode);
437         return rc;
438 }
439 EXPORT_SYMBOL(l_notify_change);
440
441 /* utility to truncate a file */
442 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
443                  char *name, loff_t length)
444 {
445         struct dentry *dchild;
446         struct iattr newattrs;
447         int err = 0;
448         ENTRY;
449
450         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
451                name, (long long)length);
452         dchild = ll_lookup_one_len(name, dir, strlen(name));
453         if (IS_ERR(dchild))
454                 GOTO(out, err = PTR_ERR(dchild));
455
456         if (dchild->d_inode) {
457                 int old_mode = dchild->d_inode->i_mode;
458                 if (S_ISDIR(old_mode)) {
459                         CERROR("found %s (%lu/%u) is mode %o\n", name,
460                                dchild->d_inode->i_ino,
461                                dchild->d_inode->i_generation, old_mode);
462                         GOTO(out_dput, err = -EISDIR);
463                 }
464
465                 newattrs.ia_size = length;
466                 newattrs.ia_valid = ATTR_SIZE;
467                 err = l_notify_change(mnt, dchild, &newattrs);
468         }
469         EXIT;
470 out_dput:
471         dput(dchild);
472 out:
473         return err;
474 }
475 EXPORT_SYMBOL(simple_truncate);
476
477 #ifdef LUSTRE_KERNEL_VERSION
478 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
479 #error rdonly patchset must be updated [cfs bz11248]
480 #endif
481 void dev_set_rdonly(lvfs_sbdev_type dev);
482 int dev_check_rdonly(lvfs_sbdev_type dev);
483
484 void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
485 {
486         if (jdev && (jdev != dev)) {
487                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
488                        (long)jdev);
489                 dev_set_rdonly(jdev);
490         }
491         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
492         dev_set_rdonly(dev);
493 }
494
/*
 * Exported wrapper around dev_check_rdonly(): returns whatever
 * dev_check_rdonly() reports for dev (presumably non-zero when the device
 * has been marked read-only -- confirm against the rdonly kernel patch).
 */
int lvfs_check_rdonly(lvfs_sbdev_type dev)
{
        return dev_check_rdonly(dev);
}
499
500 EXPORT_SYMBOL(__lvfs_set_rdonly);
501 EXPORT_SYMBOL(lvfs_check_rdonly);
502
503 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
504 {
505         char *write_page = NULL;
506         loff_t offset = 0;
507         int rc = 0;
508         ENTRY;
509
510         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
511         if (!write_page)
512                 RETURN(-ENOMEM);
513
514         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
515
516         OBD_FREE(write_page, CFS_PAGE_SIZE);
517
518         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
519         RETURN(rc);
520 }
521 EXPORT_SYMBOL(lvfs_check_io_health);
522 #endif /* LUSTRE_KERNEL_VERSION */
523
524 void obd_update_maxusage()
525 {
526         __u64 max1, max2;
527
528         max1 = obd_pages_sum();
529         max2 = obd_memory_sum();
530
531         spin_lock(&obd_updatemax_lock);
532         if (max1 > obd_max_pages)
533                 obd_max_pages = max1;
534         if (max2 > obd_max_alloc)
535                 obd_max_alloc = max2;
536         spin_unlock(&obd_updatemax_lock);
537
538 }
539
540 __u64 obd_memory_max(void)
541 {
542         __u64 ret;
543
544         spin_lock(&obd_updatemax_lock);
545         ret = obd_max_alloc;
546         spin_unlock(&obd_updatemax_lock);
547
548         return ret;
549 }
550
551 __u64 obd_pages_max(void)
552 {
553         __u64 ret;
554
555         spin_lock(&obd_updatemax_lock);
556         ret = obd_max_pages;
557         spin_unlock(&obd_updatemax_lock);
558
559         return ret;
560 }
561
562 EXPORT_SYMBOL(obd_update_maxusage);
563 EXPORT_SYMBOL(obd_pages_max);
564 EXPORT_SYMBOL(obd_memory_max);
565 EXPORT_SYMBOL(obd_memory);
566
#ifdef LPROCFS
/*
 * Read one field of the statistics counter lc without taking a lock.
 *
 * field selects which member is returned: the config word, the sum
 * (lc_sum + lc_sum_irq), min, max, "avg", sum of squares, or count.  An
 * unknown field, or a NULL lc, yields 0.
 *
 * The read is wrapped in a retry loop keyed on the la_entry / la_exit
 * counters of lc->lc_cntl, which are presumably advanced by the code that
 * updates the counter -- the updater is not in this file, confirm there.
 *
 * NOTE(review): the AVG case returns (lc_max - lc_min) / 2, i.e. half the
 * range rather than a mean -- confirm that this is intended.
 * NOTE(review): the loop repeats only while the sampled la_entry differs
 * from BOTH the current la_entry and la_exit; a read that overlaps an
 * update whose la_entry has not moved since sampling is not retried.
 * Confirm against the updater whether "||" was meant.
 * NOTE(review): RETURN() is used without a matching ENTRY.
 */
__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
                          enum lprocfs_fields_flags field)
{
        __s64 ret = 0;
        int centry;

        if (!lc)
                RETURN(0);
        do {
                /* sample the entry counter before reading the field */
                centry = atomic_read(&lc->lc_cntl.la_entry);

                switch (field) {
                        case LPROCFS_FIELDS_FLAGS_CONFIG:
                                ret = lc->lc_config;
                                break;
                        case LPROCFS_FIELDS_FLAGS_SUM:
                                ret = lc->lc_sum + lc->lc_sum_irq;
                                break;
                        case LPROCFS_FIELDS_FLAGS_MIN:
                                ret = lc->lc_min;
                                break;
                        case LPROCFS_FIELDS_FLAGS_MAX:
                                ret = lc->lc_max;
                                break;
                        case LPROCFS_FIELDS_FLAGS_AVG:
                                ret = (lc->lc_max - lc->lc_min)/2;
                                break;
                        case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
                                ret = lc->lc_sumsquare;
                                break;
                        case LPROCFS_FIELDS_FLAGS_COUNT:
                                ret = lc->lc_count;
                                break;
                        default:
                                break;
                };
        /* re-read if the counters moved while the field was being read */
        } while (centry != atomic_read(&lc->lc_cntl.la_entry) &&
                 centry != atomic_read(&lc->lc_cntl.la_exit));

        RETURN(ret);
}
EXPORT_SYMBOL(lprocfs_read_helper);
#endif /* LPROCFS */
611
/* Kernel module metadata: author, description and licence */
MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
MODULE_LICENSE("GPL");