Whamcloud - gitweb
LU-1302 mgs: mgs uses llog over OSD
[fs/lustre-release.git] / lustre / lvfs / lvfs_linux.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Whamcloud, Inc.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/lvfs_linux.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_FILTER
42
43 #include <linux/version.h>
44 #include <linux/fs.h>
45 #include <asm/unistd.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <linux/quotaops.h>
49 #include <linux/version.h>
50 #include <libcfs/libcfs.h>
51 #include <lustre_fsfilt.h>
52 #include <obd.h>
53 #include <linux/module.h>
54 #include <linux/init.h>
55 #include <linux/lustre_compat25.h>
56 #include <lvfs.h>
57 #include "lvfs_internal.h"
58
59 #include <obd.h>
60 #include <lustre_lib.h>
61 #include <lustre_quota.h>
62
63 __u64 obd_max_pages = 0;
64 __u64 obd_max_alloc = 0;
65 struct lprocfs_stats *obd_memory = NULL;
66 EXPORT_SYMBOL(obd_memory);
67 DEFINE_SPINLOCK(obd_updatemax_lock);
68 /* refine later and change to seqlock or simlar from libcfs */
69
/*
 * Run-context sanity checks.  These are only needed during development:
 * they expand to real assertions when OBD_CTXT_DEBUG is defined and to
 * empty statements otherwise.
 */
#ifndef OBD_CTXT_DEBUG
# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
#else
/* the context structure must carry the expected magic value */
# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
/* get_fs() must differ from get_ds() */
# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
                                              msg)
/* get_fs() must be equal to get_ds() */
# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
#endif
81
82 static void push_group_info(struct lvfs_run_ctxt *save,
83                             struct group_info *ginfo)
84 {
85         if (!ginfo) {
86                 save->ngroups = current_ngroups;
87                 current_ngroups = 0;
88         } else {
89                 struct cred *cred;
90                 task_lock(current);
91                 save->group_info = current_cred()->group_info;
92                 if ((cred = prepare_creds())) {
93                         cred->group_info = ginfo;
94                         commit_creds(cred);
95                 }
96                 task_unlock(current);
97         }
98 }
99
100 static void pop_group_info(struct lvfs_run_ctxt *save,
101                            struct group_info *ginfo)
102 {
103         if (!ginfo) {
104                 current_ngroups = save->ngroups;
105         } else {
106                 struct cred *cred;
107                 task_lock(current);
108                 if ((cred = prepare_creds())) {
109                         cred->group_info = save->group_info;
110                         commit_creds(cred);
111                 }
112                 task_unlock(current);
113         }
114 }
115
116 /* push / pop to root of obd store */
117 void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
118                struct lvfs_ucred *uc)
119 {
120         /* if there is underlaying dt_device then push_ctxt is not needed */
121         if (new_ctx->dt != NULL)
122                 return;
123
124         //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
125         ASSERT_CTXT_MAGIC(new_ctx->magic);
126         OBD_SET_CTXT_MAGIC(save);
127
128         save->fs = get_fs();
129         LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
130         LASSERT(d_refcount(new_ctx->pwd));
131         save->pwd = dget(cfs_fs_pwd(current->fs));
132         save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
133         save->luc.luc_umask = cfs_curproc_umask();
134         save->ngroups = current_cred()->group_info->ngroups;
135
136         LASSERT(save->pwd);
137         LASSERT(save->pwdmnt);
138         LASSERT(new_ctx->pwd);
139         LASSERT(new_ctx->pwdmnt);
140
141         if (uc) {
142                 struct cred *cred;
143                 save->luc.luc_uid = current_uid();
144                 save->luc.luc_gid = current_gid();
145                 save->luc.luc_fsuid = current_fsuid();
146                 save->luc.luc_fsgid = current_fsgid();
147                 save->luc.luc_cap = current_cap();
148
149                 if ((cred = prepare_creds())) {
150                         cred->uid = uc->luc_uid;
151                         cred->gid = uc->luc_gid;
152                         cred->fsuid = uc->luc_fsuid;
153                         cred->fsgid = uc->luc_fsgid;
154                         cred->cap_effective = uc->luc_cap;
155                         commit_creds(cred);
156                 }
157
158                 push_group_info(save,
159                                 uc->luc_ginfo ?:
160                                 uc->luc_identity ? uc->luc_identity->mi_ginfo :
161                                                    NULL);
162         }
163         current->fs->umask = 0; /* umask already applied on client */
164         set_fs(new_ctx->fs);
165         ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
166 }
167 EXPORT_SYMBOL(push_ctxt);
168
169 void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
170               struct lvfs_ucred *uc)
171 {
172         /* if there is underlaying dt_device then pop_ctxt is not needed */
173         if (new_ctx->dt != NULL)
174                 return;
175
176         ASSERT_CTXT_MAGIC(saved->magic);
177         ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
178
179         LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
180                  cfs_fs_pwd(current->fs), new_ctx->pwd);
181         LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
182                  cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
183
184         set_fs(saved->fs);
185         ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
186
187         dput(saved->pwd);
188         mntput(saved->pwdmnt);
189         current->fs->umask = saved->luc.luc_umask;
190         if (uc) {
191                 struct cred *cred;
192                 if ((cred = prepare_creds())) {
193                         cred->uid = saved->luc.luc_uid;
194                         cred->gid = saved->luc.luc_gid;
195                         cred->fsuid = saved->luc.luc_fsuid;
196                         cred->fsgid = saved->luc.luc_fsgid;
197                         cred->cap_effective = saved->luc.luc_cap;
198                         commit_creds(cred);
199                 }
200
201                 pop_group_info(saved,
202                                uc->luc_ginfo ?:
203                                uc->luc_identity ? uc->luc_identity->mi_ginfo :
204                                                   NULL);
205         }
206 }
207 EXPORT_SYMBOL(pop_ctxt);
208
209 /* utility to make a file */
210 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix)
211 {
212         struct dentry *dchild;
213         int err = 0;
214         ENTRY;
215
216         // ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n");
217         CDEBUG(D_INODE, "creating file %.*s\n", (int)strlen(name), name);
218
219         dchild = ll_lookup_one_len(name, dir, strlen(name));
220         if (IS_ERR(dchild))
221                 GOTO(out_up, dchild);
222
223         if (dchild->d_inode) {
224                 int old_mode = dchild->d_inode->i_mode;
225                 if (!S_ISREG(old_mode))
226                         GOTO(out_err, err = -EEXIST);
227
228                 /* Fixup file permissions if necessary */
229                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
230                         CWARN("fixing permissions on %s from %o to %o\n",
231                               name, old_mode, mode);
232                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
233                                                   (old_mode & ~S_IALLUGO);
234                         mark_inode_dirty(dchild->d_inode);
235                 }
236                 GOTO(out_up, dchild);
237         }
238
239         err = ll_vfs_create(dir->d_inode, dchild, (mode & ~S_IFMT) | S_IFREG,
240                             NULL);
241         if (err)
242                 GOTO(out_err, err);
243
244         RETURN(dchild);
245
246 out_err:
247         dput(dchild);
248         dchild = ERR_PTR(err);
249 out_up:
250         return dchild;
251 }
252 EXPORT_SYMBOL(simple_mknod);
253
254 /* utility to make a directory */
255 struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
256                             const char *name, int mode, int fix)
257 {
258         struct dentry *dchild;
259         int err = 0;
260         ENTRY;
261
262         // ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n");
263         CDEBUG(D_INODE, "creating directory %.*s\n", (int)strlen(name), name);
264         dchild = ll_lookup_one_len(name, dir, strlen(name));
265         if (IS_ERR(dchild))
266                 GOTO(out_up, dchild);
267
268         if (dchild->d_inode) {
269                 int old_mode = dchild->d_inode->i_mode;
270                 if (!S_ISDIR(old_mode)) {
271                         CERROR("found %s (%lu/%u) is mode %o\n", name,
272                                dchild->d_inode->i_ino,
273                                dchild->d_inode->i_generation, old_mode);
274                         GOTO(out_err, err = -ENOTDIR);
275                 }
276
277                 /* Fixup directory permissions if necessary */
278                 if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
279                         CDEBUG(D_CONFIG,
280                                "fixing permissions on %s from %o to %o\n",
281                                name, old_mode, mode);
282                         dchild->d_inode->i_mode = (mode & S_IALLUGO) |
283                                                   (old_mode & ~S_IALLUGO);
284                         mark_inode_dirty(dchild->d_inode);
285                 }
286                 GOTO(out_up, dchild);
287         }
288
289         err = ll_vfs_mkdir(dir->d_inode, dchild, mnt, mode);
290         if (err)
291                 GOTO(out_err, err);
292
293         RETURN(dchild);
294
295 out_err:
296         dput(dchild);
297         dchild = ERR_PTR(err);
298 out_up:
299         return dchild;
300 }
301 EXPORT_SYMBOL(simple_mkdir);
302
303 /* utility to rename a file */
304 int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
305                   char *oldname, char *newname)
306 {
307         struct dentry *dchild_old, *dchild_new;
308         int err = 0;
309         ENTRY;
310
311         ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
312         CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
313                (int)strlen(oldname), oldname, (int)strlen(newname), newname);
314
315         dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
316         if (IS_ERR(dchild_old))
317                 RETURN(PTR_ERR(dchild_old));
318
319         if (!dchild_old->d_inode)
320                 GOTO(put_old, err = -ENOENT);
321
322         dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
323         if (IS_ERR(dchild_new))
324                 GOTO(put_old, err = PTR_ERR(dchild_new));
325
326         err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
327                             dir->d_inode, dchild_new, mnt);
328
329         dput(dchild_new);
330 put_old:
331         dput(dchild_old);
332         RETURN(err);
333 }
334 EXPORT_SYMBOL(lustre_rename);
335
336 /*
337  * Read a file from within kernel context.  Prior to calling this
338  * function we should already have done a push_ctxt().
339  */
340 int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
341 {
342         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
343         if (!file || !file->f_op || !file->f_op->read || !off)
344                 RETURN(-ENOSYS);
345
346         return file->f_op->read(file, buf, len, off);
347 }
348 EXPORT_SYMBOL(lustre_fread);
349
350 /*
351  * Write a file from within kernel context.  Prior to calling this
352  * function we should already have done a push_ctxt().
353  */
354 int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
355 {
356         ENTRY;
357         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
358         if (!file)
359                 RETURN(-ENOENT);
360         if (!file->f_op)
361                 RETURN(-ENOSYS);
362         if (!off)
363                 RETURN(-EINVAL);
364
365         if (!file->f_op->write)
366                 RETURN(-EROFS);
367
368         RETURN(file->f_op->write(file, buf, len, off));
369 }
370 EXPORT_SYMBOL(lustre_fwrite);
371
372 /*
373  * Sync a file from within kernel context.  Prior to calling this
374  * function we should already have done a push_ctxt().
375  */
376 int lustre_fsync(struct file *file)
377 {
378         ENTRY;
379         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
380         if (!file || !file->f_op || !file->f_op->fsync)
381                 RETURN(-ENOSYS);
382
383         RETURN(cfs_do_fsync(file, 0));
384 }
385 EXPORT_SYMBOL(lustre_fsync);
386
387 /* Note: dput(dchild) will be called if there is an error */
388 struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
389                              int flags)
390 {
391         mntget(ctxt->pwdmnt);
392         return ll_dentry_open(de, ctxt->pwdmnt, flags, current_cred());
393 }
394 EXPORT_SYMBOL(l_dentry_open);
395
396 static int l_filldir(void *__buf, const char *name, int namlen, loff_t offset,
397                      u64 ino, unsigned int d_type)
398 {
399         struct l_linux_dirent *dirent;
400         struct l_readdir_callback *buf = (struct l_readdir_callback *)__buf;
401
402         dirent = buf->lrc_dirent;
403         if (dirent)
404                dirent->lld_off = offset;
405
406         OBD_ALLOC(dirent, sizeof(*dirent));
407
408         if (!dirent)
409                 return -ENOMEM;
410
411         cfs_list_add_tail(&dirent->lld_list, buf->lrc_list);
412
413         buf->lrc_dirent = dirent;
414         dirent->lld_ino = ino;
415         LASSERT(sizeof(dirent->lld_name) >= namlen + 1);
416         memcpy(dirent->lld_name, name, namlen);
417
418         return 0;
419 }
420
421 long l_readdir(struct file *file, cfs_list_t *dentry_list)
422 {
423         struct l_linux_dirent *lastdirent;
424         struct l_readdir_callback buf;
425         int error;
426
427         buf.lrc_dirent = NULL;
428         buf.lrc_list = dentry_list;
429
430         error = vfs_readdir(file, l_filldir, &buf);
431         if (error < 0)
432                 return error;
433
434         lastdirent = buf.lrc_dirent;
435         if (lastdirent)
436                 lastdirent->lld_off = file->f_pos;
437
438         return 0;
439 }
440 EXPORT_SYMBOL(l_readdir);
441
442 int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
443                     struct iattr *newattrs)
444 {
445         int rc;
446
447         mutex_lock(&dchild->d_inode->i_mutex);
448 #ifdef HAVE_SECURITY_PLUG
449         rc = notify_change(dchild, mnt, newattrs);
450 #else
451         rc = notify_change(dchild, newattrs);
452 #endif
453         mutex_unlock(&dchild->d_inode->i_mutex);
454         return rc;
455 }
456 EXPORT_SYMBOL(l_notify_change);
457
458 /* utility to truncate a file */
459 int simple_truncate(struct dentry *dir, struct vfsmount *mnt, 
460                  char *name, loff_t length)
461 {
462         struct dentry *dchild;
463         struct iattr newattrs;
464         int err = 0;
465         ENTRY;
466
467         CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name),
468                name, (long long)length);
469         dchild = ll_lookup_one_len(name, dir, strlen(name));
470         if (IS_ERR(dchild))
471                 GOTO(out, err = PTR_ERR(dchild));
472
473         if (dchild->d_inode) {
474                 int old_mode = dchild->d_inode->i_mode;
475                 if (S_ISDIR(old_mode)) {
476                         CERROR("found %s (%lu/%u) is mode %o\n", name,
477                                dchild->d_inode->i_ino,
478                                dchild->d_inode->i_generation, old_mode);
479                         GOTO(out_dput, err = -EISDIR);
480                 }
481
482                 newattrs.ia_size = length;
483                 newattrs.ia_valid = ATTR_SIZE;
484                 err = l_notify_change(mnt, dchild, &newattrs);
485         }
486         EXIT;
487 out_dput:
488         dput(dchild);
489 out:
490         return err;
491 }
492 EXPORT_SYMBOL(simple_truncate);
493
494 int __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
495 {
496 #ifdef HAVE_DEV_SET_RDONLY
497         if (jdev && (jdev != dev)) {
498                 CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
499                        (long)jdev);
500                 dev_set_rdonly(jdev);
501         }
502         CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
503         dev_set_rdonly(dev);
504
505         return 0;
506 #else
507         CERROR("DEV %lx CANNOT BE SET READONLY\n", (long)dev);
508
509         return -EOPNOTSUPP;
510 #endif
511 }
512 EXPORT_SYMBOL(__lvfs_set_rdonly);
513
514 int lvfs_check_rdonly(lvfs_sbdev_type dev)
515 {
516 #ifdef HAVE_DEV_SET_RDONLY
517         return dev_check_rdonly(dev);
518 #else
519         return 0;
520 #endif
521 }
522 EXPORT_SYMBOL(lvfs_check_rdonly);
523
524 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
525 {
526         char *write_page = NULL;
527         loff_t offset = 0;
528         int rc = 0;
529         ENTRY;
530
531         OBD_ALLOC(write_page, CFS_PAGE_SIZE);
532         if (!write_page)
533                 RETURN(-ENOMEM);
534
535         rc = fsfilt_write_record(obd, file, write_page, CFS_PAGE_SIZE, &offset, 1);
536
537         OBD_FREE(write_page, CFS_PAGE_SIZE);
538
539         CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc);
540         RETURN(rc);
541 }
542 EXPORT_SYMBOL(lvfs_check_io_health);
543
544 void obd_update_maxusage()
545 {
546         __u64 max1, max2;
547
548         max1 = obd_pages_sum();
549         max2 = obd_memory_sum();
550
551         cfs_spin_lock(&obd_updatemax_lock);
552         if (max1 > obd_max_pages)
553                 obd_max_pages = max1;
554         if (max2 > obd_max_alloc)
555                 obd_max_alloc = max2;
556         cfs_spin_unlock(&obd_updatemax_lock);
557
558 }
559 EXPORT_SYMBOL(obd_update_maxusage);
560
561 __u64 obd_memory_max(void)
562 {
563         __u64 ret;
564
565         cfs_spin_lock(&obd_updatemax_lock);
566         ret = obd_max_alloc;
567         cfs_spin_unlock(&obd_updatemax_lock);
568
569         return ret;
570 }
571 EXPORT_SYMBOL(obd_memory_max);
572
573 __u64 obd_pages_max(void)
574 {
575         __u64 ret;
576
577         cfs_spin_lock(&obd_updatemax_lock);
578         ret = obd_max_pages;
579         cfs_spin_unlock(&obd_updatemax_lock);
580
581         return ret;
582 }
583 EXPORT_SYMBOL(obd_pages_max);
584
#ifdef LPROCFS
/*
 * Read one field of statistics counter \a lc.
 *
 * \param lc     counter to read; NULL yields 0
 * \param field  which value to extract
 *
 * The value is re-read for as long as la_entry changed during the read and
 * the entry count sampled beforehand differs from la_exit.  An unknown
 * \a field yields 0.
 *
 * Plain return statements are used because the function has no ENTRY, so
 * RETURN() would leave the trace output unbalanced.
 */
__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
                          enum lprocfs_fields_flags field)
{
        __s64 ret = 0;
        int centry;

        if (!lc)
                return 0;

        do {
                centry = cfs_atomic_read(&lc->lc_cntl.la_entry);

                switch (field) {
                case LPROCFS_FIELDS_FLAGS_CONFIG:
                        ret = lc->lc_config;
                        break;
                case LPROCFS_FIELDS_FLAGS_SUM:
                        ret = lc->lc_sum + lc->lc_sum_irq;
                        break;
                case LPROCFS_FIELDS_FLAGS_MIN:
                        ret = lc->lc_min;
                        break;
                case LPROCFS_FIELDS_FLAGS_MAX:
                        ret = lc->lc_max;
                        break;
                case LPROCFS_FIELDS_FLAGS_AVG:
                        /* NOTE(review): this is half the min..max spread,
                         * not sum / count; left unchanged because callers
                         * may rely on it - confirm the intended meaning */
                        ret = (lc->lc_max - lc->lc_min) / 2;
                        break;
                case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
                        ret = lc->lc_sumsquare;
                        break;
                case LPROCFS_FIELDS_FLAGS_COUNT:
                        ret = lc->lc_count;
                        break;
                default:
                        break;
                }
        } while (centry != cfs_atomic_read(&lc->lc_cntl.la_entry) &&
                 centry != cfs_atomic_read(&lc->lc_cntl.la_exit));

        return ret;
}
EXPORT_SYMBOL(lprocfs_read_helper);
#endif /* LPROCFS */
629
630 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
631 MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
632 MODULE_LICENSE("GPL");