From 2be7bd991a12fc02f13523bee0665fe24a796219 Mon Sep 17 00:00:00 2001 From: braam Date: Wed, 9 Jul 2003 18:39:52 +0000 Subject: [PATCH] - another merge of b_devel into b_ad --- lustre/include/linux/lustre_mgmt.h | 34 + .../patches/vfs_intent-2.4.18-18-chaos65.patch | 1712 ++++++++++++++++++++ .../pc/vfs_intent-2.4.18-18-chaos65.pc | 12 + lustre/ldlm/ldlm_lib.c | 16 +- lustre/lov/lov_internal.h | 12 + lustre/mds/mds_lib.c | 1 + lustre/mgmt/.cvsignore | 9 + lustre/mgmt/mgmt_cli.c | 269 +++ lustre/obdfilter/filter_internal.h | 122 ++ lustre/obdfilter/filter_io.c | 764 +++++++++ lustre/obdfilter/filter_log.c | 379 +++++ lustre/obdfilter/filter_san.c | 130 ++ 12 files changed, 3456 insertions(+), 4 deletions(-) create mode 100644 lustre/include/linux/lustre_mgmt.h create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch create mode 100644 lustre/kernel_patches/pc/vfs_intent-2.4.18-18-chaos65.pc create mode 100644 lustre/lov/lov_internal.h create mode 100644 lustre/mgmt/.cvsignore create mode 100644 lustre/mgmt/mgmt_cli.c create mode 100644 lustre/obdfilter/filter_internal.h create mode 100644 lustre/obdfilter/filter_io.c create mode 100644 lustre/obdfilter/filter_log.c create mode 100644 lustre/obdfilter/filter_san.c diff --git a/lustre/include/linux/lustre_mgmt.h b/lustre/include/linux/lustre_mgmt.h new file mode 100644 index 0000000..f3f4a9d --- /dev/null +++ b/lustre/include/linux/lustre_mgmt.h @@ -0,0 +1,34 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This code is issued under the GNU General Public License. + * See the file COPYING in this distribution + */ + +#ifndef LUSTRE_MGMT_H +#define LUSTRE_MGMT_H + +#define LUSTRE_MGMTCLI_NAME "mgmtcli" + +/* For the convenience and type-safety of inter_module_getters. 
*/ + +struct obd_device; +struct obd_uuid; + +/* + * The caller is responsible for ensuring that relevant_uuid -- if non-NULL -- + * points to valid memory until deregister is called. If relevant_uuid is NULL, + * all management events will be propagated to the registrant. Notice that + * deregister doesn't take a relevant_uuid-matching parameter; I should probably + * fix that at some point. + */ +typedef int (*mgmtcli_register_for_events_t)(struct obd_device *mgmt_obd, + struct obd_device *notify_obd, + struct obd_uuid *relevant_uuid); + +typedef int (*mgmtcli_deregister_for_events_t)(struct obd_device *mgmt_obd, + struct obd_device *notify_obd); + +#endif /* LUSTRE_MGMT_H */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch new file mode 100644 index 0000000..71ad1bb --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch @@ -0,0 +1,1712 @@ + fs/dcache.c | 19 ++ + fs/exec.c | 15 +- + fs/namei.c | 355 +++++++++++++++++++++++++++++++++++++++++-------- + fs/namespace.c | 30 +++- + fs/open.c | 122 ++++++++++++++-- + fs/proc/base.c | 3 + fs/stat.c | 27 ++- + include/linux/dcache.h | 53 +++++++ + include/linux/fs.h | 31 ++++ + kernel/exit.c | 3 + kernel/fork.c | 3 + kernel/ksyms.c | 1 + 12 files changed, 563 insertions(+), 99 deletions(-) + +--- linux-2.4.18-p4smp/fs/exec.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/exec.c 2003-07-08 14:45:17.000000000 -0600 +@@ -117,8 +117,9 @@ asmlinkage long sys_uselib(const char * + struct file * file; + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; + +- error = user_path_walk(library, &nd); ++ error = user_path_walk_it(library, &nd, &it); + if (error) + goto out; + +@@ -130,7 +131,8 @@ asmlinkage long sys_uselib(const char * + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, 
O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -359,8 +361,9 @@ struct file *open_exec(const char *name) + struct inode *inode; + struct file *file; + int err = 0; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; + +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; +@@ -372,7 +375,8 @@ struct file *open_exec(const char *name) + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +@@ -384,6 +388,7 @@ out: + return file; + } + } ++ intent_release(&it); + path_release(&nd); + } + goto out; +@@ -1104,7 +1109,7 @@ int do_coredump(long signr, struct pt_re + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +--- linux-2.4.18-p4smp/fs/dcache.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/dcache.c 2003-07-08 14:45:17.000000000 -0600 +@@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -859,13 +866,19 @@ void d_delete(struct dentry * dentry) + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ +--- linux-2.4.18-p4smp/fs/namespace.c~vfs_intent-2.4.18-18-chaos65 2002-06-25 22:16:14.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/namespace.c 2003-07-08 14:45:17.000000000 -0600 +@@ -99,6 +99,7 @@ static void detach_mnt(struct vfsmount * + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -110,6 +111,7 @@ static void attach_mnt(struct vfsmount * + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -485,14 +487,17 @@ static int do_loopback(struct nameidata + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; +- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); +- if (err) ++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down_write(&current->namespace->sem); + err = -EINVAL; +@@ -515,6 +520,7 @@ static int do_loopback(struct nameidata + } + + up_write(&current->namespace->sem); ++ intent_release(&it); + 
path_release(&old_nd); + return err; + } +@@ -698,7 +704,8 @@ long do_mount(char * dev_name, char * di + unsigned long flags, void *data_page) + { + struct nameidata nd; +- int retval = 0; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ int retval = 0; + int mnt_flags = 0; + + /* Discard magic */ +@@ -722,10 +729,11 @@ long do_mount(char * dev_name, char * di + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); + + /* ... and get the mountpoint */ +- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); +- if (retval) ++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; +- ++ } + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + data_page); +@@ -736,6 +744,8 @@ long do_mount(char * dev_name, char * di + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -901,6 +911,8 @@ asmlinkage long sys_pivot_root(const cha + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + int error; + + if (!capable(CAP_SYS_ADMIN)) +@@ -908,14 +920,14 @@ asmlinkage long sys_pivot_root(const cha + + lock_kernel(); + +- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); ++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); + if (error) + goto out0; + error = -EINVAL; + if (!check_mnt(new_nd.mnt)) + goto out1; + +- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); ++ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); + if (error) + goto out1; + +@@ -970,8 +982,10 @@ out2: + up(&old_nd.dentry->d_inode->i_zombie); + 
up_write(&current->namespace->sem); + path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); +--- linux-2.4.18-p4smp/fs/namei.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/namei.c 2003-07-08 14:45:17.000000000 -0600 +@@ -94,6 +94,13 @@ + * XEmacs seems to be relying on it... + */ + ++void intent_release(struct lookup_intent *it) ++{ ++ if (it && it->it_op_release) ++ it->it_op_release(it); ++ ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd) + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * dentry = d_lookup(parent, name); + ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); +@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str + * make sure that nobody added the entry to the dcache in the meantime.. 
+ * SMP-safe + */ +-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * result; + struct inode *dir = parent->d_inode; + ++again: ++ + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc + result = ERR_PTR(-ENOMEM); + if (dentry) { + lock_kernel(); ++ if (dir->i_op->lookup_it) ++ result = dir->i_op->lookup_it(dir, dentry, it, flags); ++ else + result = dir->i_op->lookup(dir, dentry); + unlock_kernel(); + if (result) +@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc + dput(result); + result = ERR_PTR(-ENOENT); + } ++ } else if (result->d_op && result->d_op->d_revalidate_it) { ++ if (!result->d_op->d_revalidate_it(result, flags, it) && ++ !d_invalidate(result)) { ++ dput(result); ++ goto again; ++ } + } + return result; + } +@@ -334,7 +362,8 @@ int max_recursive_link = 5; + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. 
+ */ +-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++ struct lookup_intent *it) + { + int err; + if (current->link_count >= max_recursive_link) +@@ -348,10 +377,21 @@ static inline int do_follow_link(struct + current->link_count++; + current->total_link_count++; + UPDATE_ATIME(dentry->d_inode); +- err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ nd->it = it; ++ if (dentry->d_inode->i_op->follow_link2) ++ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); ++ else ++ err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(it); ++ path_release(nd); ++ err = -ENOLINK; ++ } + current->link_count--; + return err; + loop: ++ intent_release(it); + path_release(nd); + return -ELOOP; + } +@@ -381,15 +421,26 @@ int follow_up(struct vfsmount **mnt, str + return __follow_up(mnt, dentry); + } + +-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry) ++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry, ++ struct lookup_intent *it) + { + struct vfsmount *mounted; + + spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); + if (mounted) { ++ int opc = 0, mode = 0; + *mnt = mntget(mounted); + spin_unlock(&dcache_lock); ++ if (it) { ++ opc = it->it_op; ++ mode = it->it_mode; ++ } ++ intent_release(it); ++ if (it) { ++ it->it_op = opc; ++ it->it_mode = mode; ++ } + dput(*dentry); + mntput(mounted->mnt_parent); + *dentry = dget(mounted->mnt_root); +@@ -401,7 +452,7 @@ static inline int __follow_down(struct v + + int follow_down(struct vfsmount **mnt, struct dentry **dentry) + { +- return __follow_down(mnt,dentry); ++ return __follow_down(mnt,dentry,NULL); + } + + static inline void follow_dotdot(struct nameidata *nd) +@@ -437,7 +488,7 @@ static inline void 
follow_dotdot(struct + mntput(nd->mnt); + nd->mnt = parent; + } +- while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry)) ++ while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry, NULL)) + ; + } + +@@ -449,7 +500,8 @@ static inline void follow_dotdot(struct + * + * We expect 'base' to be positive and a directory. + */ +-int link_path_walk(const char * name, struct nameidata *nd) ++int link_path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it) + { + struct dentry *dentry; + struct inode *inode; +@@ -526,18 +578,18 @@ int link_path_walk(const char * name, st + break; + } + /* This does the actual lookups.. */ +- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + if (!dentry) { + err = -EWOULDBLOCKIO; + if (atomic) + break; +- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; + } + /* Check mountpoints.. 
*/ +- while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL)) + ; + + err = -ENOENT; +@@ -548,8 +600,8 @@ int link_path_walk(const char * name, st + if (!inode->i_op) + goto out_dput; + +- if (inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ if (inode->i_op->follow_link || inode->i_op->follow_link2) { ++ err = do_follow_link(dentry, nd, NULL); + dput(dentry); + if (err) + goto return_err; +@@ -565,7 +617,7 @@ int link_path_walk(const char * name, st + nd->dentry = dentry; + } + err = -ENOTDIR; +- if (!inode->i_op->lookup) ++ if (!inode->i_op->lookup && !inode->i_op->lookup_it) + break; + continue; + /* here ends the main loop */ +@@ -592,22 +644,23 @@ last_component: + if (err < 0) + break; + } +- dentry = cached_lookup(nd->dentry, &this, 0); ++ dentry = cached_lookup(nd->dentry, &this, 0, it); + if (!dentry) { + err = -EWOULDBLOCKIO; + if (atomic) + break; +- dentry = real_lookup(nd->dentry, &this, 0); ++ dentry = real_lookup(nd->dentry, &this, 0, it); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; + } +- while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, it)) + ; + inode = dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) +- && inode && inode->i_op && inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ && inode && inode->i_op && ++ (inode->i_op->follow_link || inode->i_op->follow_link2)) { ++ err = do_follow_link(dentry, nd, it); + dput(dentry); + if (err) + goto return_err; +@@ -621,7 +674,8 @@ last_component: + goto no_inode; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; +- if (!inode->i_op || !inode->i_op->lookup) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup_it)) + break; + } + goto return_base; +@@ -645,7 +699,24 @@ return_reval: + * Check the cached dentry for staleness. 
+ */ + dentry = nd->dentry; +- if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { ++ revalidate_again: ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, NULL); ++ d_invalidate(dentry); ++ dput(dentry); ++ dentry = new; ++ goto revalidate_again; ++ } ++ } ++ else if (dentry && dentry->d_op && dentry->d_op->d_revalidate){ + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, 0)) { + d_invalidate(dentry); +@@ -658,15 +729,28 @@ out_dput: + dput(dentry); + break; + } ++ if (err) ++ intent_release(it); + path_release(nd); + return_err: + return err; + } + ++int link_path_walk(const char * name, struct nameidata *nd) ++{ ++ return link_path_walk_it(name, nd, NULL); ++} ++ ++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) ++{ ++ current->total_link_count = 0; ++ return link_path_walk_it(name, nd, it); ++} ++ + int path_walk(const char * name, struct nameidata *nd) + { + current->total_link_count = 0; +- return link_path_walk(name, nd); ++ return link_path_walk_it(name, nd, NULL); + } + + /* SMP-safe */ +@@ -751,6 +835,17 @@ walk_init_root(const char *name, struct + } + + /* SMP-safe */ ++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ int error = 0; ++ if (path_init(path, flags, nd)) ++ error = path_walk_it(path, nd, it); ++ return error; ++} ++ ++ ++/* SMP-safe */ + int path_lookup(const char *path, unsigned flags, struct nameidata *nd) + { + int error = 0; +@@ -765,6 +860,7 @@ int path_init(const char *name, unsigned + { + nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ + nd->flags = flags; ++ nd->it = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(&current->fs->lock); +@@ -779,7 +875,8 @@ int path_init(const char *name, unsigned + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, ++ struct lookup_intent *it) + { + struct dentry * dentry; + struct inode *inode; +@@ -802,13 +899,16 @@ struct dentry * lookup_hash(struct qstr + goto out; + } + +- dentry = cached_lookup(base, name, 0); ++ dentry = cached_lookup(base, name, 0, it); + if (!dentry) { + struct dentry *new = d_alloc(base, name); + dentry = ERR_PTR(-ENOMEM); + if (!new) + goto out; + lock_kernel(); ++ if (inode->i_op->lookup_it) ++ dentry = inode->i_op->lookup_it(inode, new, it, 0); ++ else + dentry = inode->i_op->lookup(inode, new); + unlock_kernel(); + if (!dentry) +@@ -820,6 +920,12 @@ out: + return dentry; + } + ++struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++{ ++ return lookup_hash_it(name, base, NULL); ++} ++ ++ + /* SMP-safe */ + struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) + { +@@ -841,7 +947,7 @@ struct dentry * lookup_one_len(const cha + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return lookup_hash_it(&this, base, NULL); + access: + return ERR_PTR(-EACCES); + } +@@ -872,6 +978,23 @@ int __user_walk(const char *name, unsign + return err; + } + ++int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ char *tmp; ++ int err; ++ ++ tmp = getname(name); ++ err = PTR_ERR(tmp); ++ if (!IS_ERR(tmp)) { ++ err = 0; ++ if (path_init(tmp, flags, nd)) ++ err = path_walk_it(tmp, nd, it); ++ putname(tmp); ++ } ++ return err; ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. 
+@@ -969,6 +1092,37 @@ static inline int lookup_flags(unsigned + return retval; + } + ++static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, ++ struct lookup_intent *it) ++{ ++ int error; ++ ++ mode &= S_IALLUGO; ++ mode |= S_IFREG; ++ ++ down(&dir->i_zombie); ++ error = may_create(dir, dentry); ++ if (error) ++ goto exit_lock; ++ ++ error = -EACCES; /* shouldn't it be ENOSYS? */ ++ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) ++ goto exit_lock; ++ ++ DQUOT_INIT(dir); ++ lock_kernel(); ++ if (dir->i_op->create_it) ++ error = dir->i_op->create_it(dir, dentry, mode, it); ++ else ++ error = dir->i_op->create(dir, dentry, mode); ++ unlock_kernel(); ++exit_lock: ++ up(&dir->i_zombie); ++ if (!error) ++ inode_dir_notify(dir, DN_CREATE); ++ return error; ++} ++ + int vfs_create(struct inode *dir, struct dentry *dentry, int mode) + { + int error; +@@ -987,7 +1141,7 @@ int vfs_create(struct inode *dir, struct + + DQUOT_INIT(dir); + lock_kernel(); +- error = dir->i_op->create(dir, dentry, mode); ++ error = dir->i_op->create(dir, dentry, mode); + unlock_kernel(); + exit_lock: + up(&dir->i_zombie); +@@ -1045,14 +1199,17 @@ int may_open(struct nameidata *nd, int a + return get_lease(inode, flag); + } + +-struct file *filp_open(const char * pathname, int open_flags, int mode) ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); ++ ++struct file *filp_open(const char * pathname, int open_flags, int mode) + { + int acc_mode, error = 0; +- struct inode *inode; + struct dentry *dentry; + struct dentry *dir; + int flag = open_flags; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags }; + int count = 0; + + if (!capable(CAP_SYS_ADMIN)) +@@ -1069,7 +1226,7 @@ struct file *filp_open(const char * path + * The simplest case - just a plain lookup. 
+ */ + if (!(flag & O_CREAT)) { +- error = path_lookup(pathname, lookup_flags(flag), &nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), &nd, &it); + if (error) + return ERR_PTR(error); + dentry = nd.dentry; +@@ -1079,6 +1236,8 @@ struct file *filp_open(const char * path + /* + * Create - we need to know the parent. + */ ++ it.it_mode = mode; ++ it.it_op |= IT_CREAT; + error = path_lookup(pathname, LOOKUP_PARENT, &nd); + if (error) + return ERR_PTR(error); +@@ -1094,7 +1253,7 @@ struct file *filp_open(const char * path + + dir = nd.dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + + do_last: + error = PTR_ERR(dentry); +@@ -1103,10 +1262,11 @@ do_last: + goto exit; + } + ++ it.it_mode = mode; + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { +- error = vfs_create(dir->d_inode, dentry, +- mode & ~current->fs->umask); ++ error = vfs_create_it(dir->d_inode, dentry, ++ mode & ~current->fs->umask, &it); + up(&dir->d_inode->i_sem); + dput(nd.dentry); + nd.dentry = dentry; +@@ -1132,12 +1292,12 @@ do_last: + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; +- while (__follow_down(&nd.mnt,&dentry) && d_mountpoint(dentry)); ++ while (__follow_down(&nd.mnt,&dentry, &it) && d_mountpoint(dentry)); + } + error = -ENOENT; + if (!dentry->d_inode) + goto exit_dput; +- if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) ++ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link)) + goto do_link; + + dput(nd.dentry); +@@ -1152,11 +1312,13 @@ ok: + if (!S_ISREG(nd.dentry->d_inode->i_mode)) + open_flags &= ~O_TRUNC; + +- return dentry_open(nd.dentry, nd.mnt, open_flags); ++ return dentry_open_it(nd.dentry, nd.mnt, open_flags, &it); + + exit_dput: ++ intent_release(&it); + dput(dentry); + exit: ++ intent_release(&it); + path_release(&nd); + return ERR_PTR(error); + +@@ -1175,10 +1337,19 @@ do_link: + * are done. 
Procfs-like symlinks just set LAST_BIND. + */ + UPDATE_ATIME(dentry->d_inode); +- error = dentry->d_inode->i_op->follow_link(dentry, &nd); ++ nd.it = &it; ++ error = dentry->d_inode->i_op->follow_link(dentry, &nd); ++ if (error) { ++ intent_release(&it); ++ } else if (!(it.it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(&it); ++ path_release(&nd); ++ error = -ENOLINK; ++ } + dput(dentry); + if (error) +- return error; ++ return ERR_PTR(error); + if (nd.last_type == LAST_BIND) { + dentry = nd.dentry; + goto ok; +@@ -1197,13 +1368,15 @@ do_link: + } + dir = nd.dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + putname(nd.last.name); + goto do_last; + } + ++ + /* SMP-safe */ +-static struct dentry *lookup_create(struct nameidata *nd, int is_dir) ++static struct dentry *lookup_create(struct nameidata *nd, int is_dir, ++ struct lookup_intent *it) + { + struct dentry *dentry; + +@@ -1211,7 +1384,7 @@ static struct dentry *lookup_create(stru + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) +@@ -1267,7 +1440,16 @@ asmlinkage long sys_mknod(const char * f + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + + mode &= ~current->fs->umask; +@@ -1288,6 +1470,7 @@ asmlinkage long sys_mknod(const char * f + dput(dentry); + } + 
up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1335,7 +1518,14 @@ asmlinkage long sys_mkdir(const char * p + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 1); ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_mkdir(nd.dentry->d_inode, dentry, +@@ -1343,6 +1533,7 @@ asmlinkage long sys_mkdir(const char * p + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1443,8 +1634,16 @@ asmlinkage long sys_rmdir(const char * p + error = -EBUSY; + goto exit1; + } ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); +@@ -1502,8 +1701,15 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? 
Because we want correct error value */ +@@ -1570,15 +1776,23 @@ asmlinkage long sys_symlink(const char * + error = path_lookup(to, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1654,7 +1868,14 @@ asmlinkage long sys_link(const char * ol + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +- new_dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); +@@ -1698,7 +1919,7 @@ exit: + * locking]. 
+ */ + int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + struct inode *target; +@@ -1777,7 +1998,7 @@ out_unlock: + } + + int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + +@@ -1865,9 +2086,18 @@ static inline int do_rename(const char * + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ unlock_kernel(); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + double_lock(new_dir, old_dir); + +- old_dentry = lookup_hash(&oldnd.last, old_dir); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; +@@ -1883,16 +2113,16 @@ static inline int do_rename(const char * + if (newnd.last.name[newnd.last.len]) + goto exit4; + } +- new_dentry = lookup_hash(&newnd.last, new_dir); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + ++ + lock_kernel(); + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); + unlock_kernel(); +- + dput(new_dentry); + exit4: + dput(old_dentry); +@@ -1943,12 +2173,19 @@ out: + } + + static inline int +-__vfs_follow_link(struct nameidata *nd, const char *link) ++__vfs_follow_link(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) + { + int res = 0; + char *name; + if (IS_ERR(link)) + goto fail; ++ if (it == NULL) ++ it = nd->it; ++ else if (it != nd->it) ++ printk("it != nd->it: tell phil@clusterfs.com\n"); ++ if (it != NULL) ++ it->it_int_flags |= IT_FL_FOLLOWED; + + if (*link == '/') 
{ + path_release(nd); +@@ -1956,7 +2193,7 @@ __vfs_follow_link(struct nameidata *nd, + /* weird __emul_prefix() stuff did it */ + goto out; + } +- res = link_path_walk(link, nd); ++ res = link_path_walk_it(link, nd, it); + out: + if (current->link_count || res || nd->last_type!=LAST_NORM) + return res; +@@ -1978,7 +2215,13 @@ fail: + + int vfs_follow_link(struct nameidata *nd, const char *link) + { +- return __vfs_follow_link(nd, link); ++ return __vfs_follow_link(nd, link, NULL); ++} ++ ++int vfs_follow_link_it(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) ++{ ++ return __vfs_follow_link(nd, link, it); + } + + /* get the link contents into pagecache */ +@@ -2020,7 +2263,7 @@ int page_follow_link(struct dentry *dent + { + struct page *page = NULL; + char *s = page_getlink(dentry, &page); +- int res = __vfs_follow_link(nd, s); ++ int res = __vfs_follow_link(nd, s, NULL); + if (page) { + kunmap(page); + page_cache_release(page); +--- linux-2.4.18-p4smp/fs/open.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/open.c 2003-07-08 14:45:17.000000000 -0600 +@@ -19,6 +19,8 @@ + #include + + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) ++extern int path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it); + + int vfs_statfs(struct super_block *sb, struct statfs *buf) + { +@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct + write_unlock(&files->file_lock); + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + struct inode *inode = dentry->d_inode; ++ struct inode_operations *op = dentry->d_inode->i_op; + int error; + struct iattr newattrs; + +@@ -108,7 +111,14 @@ int do_truncate(struct dentry *dentry, l + down(&inode->i_sem); + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; +- error = notify_change(dentry, &newattrs); ++ if 
(called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ newattrs.ia_ctime = CURRENT_TIME; ++ error = op->setattr_raw(inode, &newattrs); ++ } else ++ error = notify_change(dentry, &newattrs); + up(&inode->i_sem); + return error; + } +@@ -118,12 +128,13 @@ static inline long do_sys_truncate(const + struct nameidata nd; + struct inode * inode; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -163,11 +174,13 @@ static inline long do_sys_truncate(const + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ intent_release(&it); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -215,7 +228,7 @@ static inline long do_sys_ftruncate(unsi + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -260,11 +273,13 @@ asmlinkage long sys_utime(char * filenam + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -279,11 +294,29 @@ asmlinkage long sys_utime(char * filenam + goto dput_and_out; + + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = 
nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EROFS; ++ if (IS_RDONLY(inode)) ++ goto dput_and_out; ++ ++ error = -EPERM; ++ if (!times) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; + } ++ + error = notify_change(nd.dentry, &newattrs); + dput_and_out: + path_release(&nd); +@@ -304,12 +337,14 @@ asmlinkage long sys_utimes(char * filena + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -324,7 +359,20 @@ asmlinkage long sys_utimes(char * filena + newattrs.ia_atime = times[0].tv_sec; + newattrs.ia_mtime = times[1].tv_sec; + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!utimes) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; +@@ -347,6 +395,7 @@ asmlinkage long sys_access(const char * + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; +@@ -364,13 +413,14 @@ asmlinkage long sys_access(const char * + else + current->cap_effective = current->cap_permitted; + +- res = user_path_walk(filename, &nd); ++ res = user_path_walk_it(filename, &nd, &it); + if (!res) { + res = permission(nd.dentry->d_inode, mode); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ intent_release(&it); + path_release(&nd); + } + +@@ -385,8 +435,11 @@ asmlinkage long sys_chdir(const char * f + { + int error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd); ++ error = __user_walk_it(filename, ++ LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, ++ &nd, &it); + if (error) + goto out; + +@@ -397,6 +450,7 @@ asmlinkage long sys_chdir(const char * f + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -436,9 +490,10 @@ asmlinkage long sys_chroot(const char * + { + int error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | +- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); + if (error) + goto out; + +@@ -454,6 +509,7 @@ asmlinkage long sys_chroot(const char * + set_fs_altroot(); + error = 0; + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -508,6 +564,18 @@ asmlinkage long sys_chmod(const char * f + if (IS_RDONLY(inode)) + goto dput_and_out; + ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ 
error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto dput_and_out; +@@ -538,6 +606,20 @@ static int chown_common(struct dentry * + error = -EROFS; + if (IS_RDONLY(inode)) + goto out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; +@@ -628,7 +710,8 @@ extern ssize_t do_readahead(struct file + /* for files over a certains size it doesn't pay to do readahead on open */ + #define READAHEAD_CUTOFF 48000 + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -649,7 +732,7 @@ struct file *dentry_open(struct dentry * + error = locks_verify_locked(inode); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + if (error || !(f->f_mode & FMODE_WRITE)) + put_write_access(inode); +@@ -679,7 +762,9 @@ struct file *dentry_open(struct dentry * + } + + if (f->f_op && f->f_op->open) { ++ f->f_it = it; + error = f->f_op->open(inode,f); ++ f->f_it = NULL; + if (error) + goto cleanup_all; + } +@@ -693,6 +778,7 @@ struct file *dentry_open(struct dentry * + do_readahead(f, 0, (48 * 1024) >> PAGE_SHIFT); + + ++ intent_release(it); + return f; + + cleanup_all: +@@ -707,11 +793,17 @@ cleanup_all: + cleanup_file: + put_filp(f); + cleanup_dentry: ++ 
intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ return dentry_open_it(dentry, mnt, flags, NULL); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. + */ +--- linux-2.4.18-p4smp/fs/stat.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/stat.c 2003-07-08 14:45:17.000000000 -0600 +@@ -17,21 +17,24 @@ + * Revalidate the inode. This is required for proper NFS attribute caching. + */ + static __inline__ int +-do_revalidate(struct dentry *dentry) ++do_revalidate(struct dentry *dentry, struct lookup_intent *it) + { + struct inode * inode = dentry->d_inode; +- if (inode->i_op && inode->i_op->revalidate) ++ if (inode->i_op && inode->i_op->revalidate_it) ++ return inode->i_op->revalidate_it(dentry, it); ++ else if (inode->i_op && inode->i_op->revalidate) + return inode->i_op->revalidate(dentry); + return 0; + } + +-static int do_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++static int do_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat, ++ struct lookup_intent *it) + { + int res = 0; + unsigned int blocks, indirect; + struct inode *inode = dentry->d_inode; + +- res = do_revalidate(dentry); ++ res = do_revalidate(dentry, it); + if (res) + return res; + +@@ -104,10 +107,12 @@ int vfs_stat(char *name, struct kstat *s + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd, &it); + if (!error) { +- error = do_getattr(nd.mnt, nd.dentry, stat); ++ error = do_getattr(nd.mnt, nd.dentry, stat, &it); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -117,10 +122,12 @@ int vfs_lstat(char *name, struct kstat * + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error 
= user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd, &it); + if (!error) { +- error = do_getattr(nd.mnt, nd.dentry, stat); ++ error = do_getattr(nd.mnt, nd.dentry, stat, &it); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -132,7 +139,7 @@ int vfs_fstat(unsigned int fd, struct ks + int error = -EBADF; + + if (f) { +- error = do_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ error = do_getattr(f->f_vfsmnt, f->f_dentry, stat, NULL); + fput(f); + } + return error; +@@ -279,7 +286,7 @@ asmlinkage long sys_readlink(const char + + error = -EINVAL; + if (inode->i_op && inode->i_op->readlink && +- !(error = do_revalidate(nd.dentry))) { ++ !(error = do_revalidate(nd.dentry, NULL))) { + UPDATE_ATIME(inode); + error = inode->i_op->readlink(nd.dentry, buf, bufsiz); + } +--- linux-2.4.18-p4smp/fs/proc/base.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:45:12.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/fs/proc/base.c 2003-07-08 14:45:32.000000000 -0600 +@@ -465,6 +465,9 @@ static int proc_pid_follow_link(struct d + + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; ++ ++ if (nd->it != NULL) ++ nd->it->it_int_flags |= IT_FL_FOLLOWED; + out: + return error; + } +--- linux-2.4.18-p4smp/include/linux/dcache.h~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:38.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/include/linux/dcache.h 2003-07-08 14:45:17.000000000 -0600 +@@ -6,6 +6,44 @@ + #include + #include + ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_GETXATTR (1<<6) ++#define IT_EXEC (1<<7) ++#define IT_PIN (1<<8) ++ ++#define IT_FL_LOCKED (1) ++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */ ++ ++#define INTENT_MAGIC 0x19620323 ++ ++struct lookup_intent { ++ int it_op; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_magic; ++ int it_mode; ++ int 
it_flags; ++ int it_disposition; ++ int it_status; ++ int it_int_flags; ++ __u64 it_lock_handle[2]; ++ int it_lock_mode; ++ void *it_data; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op, int flags) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++ it->it_flags = flags; ++} ++ ++ + /* + * linux/include/linux/dcache.h + * +@@ -91,8 +129,22 @@ struct dentry_operations { + int (*d_delete)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); ++ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); ++ void (*d_pin)(struct dentry *, struct vfsmount * , int); ++ void (*d_unpin)(struct dentry *, struct vfsmount *, int); + }; + ++#define PIN(de,mnt,flag) if (de->d_op && de->d_op->d_pin) \ ++ de->d_op->d_pin(de, mnt, flag); ++#define UNPIN(de,mnt,flag) if (de->d_op && de->d_op->d_unpin) \ ++ de->d_op->d_unpin(de, mnt, flag); ++ ++ ++/* defined in fs/namei.c */ ++extern void intent_release(struct lookup_intent *it); ++/* defined in fs/dcache.c */ ++extern void __d_rehash(struct dentry * entry, int lock); ++ + /* the dentry parameter passed to d_hash and d_compare is the parent + * directory of the entries to be compared. It is used in case these + * functions need any directory specific information for determining +@@ -124,6 +176,7 @@ d_iput: no no yes + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +--- linux-2.4.18-p4smp/include/linux/fs.h~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:47.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/include/linux/fs.h 2003-07-08 14:45:17.000000000 -0600 +@@ -339,6 +339,8 @@ extern void set_bh_page(struct buffer_he + #define ATTR_MTIME_SET 256 + #define ATTR_FORCE 512 /* Not a change, but a change it */ + #define ATTR_ATTR_FLAG 1024 ++#define ATTR_RAW 2048 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 4096 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -578,6 +580,7 @@ struct file { + + /* needed for tty driver, and maybe others */ + void *private_data; ++ struct lookup_intent *f_it; + + /* preallocated helper kiobuf to speedup O_DIRECT */ + struct kiobuf *f_iobuf; +@@ -707,6 +710,7 @@ struct nameidata { + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *it; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +@@ -840,7 +844,8 @@ extern int vfs_symlink(struct inode *, s + extern int vfs_link(struct dentry *, struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); +-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); ++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry); + + /* + * File types +@@ -900,21 +905,34 @@ struct file_operations { + + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); ++ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); + int (*link) 
(struct dentry *,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod_raw) (struct nameidata *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); ++ int (*follow_link2) (struct dentry *, struct nameidata *, ++ struct lookup_intent *it); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*revalidate) (struct dentry *); ++ int (*revalidate_it) (struct dentry *, struct lookup_intent *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); + }; + +@@ -1119,10 +1137,12 @@ static inline int get_lease(struct inode + + asmlinkage long sys_open(const char *, int, int); + asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const 
char *); + +@@ -1388,9 +1408,12 @@ typedef int (*read_actor_t)(read_descrip + extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + + extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); + extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(path_lookup_it(const char *path, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); + extern void path_release(struct nameidata *); + extern int follow_down(struct vfsmount **, struct dentry **); +@@ -1399,6 +1422,8 @@ extern struct dentry * lookup_one_len(co + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) ++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) ++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) + + extern void inode_init_once(struct inode *); + extern void iput(struct inode *); +@@ -1499,6 +1524,8 @@ extern struct file_operations generic_ro + + extern int vfs_readlink(struct dentry *, char *, int, const char *); + extern int vfs_follow_link(struct nameidata *, const char *); ++extern int vfs_follow_link_it(struct nameidata *, const char *, ++ struct lookup_intent *it); + extern int page_readlink(struct dentry *, char *, int); + extern int page_follow_link(struct dentry *, struct nameidata *); + extern struct inode_operations page_symlink_inode_operations; +--- 
linux-2.4.18-p4smp/kernel/fork.c~vfs_intent-2.4.18-18-chaos65 2003-03-24 11:22:37.000000000 -0700 ++++ linux-2.4.18-p4smp-braam/kernel/fork.c 2003-07-08 14:45:17.000000000 -0600 +@@ -399,10 +399,13 @@ static inline struct fs_struct *__copy_f + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); ++ PIN(old->pwd, old->pwdmnt, 0); ++ PIN(old->root, old->rootmnt, 1); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { ++ PIN(old->altroot, old->altrootmnt, 1); + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { +--- linux-2.4.18-p4smp/kernel/exit.c~vfs_intent-2.4.18-18-chaos65 2002-10-29 12:27:38.000000000 -0700 ++++ linux-2.4.18-p4smp-braam/kernel/exit.c 2003-07-08 14:45:17.000000000 -0600 +@@ -303,11 +303,14 @@ static inline void __put_fs_struct(struc + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } +--- linux-2.4.18-p4smp/kernel/ksyms.c~vfs_intent-2.4.18-18-chaos65 2003-07-08 14:41:49.000000000 -0600 ++++ linux-2.4.18-p4smp-braam/kernel/ksyms.c 2003-07-08 14:45:17.000000000 -0600 +@@ -294,6 +294,7 @@ EXPORT_SYMBOL(read_cache_page); + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); + +_ diff --git a/lustre/kernel_patches/pc/vfs_intent-2.4.18-18-chaos65.pc b/lustre/kernel_patches/pc/vfs_intent-2.4.18-18-chaos65.pc new file mode 100644 index 0000000..adb8100 --- /dev/null +++ 
b/lustre/kernel_patches/pc/vfs_intent-2.4.18-18-chaos65.pc @@ -0,0 +1,12 @@ +fs/exec.c +fs/dcache.c +fs/namespace.c +fs/namei.c +fs/open.c +fs/stat.c +fs/proc/base.c +include/linux/dcache.h +include/linux/fs.h +kernel/fork.c +kernel/exit.c +kernel/ksyms.c diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index d6029f3..1963e1f 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -47,7 +47,6 @@ int client_import_connect(struct lustre_handle *dlm_handle, char *tmp[] = {imp->imp_target_uuid.uuid, obd->obd_uuid.uuid, (char *)dlm_handle}; - int rq_opc = (obd->obd_type->typ_ops->o_brw) ? OST_CONNECT :MDS_CONNECT; int msg_flags; ENTRY; @@ -67,7 +66,7 @@ int client_import_connect(struct lustre_handle *dlm_handle, if (obd->obd_namespace == NULL) GOTO(out_disco, rc = -ENOMEM); - request = ptlrpc_prep_req(imp, rq_opc, 3, size, tmp); + request = ptlrpc_prep_req(imp, imp->imp_connect_op, 3, size, tmp); if (!request) GOTO(out_ldlm, rc = -ENOMEM); @@ -90,7 +89,7 @@ int client_import_connect(struct lustre_handle *dlm_handle, class_export_put(exp); msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); - if (rq_opc == MDS_CONNECT || msg_flags & MSG_CONNECT_REPLAYABLE) { + if (msg_flags & MSG_CONNECT_REPLAYABLE) { imp->imp_replayable = 1; CDEBUG(D_HA, "connected to replayable target: %s\n", imp->imp_target_uuid.uuid); @@ -132,7 +131,16 @@ int client_import_disconnect(struct lustre_handle *dlm_handle, int failover) RETURN(-EINVAL); } - rq_opc = obd->obd_type->typ_ops->o_brw ? 
OST_DISCONNECT:MDS_DISCONNECT; + switch (imp->imp_connect_op) { + case OST_CONNECT: rq_opc = OST_DISCONNECT; break; + case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break; + case MGMT_CONNECT:rq_opc = MGMT_DISCONNECT;break; + default: + CERROR("don't know how to disconnect from %s (connect_op %d)\n", + imp->imp_target_uuid.uuid, imp->imp_connect_op); + RETURN(-EINVAL); + } + down(&cli->cl_sem); if (!cli->cl_conn_count) { CERROR("disconnecting disconnected device (%s)\n", diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h new file mode 100644 index 0000000..cad14ee --- /dev/null +++ b/lustre/lov/lov_internal.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2003 Cluster File Systems, Inc. + * + * This code is issued under the GNU General Public License. + * See the file COPYING in this distribution + */ + +int lov_get_stripecnt(struct lov_obd *lov, int stripe_count); +int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count); +void lov_free_memmd(struct lov_stripe_md **lsmp); diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index a72e176..640614e 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -57,6 +57,7 @@ void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) fid->f_type = (S_IFMT & inode->i_mode); } +/* Note that we can copy all of the fields, just some will not be "valid" */ void mds_pack_inode2body(struct mds_body *b, struct inode *inode) { b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | diff --git a/lustre/mgmt/.cvsignore b/lustre/mgmt/.cvsignore new file mode 100644 index 0000000..067f05c --- /dev/null +++ b/lustre/mgmt/.cvsignore @@ -0,0 +1,9 @@ +.Xrefs +config.log +config.status +configure +Makefile +Makefile.in +.deps +tags +TAGS diff --git a/lustre/mgmt/mgmt_cli.c b/lustre/mgmt/mgmt_cli.c new file mode 100644 index 0000000..fba49ab --- /dev/null +++ b/lustre/mgmt/mgmt_cli.c @@ -0,0 +1,269 @@ +/* 
-*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Implementation of the management/health monitoring client. + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * Author: Mike Shaver + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_MGMT +#include +#include + +#include +#include +#include +#include +#include +#include + +/*** Registration and service/thread management. ***/ + +/* An entry representing one obd which has registered for management events. 
*/ +struct mgmtcli_registrant { + struct list_head chain; + struct obd_device *notify_obd; + struct obd_uuid *relevant_uuid; +}; + +static int mgmtcli_pinger_main(void *arg) +{ + struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg; + struct ptlrpc_thread *thread = data->thread; + unsigned long flags; + struct l_wait_info lwi = { 0 }; + ENTRY; + + lock_kernel(); + /* vv ptlrpc_daemonize(); vv */ + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + exit_files(current); + reparent_to_init(); + /* ^^ ptlrpc_daemonize(); ^^ */ + + SIGNAL_MASK_LOCK(current, flags); + sigfillset(&current->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) + sprintf(current->comm, "%s|%d", data->name,current->thread.extern_pid); +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + sprintf(current->comm, "%s|%d", data->name, + current->thread.mode.tt.extern_pid); +#else + strcpy(current->comm, data->name); +#endif + unlock_kernel(); + + /* Record that the thread is running */ + thread->t_flags = SVC_RUNNING; + wake_up(&thread->t_ctl_waitq); + + /* And now, loop forever, pinging as needed. */ + l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPING, &lwi); + + thread->t_flags = SVC_STOPPED; + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_NET, "pinger thread exiting"); + return 0; +} + +static int mgmtcli_connect_to_svc(struct obd_device *obd) +{ + int rc; + struct mgmtcli_obd *mc = &obd->u.mgmtcli; + struct ptlrpc_svc_data svc_data; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + ENTRY; + + /* Connect to ourselves, and thusly to the mgmt service. 
*/ + rc = client_import_connect(&mc->mc_ping_handle, obd, &obd->obd_uuid); + if (rc) { + CERROR("failed to connect to mgmt svc: %d\n", rc); + (void)client_obd_cleanup(obd, 0); + RETURN(rc); + } + + LASSERT(mc->mc_ping_thread == NULL); + OBD_ALLOC(thread, sizeof (*thread)); + if (thread == NULL) + RETURN(-ENOMEM); + mc->mc_ping_thread = thread; + init_waitqueue_head(&thread->t_ctl_waitq); + + svc_data.name = "mgmtcli"; + svc_data.thread = thread; + + rc = kernel_thread(mgmtcli_pinger_main, &svc_data, CLONE_VM | CLONE_FILES); + if (rc < 0) { + CERROR("can't start thread to ping mgmt svc %s: %d\n", + mc->mc_import->imp_target_uuid.uuid, rc); + OBD_FREE(mc->mc_ping_thread, sizeof (*mc->mc_ping_thread)); + (void)client_import_disconnect(&mc->mc_ping_handle, 0); + RETURN(rc); + } + l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi); + + RETURN(0); +} + +static int mgmtcli_disconnect_from_svc(struct obd_device *obd) +{ + struct mgmtcli_obd *mc = &obd->u.mgmtcli; + struct obd_import *imp = mc->mc_import; + struct ptlrpc_thread *thread = mc->mc_ping_thread; + struct l_wait_info lwi = { 0 }; + int rc; + + ENTRY; + rc = client_import_disconnect(&mc->mc_ping_handle, 0); + if (rc) { + CERROR("can't disconnect from %s: %d (%s)\n", + imp->imp_target_uuid.uuid, rc, + (thread ? 
+ "stopping pinger thread anyway" : + "pinger thread already stopped")); + } + + if (thread) { + thread->t_flags = SVC_STOPPING; + wake_up(&thread->t_ctl_waitq); + l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED, &lwi); + + OBD_FREE(mc->mc_ping_thread, sizeof (*mc->mc_ping_thread)); + } + + RETURN(rc); +} + +static int mgmtcli_register_for_events(struct obd_device *mgmt_obd, + struct obd_device *notify_obd, + struct obd_uuid *relevant_uuid) +{ + int start_thread; + struct mgmtcli_registrant *reg; + struct mgmtcli_obd *mcobd = &mgmt_obd->u.mgmtcli; + + ENTRY; + if (strcmp(mgmt_obd->obd_type->typ_name, LUSTRE_MGMTCLI_NAME)) + RETURN(-EINVAL); + + OBD_ALLOC(reg, sizeof(*reg)); + if (reg == NULL) + RETURN(-ENOMEM); + + reg->notify_obd = notify_obd; + reg->relevant_uuid = relevant_uuid; /* XXX hash */ + + spin_lock(&mgmt_obd->obd_dev_lock); + start_thread = list_empty(&mcobd->mc_registered); + list_add(&mcobd->mc_registered, &reg->chain); + spin_unlock(&mgmt_obd->obd_dev_lock); + + if (start_thread) + RETURN(mgmtcli_connect_to_svc(mgmt_obd)); + + RETURN(0); +} + +static int mgmtcli_deregister_for_events(struct obd_device *mgmt_obd, + struct obd_device *notify_obd) +{ + int stop_thread, found = 0; + struct mgmtcli_registrant *reg = NULL; + struct list_head *tmp, *n; + struct mgmtcli_obd *mc = &mgmt_obd->u.mgmtcli; + + ENTRY; + if (strcmp(mgmt_obd->obd_type->typ_name, LUSTRE_MGMTCLI_NAME)) + RETURN(-EINVAL); + + spin_lock(&mgmt_obd->obd_dev_lock); + list_for_each_safe(tmp, n, &mc->mc_registered) { + reg = list_entry(tmp, struct mgmtcli_registrant, chain); + if (reg->notify_obd == notify_obd) { + list_del(&reg->chain); + found = 1; + OBD_FREE(reg, sizeof(*reg)); + break; + } + } + stop_thread = list_empty(&mc->mc_registered); + spin_unlock(&mgmt_obd->obd_dev_lock); + + if (stop_thread) { + LASSERT(found); + RETURN(mgmtcli_disconnect_from_svc(mgmt_obd)); + } + + if (!found) + RETURN(-ENOENT); + RETURN(0); +} + +/*** OBD scaffolding and module paraphernalia. 
***/ + +static int mgmtcli_setup(struct obd_device *obd, obd_count len, void *buf) +{ + struct mgmtcli_obd *mc = &obd->u.mgmtcli; + INIT_LIST_HEAD(&mc->mc_registered); + + /* Initialize our nested client_obd structure. */ + RETURN(client_obd_setup(obd, len, buf)); +} + +static struct obd_ops mgmtcli_obd_ops = { + o_owner: THIS_MODULE, + o_setup: mgmtcli_setup, + o_cleanup: client_obd_cleanup +}; + +static int __init mgmtcli_init(void) +{ + inter_module_register("mgmtcli_register_for_events", THIS_MODULE, + mgmtcli_register_for_events); + inter_module_register("mgmtcli_deregister_for_events", THIS_MODULE, + mgmtcli_deregister_for_events); + return class_register_type(&mgmtcli_obd_ops, 0, LUSTRE_MGMTCLI_NAME); +} + +static void __exit mgmtcli_exit(void) +{ + class_unregister_type(LUSTRE_MGMTCLI_NAME); + inter_module_unregister("mgmtcli_register_for_events"); + inter_module_unregister("mgmtcli_deregister_for_events"); +} + +#ifdef __KERNEL__ +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Lustre monitoring client v0.1"); +MODULE_LICENSE("GPL"); + +module_init(mgmtcli_init); +module_exit(mgmtcli_exit); +#endif diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h new file mode 100644 index 0000000..94b5321 --- /dev/null +++ b/lustre/obdfilter/filter_internal.h @@ -0,0 +1,122 @@ +#ifndef _FILTER_INTERNAL_H +#define _FILTER_INTERNAL_H + + +#ifdef __KERNEL__ +# include +#endif +#include +#include + +#ifndef OBD_FILTER_DEVICENAME +# define OBD_FILTER_DEVICENAME "obdfilter" +#endif + +#ifndef OBD_FILTER_SAN_DEVICENAME +# define OBD_FILTER_SAN_DEVICENAME "sanobdfilter" +#endif + +#define FILTER_LR_SERVER_SIZE 512 + +#define FILTER_LR_CLIENT_START 8192 +#define FILTER_LR_CLIENT_SIZE 128 + +#define FILTER_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +#define FILTER_MOUNT_RECOV 2 +#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ + +/* Data stored per server at the head of the last_rcvd file. 
In le32 order. */ +struct filter_server_data { + __u8 fsd_uuid[37]; /* server UUID */ + __u8 fsd_uuid_padding[3]; /* unused */ + __u64 fsd_last_objid; /* last created object ID */ + __u64 fsd_last_transno; /* last completed transaction ID */ + __u64 fsd_mount_count; /* FILTER incarnation number */ + __u32 fsd_feature_compat; /* compatible feature flags */ + __u32 fsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 fsd_feature_incompat;/* incompatible feature flags */ + __u32 fsd_server_size; /* size of server data area */ + __u32 fsd_client_start; /* start of per-client data area */ + __u16 fsd_client_size; /* size of per-client data area */ + __u16 fsd_subdir_count; /* number of subdirectories for objects */ + __u64 fsd_catalog_oid; /* recovery catalog object id */ + __u32 fsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 fsd_peeruuid[37]; /* UUID of MDS associated with this OST */ + __u8 peer_padding[3]; /* unused */ + __u8 fsd_padding[FILTER_LR_SERVER_SIZE - 140]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. 
*/ +struct filter_client_data { + __u8 fcd_uuid[37]; /* client UUID */ + __u8 fcd_uuid_padding[3]; /* unused */ + __u64 fcd_last_rcvd; /* last completed transaction ID */ + __u64 fcd_mount_count; /* FILTER incarnation number */ + __u64 fcd_last_xid; /* client RPC xid for the last transaction */ + __u8 fcd_padding[FILTER_LR_CLIENT_SIZE - 64]; +}; + +/* file data for open files on OST */ +struct filter_file_data { + struct portals_handle ffd_handle; + atomic_t ffd_refcount; + struct list_head ffd_export_list; /* export open list - fed_lock */ + struct file *ffd_file; /* file handle */ +}; + +struct filter_dentry_data { + struct llog_cookie fdd_cookie; + obd_id fdd_objid; + __u32 fdd_magic; + atomic_t fdd_open_count; + int fdd_flags; +}; + +#define FILTER_DENTRY_MAGIC 0x9efba101 +#define FILTER_FLAG_DESTROY 0x0001 /* destroy dentry on last file close */ + +enum { + LPROC_FILTER_READ_BYTES = 0, + LPROC_FILTER_WRITE_BYTES = 1, + LPROC_FILTER_LAST, +}; + +/* filter.c */ +struct dentry *filter_parent(struct obd_device *, obd_mode mode, obd_id objid); +struct dentry *filter_parent_lock(struct obd_device *, obd_mode mode, + obd_id objid, ldlm_mode_t lock_mode, + struct lustre_handle *lockh); +void f_dput(struct dentry *); +struct dentry *filter_fid2dentry(struct obd_device *, struct dentry *dir, + obd_mode mode, obd_id id); +int filter_finish_transno(struct obd_export *, struct obd_trans_info *, int rc); +__u64 filter_next_id(struct filter_obd *); +int filter_update_server_data(struct file *, struct filter_server_data *); +int filter_common_setup(struct obd_device *, obd_count len, void *buf, + char *option); + +/* filter_io.c */ +int filter_preprw(int cmd, struct obd_export *, struct obdo *, int objcount, + struct obd_ioobj *, int niocount, struct niobuf_remote *, + struct niobuf_local *, struct obd_trans_info *); +int filter_commitrw(int cmd, struct obd_export *, int objcount, + struct obd_ioobj *, int niocount, struct niobuf_local *, + struct obd_trans_info *); +int 
filter_brw(int cmd, struct lustre_handle *, struct lov_stripe_md *, + obd_count oa_bufs, struct brw_page *, struct obd_trans_info *); + +/* filter_log.c */ +int filter_log_cancel(struct lustre_handle *, struct lov_stripe_md *, + int num_cookies, struct llog_cookie *, int flags); +int filter_log_op_create(struct llog_handle *cathandle, struct ll_fid *mds_fid, + obd_id oid, obd_count ogen, struct llog_cookie *); +int filter_log_op_orphan(struct llog_handle *cathandle, obd_id oid, + obd_count ogen, struct llog_cookie *); + +/* filter_san.c */ +int filter_san_setup(struct obd_device *obd, obd_count len, void *buf); +int filter_san_preprw(int cmd, struct lustre_handle *, int objcount, + struct obd_ioobj *, int niocount, struct niobuf_remote *); + +#endif diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c new file mode 100644 index 0000000..ee65d89 --- /dev/null +++ b/lustre/obdfilter/filter_io.c @@ -0,0 +1,764 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * linux/fs/obdfilter/filter_io.c + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger + * Author: Phil Schwan + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include // XXX kill me soon +#include + +#include +#include +#include "filter_internal.h" + +static int filter_start_page_read(struct inode *inode, struct niobuf_local *lnb) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index = lnb->offset >> PAGE_SHIFT; + int rc; + + page = grab_cache_page(mapping, index); /* locked page */ + if (IS_ERR(page)) + return lnb->rc = PTR_ERR(page); + + lnb->page = page; + + if (inode->i_size < lnb->offset + lnb->len - 1) + lnb->rc = inode->i_size - lnb->offset; + else + lnb->rc = lnb->len; + + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + + rc = mapping->a_ops->readpage(NULL, page); + if (rc < 0) { + CERROR("page index %lu, rc = %d\n", index, rc); + lnb->page = NULL; + page_cache_release(page); + return lnb->rc = rc; + } + + return 0; +} + +static int filter_finish_page_read(struct niobuf_local *lnb) +{ + if (lnb->page == NULL) + return 0; + + if (PageUptodate(lnb->page)) + return 0; + + wait_on_page(lnb->page); + if (!PageUptodate(lnb->page)) { + CERROR("page index %lu/offset "LPX64" not uptodate\n", + lnb->page->index, lnb->offset); + GOTO(err_page, lnb->rc = -EIO); + } + if (PageError(lnb->page)) { + CERROR("page index %lu/offset "LPX64" has error\n", + lnb->page->index, lnb->offset); + GOTO(err_page, lnb->rc = -EIO); + } + + return 0; + +err_page: + page_cache_release(lnb->page); + lnb->page = NULL; + return lnb->rc; +} + +static struct page *lustre_get_page_write(struct inode *inode, + unsigned long index) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + int rc; + + page = grab_cache_page(mapping, index); /* locked page */ + + if (!IS_ERR(page)) { + /* Note: Called with "O" and "PAGE_SIZE" this is essentially + * a no-op for most filesystems, because we write the whole + * page. For partial-page I/O this will read in the page. 
+ */ + rc = mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE); + if (rc) { + CERROR("page index %lu, rc = %d\n", index, rc); + if (rc != -ENOSPC) + LBUG(); + GOTO(err_unlock, rc); + } + /* XXX not sure if we need this if we are overwriting page */ + if (PageError(page)) { + CERROR("error on page index %lu, rc = %d\n", index, rc); + LBUG(); + GOTO(err_unlock, rc = -EIO); + } + } + return page; + +err_unlock: + unlock_page(page); + page_cache_release(page); + return ERR_PTR(rc); +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +/* We should only change the file mtime (and not the ctime, like + * update_inode_times() in generic_file_write()) when we only change data. */ +static inline void inode_update_time(struct inode *inode, int ctime_too) +{ + time_t now = CURRENT_TIME; + if (inode->i_mtime == now && (!ctime_too || inode->i_ctime == now)) + return; + inode->i_mtime = now; + if (ctime_too) + inode->i_ctime = now; + mark_inode_dirty_sync(inode); +} +#endif + +static int lustre_commit_write(struct niobuf_local *lnb) +{ + struct page *page = lnb->page; + unsigned from = lnb->offset & ~PAGE_MASK; + unsigned to = from + lnb->len; + struct inode *inode = page->mapping->host; + int err; + + LASSERT(to <= PAGE_SIZE); + err = page->mapping->a_ops->commit_write(NULL, page, from, to); + if (!err && IS_SYNC(inode)) +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + wait_on_page_locked(page); +#else + err = waitfor_one_page(page); +#endif + //SetPageUptodate(page); // the client commit_write will do this + + SetPageReferenced(page); + unlock_page(page); + page_cache_release(page); + return err; +} + +int filter_get_page_write(struct inode *inode, struct niobuf_local *lnb, + int *pglocked) +{ + unsigned long index = lnb->offset >> PAGE_SHIFT; + struct address_space *mapping = inode->i_mapping; + struct page *page; + int rc; + + //ASSERT_PAGE_INDEX(index, GOTO(err, rc = -EINVAL)); + if (*pglocked) + page = grab_cache_page_nowait(mapping, index); /* locked page */ + 
else + page = grab_cache_page(mapping, index); /* locked page */ + + + /* This page is currently locked, so get a temporary page instead. */ + if (page == NULL) { + CDEBUG(D_ERROR,"ino %lu page %ld locked\n", inode->i_ino,index); + page = alloc_pages(GFP_KERNEL, 0); /* locked page */ + if (page == NULL) { + CERROR("no memory for a temp page\n"); + GOTO(err, rc = -ENOMEM); + } + page->index = index; + lnb->page = page; + lnb->flags |= N_LOCAL_TEMP_PAGE; + } else if (!IS_ERR(page)) { + (*pglocked)++; + + rc = mapping->a_ops->prepare_write(NULL, page, + lnb->offset & ~PAGE_MASK, + lnb->len); + if (rc) { + if (rc != -ENOSPC) + CERROR("page index %lu, rc = %d\n", index, rc); + GOTO(err_unlock, rc); + } + /* XXX not sure if we need this if we are overwriting page */ + if (PageError(page)) { + CERROR("error on page index %lu, rc = %d\n", index, rc); + LBUG(); + GOTO(err_unlock, rc = -EIO); + } + lnb->page = page; + } + + return 0; + +err_unlock: + unlock_page(page); + page_cache_release(page); +err: + return lnb->rc = rc; +} + +static int filter_preprw_read(struct obd_export *exp, struct obdo *obdo, + int objcount, struct obd_ioobj *obj, + int niocount, struct niobuf_remote *nb, + struct niobuf_local *res, + struct obd_trans_info *oti) +{ + struct obd_run_ctxt saved; + struct obd_device *obd; + struct obd_ioobj *o; + struct niobuf_remote *rnb; + struct niobuf_local *lnb; + struct fsfilt_objinfo *fso; + struct dentry *dentry; + struct inode *inode; + int rc = 0, i, j, tot_bytes = 0, cleanup_phase = 0; + unsigned long now = jiffies; + ENTRY; + LASSERT(objcount == 1); + + obd = exp->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + OBD_ALLOC(fso, objcount * sizeof(*fso)); + if (fso == NULL) + RETURN(-ENOMEM); + + memset(res, 0, niocount * sizeof(*res)); + + push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + for (i = 0, o = obj; i < objcount; i++, o++) { + struct filter_dentry_data *fdd; + LASSERT(o->ioo_bufcnt); + + dentry = filter_fid2dentry(obd, NULL, o->ioo_type, 
o->ioo_id); + if (IS_ERR(dentry)) + GOTO(out_objinfo, rc = PTR_ERR(dentry)); + + if (dentry->d_inode == NULL) { + CERROR("trying to BRW to non-existent file "LPU64"\n", + o->ioo_id); + f_dput(dentry); + GOTO(out_objinfo, rc = -ENOENT); + } + + fso[i].fso_dentry = dentry; + fso[i].fso_bufcnt = o->ioo_bufcnt; + + fdd = dentry->d_fsdata; + if (fdd == NULL || !atomic_read(&fdd->fdd_open_count)) + CDEBUG(D_PAGE, "I/O to unopened object "LPU64"\n", + o->ioo_id); + } + + if (time_after(jiffies, now + 15 * HZ)) + CERROR("slow prep setup %lus\n", (jiffies - now) / HZ); + + for (i = 0, o = obj, rnb = nb, lnb = res; i < objcount; i++, o++) { + dentry = fso[i].fso_dentry; + inode = dentry->d_inode; + + for (j = 0; j < o->ioo_bufcnt; j++, rnb++, lnb++) { + if (j == 0) + lnb->dentry = dentry; + else + lnb->dentry = dget(dentry); + + lnb->offset = rnb->offset; + lnb->len = rnb->len; + lnb->flags = rnb->flags; + lnb->start = jiffies; + + if (inode->i_size <= rnb->offset) { + /* If there's no more data, abort early. + * lnb->page == NULL and lnb->rc == 0, so it's + * easy to detect later. */ + f_dput(dentry); + lnb->dentry = NULL; + break; + } else { + rc = filter_start_page_read(inode, lnb); + } + + if (rc) { + CDEBUG(rc == -ENOSPC ? 
D_INODE : D_ERROR, + "page err %u@"LPU64" %u/%u %p: rc %d\n", + lnb->len, lnb->offset, j, o->ioo_bufcnt, + dentry, rc); + f_dput(dentry); + GOTO(out_pages, rc); + } + + tot_bytes += lnb->rc; + if (lnb->rc < lnb->len) + break; /* short read */ + } + } + + if (time_after(jiffies, now + 15 * HZ)) + CERROR("slow prep get page %lus\n", (jiffies - now) / HZ); + + lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_READ_BYTES, tot_bytes); + while (lnb-- > res) { + rc = filter_finish_page_read(lnb); + if (rc) { + CERROR("error page %u@"LPU64" %u %p: rc %d\n", lnb->len, + lnb->offset, (int)(lnb - res), lnb->dentry, rc); + f_dput(lnb->dentry); + GOTO(out_pages, rc); + } + } + + if (time_after(jiffies, now + 15 * HZ)) + CERROR("slow prep finish page %lus\n", (jiffies - now) / HZ); + + EXIT; +out: + OBD_FREE(fso, objcount * sizeof(*fso)); + /* we saved the journal handle into oti->oti_handle instead */ + current->journal_info = NULL; + pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + return rc; + +out_pages: + while (lnb-- > res) { + page_cache_release(lnb->page); + f_dput(lnb->dentry); + } + goto out; /* dropped the dentry refs already (one per page) */ + +out_objinfo: + for (i = 0; i < objcount && fso[i].fso_dentry; i++) + f_dput(fso[i].fso_dentry); + goto out; +} + +/* We need to balance prepare_write() calls with commit_write() calls. + * If the page has been prepared, but we have no data for it, we don't + * want to overwrite valid data on disk, but we still need to zero out + * data for space which was newly allocated. Like part of what happens + * in __block_prepare_write() for newly allocated blocks. + * + * XXX currently __block_prepare_write() creates buffers for all the + * pages, and the filesystems mark these buffers as BH_New if they + * were newly allocated from disk. We use the BH_New flag similarly. 
*/ +static int filter_commit_write(struct niobuf_local *lnb, int err) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + if (err) { + unsigned block_start, block_end; + struct buffer_head *bh, *head = lnb->page->buffers; + unsigned blocksize = head->b_size; + + /* debugging: just seeing if this ever happens */ + CDEBUG(err == -ENOSPC ? D_INODE : D_ERROR, + "called for ino %lu:%lu on err %d\n", + lnb->page->mapping->host->i_ino, lnb->page->index, err); + + /* Currently one buffer per page, but in the future... */ + for (bh = head, block_start = 0; bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (buffer_new(bh)) { + memset(kmap(lnb->page) + block_start, 0, + blocksize); + kunmap(lnb->page); + } + } + } +#endif + return lustre_commit_write(lnb); +} + +/* If we ever start to support multi-object BRW RPCs, we will need to get locks + * on multiple inodes. That isn't all, because there still exists the + * possibility of a truncate starting a new transaction while holding the ext3 + * rwsem = write while some writes (which have started their transactions here) + * blocking on the ext3 rwsem = read => lock inversion. + * + * The handling gets very ugly when dealing with locked pages. It may be easier + * to just get rid of the locked page code (which has problems of its own) and + * either discover we do not need it anymore (i.e. it was a symptom of another + * bug) or ensure we get the page locks in an appropriate order. 
*/ +static int filter_preprw_write(struct obd_export *exp, struct obdo *obdo, + int objcount, struct obd_ioobj *obj, + int niocount, struct niobuf_remote *nb, + struct niobuf_local *res, + struct obd_trans_info *oti) +{ + struct obd_run_ctxt saved; + struct obd_device *obd; + struct obd_ioobj *o; + struct niobuf_remote *rnb; + struct niobuf_local *lnb; + struct fsfilt_objinfo *fso; + struct dentry *dentry; + int pglocked = 0, rc = 0, i, j, tot_bytes = 0, cleanup_phase = 0; + unsigned long now = jiffies; + ENTRY; + LASSERT(objcount == 1); + + obd = exp->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + OBD_ALLOC(fso, objcount * sizeof(*fso)); + if (fso == NULL) + RETURN(-ENOMEM); + + memset(res, 0, niocount * sizeof(*res)); + + push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + for (i = 0, o = obj; i < objcount; i++, o++) { + struct filter_dentry_data *fdd; + LASSERT(o->ioo_bufcnt); + + dentry = filter_fid2dentry(obd, NULL, o->ioo_type, o->ioo_id); + if (IS_ERR(dentry)) + GOTO(out_objinfo, rc = PTR_ERR(dentry)); + + if (dentry->d_inode == NULL) { + CERROR("trying to BRW to non-existent file "LPU64"\n", + o->ioo_id); + f_dput(dentry); + GOTO(out_objinfo, rc = -ENOENT); + } + + fso[i].fso_dentry = dentry; + fso[i].fso_bufcnt = o->ioo_bufcnt; + + down(&dentry->d_inode->i_sem); + fdd = dentry->d_fsdata; + if (fdd == NULL || !atomic_read(&fdd->fdd_open_count)) + CDEBUG(D_PAGE, "I/O to unopened object "LPU64"\n", + o->ioo_id); + } + + if (time_after(jiffies, now + 15 * HZ)) + CERROR("slow prep setup %lus\n", (jiffies - now) / HZ); + + LASSERT(oti != NULL); + oti->oti_handle = fsfilt_brw_start(obd, objcount, fso, niocount, oti); + if (IS_ERR(oti->oti_handle)) { + rc = PTR_ERR(oti->oti_handle); + CDEBUG(rc == -ENOSPC ? 
D_INODE : D_ERROR, + "error starting transaction: rc = %d\n", rc); + oti->oti_handle = NULL; + GOTO(out_objinfo, rc); + } + + for (i = 0, o = obj, rnb = nb, lnb = res; i < objcount; i++, o++) { + dentry = fso[i].fso_dentry; + for (j = 0; j < o->ioo_bufcnt; j++, rnb++, lnb++) { + if (j == 0) + lnb->dentry = dentry; + else + lnb->dentry = dget(dentry); + + lnb->offset = rnb->offset; + lnb->len = rnb->len; + lnb->flags = rnb->flags; + lnb->start = jiffies; + + rc = filter_get_page_write(dentry->d_inode, lnb, + &pglocked); + if (rc) + up(&dentry->d_inode->i_sem); + + if (rc) { + CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR, + "page err %u@"LPU64" %u/%u %p: rc %d\n", + lnb->len, lnb->offset, j, o->ioo_bufcnt, + dentry, rc); + f_dput(dentry); + GOTO(out_pages, rc); + } + tot_bytes += lnb->len; + } + } + + if (time_after(jiffies, now + 15 * HZ)) + CERROR("slow prep get page %lus\n", (jiffies - now) / HZ); + + lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_WRITE_BYTES,tot_bytes); + + EXIT; +out: + OBD_FREE(fso, objcount * sizeof(*fso)); + /* we saved the journal handle into oti->oti_handle instead */ + current->journal_info = NULL; + pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + return rc; + +out_pages: + while (lnb-- > res) { + filter_commit_write(lnb, rc); + up(&lnb->dentry->d_inode->i_sem); + f_dput(lnb->dentry); + } + filter_finish_transno(exp, oti, rc); + fsfilt_commit(obd, filter_parent(obd,S_IFREG,obj->ioo_id)->d_inode, + oti->oti_handle, 0); + goto out; /* dropped the dentry refs already (one per page) */ + +out_objinfo: + for (i = 0; i < objcount && fso[i].fso_dentry; i++) { + up(&fso[i].fso_dentry->d_inode->i_sem); + f_dput(fso[i].fso_dentry); + } + goto out; +} + +int filter_preprw(int cmd, struct obd_export *exp, struct obdo *obdo, + int objcount, struct obd_ioobj *obj, int niocount, + struct niobuf_remote *nb, struct niobuf_local *res, + struct obd_trans_info *oti) +{ + if (cmd == OBD_BRW_WRITE) + return filter_preprw_write(exp, obdo, objcount, obj, 
niocount, + nb, res, oti); + else if (cmd == OBD_BRW_READ) + return filter_preprw_read(exp, obdo, objcount, obj, niocount, + nb, res, oti); + else + LBUG(); +} + +/* It is highly unlikely that we would ever get an error here. The page we want + * to get was previously locked, so it had to have already allocated the space, + * and we were just writing over the same data, so there would be no hole in the + * file. + * + * XXX: possibility of a race with truncate could exist, need to check that. + * There are no guarantees w.r.t. write order even on a local filesystem, + * although the normal response would be to return the number of bytes + * successfully written and leave the rest to the app. */ +static int filter_write_locked_page(struct niobuf_local *lnb) +{ + struct page *lpage; + void *lpage_addr, *lnb_addr; + int rc; + ENTRY; + + lpage = lustre_get_page_write(lnb->dentry->d_inode, lnb->page->index); + if (IS_ERR(lpage)) { + rc = PTR_ERR(lpage); + CERROR("error getting locked page index %ld: rc = %d\n", + lnb->page->index, rc); + LBUG(); + lustre_commit_write(lnb); + RETURN(rc); + } + + /* 2 kmaps == vanishingly small deadlock opportunity */ + lpage_addr = kmap(lpage); + lnb_addr = kmap(lnb->page); + + memcpy(lpage_addr, lnb_addr, PAGE_SIZE); + + kunmap(lnb->page); + kunmap(lpage); + + page_cache_release(lnb->page); + + lnb->page = lpage; + rc = lustre_commit_write(lnb); + if (rc) + CERROR("error committing locked page %ld: rc = %d\n", + lnb->page->index, rc); + RETURN(rc); +} + +int filter_commitrw(int cmd, struct obd_export *exp, int objcount, + struct obd_ioobj *obj, int niocount, + struct niobuf_local *res, struct obd_trans_info *oti) +{ + struct obd_run_ctxt saved; + struct obd_ioobj *o; + struct niobuf_local *lnb; + struct obd_device *obd = exp->exp_obd; + int found_locked = 0, rc = 0, i; + int nested_trans = current->journal_info != NULL; + unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */ + ENTRY; + + push_ctxt(&saved, &obd->u.filter.fo_ctxt, 
NULL); + + if (cmd & OBD_BRW_WRITE) { + LASSERT(oti); + LASSERT(current->journal_info == NULL || + current->journal_info == oti->oti_handle); + current->journal_info = oti->oti_handle; + } + + for (i = 0, o = obj, lnb = res; i < objcount; i++, o++) { + int j; + + if (cmd & OBD_BRW_WRITE) { + inode_update_time(lnb->dentry->d_inode, 1); + up(&lnb->dentry->d_inode->i_sem); + } + for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { + if (lnb->page == NULL) { + continue; + } + + if (lnb->flags & N_LOCAL_TEMP_PAGE) { + found_locked++; + continue; + } + + if (time_after(jiffies, lnb->start + 15 * HZ)) + CERROR("slow commitrw %lus\n", + (jiffies - lnb->start) / HZ); + + if (cmd & OBD_BRW_WRITE) { + int err = filter_commit_write(lnb, 0); + + if (!rc) + rc = err; + } else { + page_cache_release(lnb->page); + } + + f_dput(lnb->dentry); + if (time_after(jiffies, lnb->start + 15 * HZ)) + CERROR("slow commit_write %lus\n", + (jiffies - lnb->start) / HZ); + } + } + + for (i = 0, o = obj, lnb = res; found_locked > 0 && i < objcount; + i++, o++) { + int j; + for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { + int err; + if (!(lnb->flags & N_LOCAL_TEMP_PAGE)) + continue; + + if (time_after(jiffies, lnb->start + 15 * HZ)) + CERROR("slow commitrw locked %lus\n", + (jiffies - lnb->start) / HZ); + + err = filter_write_locked_page(lnb); + if (!rc) + rc = err; + f_dput(lnb->dentry); + found_locked--; + + if (time_after(jiffies, lnb->start + 15 * HZ)) + CERROR("slow commit_write locked %lus\n", + (jiffies - lnb->start) / HZ); + } + } + + if (cmd & OBD_BRW_WRITE) { + /* We just want any dentry for the commit, for now */ + struct dentry *dparent = filter_parent(obd, S_IFREG, 0); + int err; + + rc = filter_finish_transno(exp, oti, rc); + err = fsfilt_commit(obd, dparent->d_inode, oti->oti_handle, + obd_sync_filter); + if (err) + rc = err; + if (obd_sync_filter) + LASSERT(oti->oti_transno <= obd->obd_last_committed); + if (time_after(jiffies, now + 15 * HZ)) + CERROR("slow commitrw commit %lus\n", 
(jiffies-now)/HZ); + } + + LASSERT(nested_trans || current->journal_info == NULL); + pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + RETURN(rc); +} + +int filter_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti) +{ + struct obd_export *exp = class_conn2export(conn); + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote *rnb; + obd_count i; + int ret = 0; + ENTRY; + + if (exp == NULL) + RETURN(-EINVAL); + + OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local)); + OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote)); + + if (lnb == NULL || rnb == NULL) + GOTO(out, ret = -ENOMEM); + + for (i = 0; i < oa_bufs; i++) { + rnb[i].offset = pga[i].off; + rnb[i].len = pga[i].count; + } + + ioo.ioo_id = lsm->lsm_object_id; + ioo.ioo_gr = 0; + ioo.ioo_type = S_IFREG; + ioo.ioo_bufcnt = oa_bufs; + + ret = filter_preprw(cmd, exp, NULL, 1, &ioo, oa_bufs, rnb, lnb, oti); + if (ret != 0) + GOTO(out, ret); + + for (i = 0; i < oa_bufs; i++) { + void *virt = kmap(pga[i].pg); + obd_off off = pga[i].off & ~PAGE_MASK; + void *addr = kmap(lnb[i].page); + + /* 2 kmaps == vanishingly small deadlock opportunity */ + + if (cmd & OBD_BRW_WRITE) + memcpy(addr + off, virt + off, pga[i].count); + else + memcpy(virt + off, addr + off, pga[i].count); + + kunmap(addr); + kunmap(virt); + } + + ret = filter_commitrw(cmd, exp, 1, &ioo, oa_bufs, lnb, oti); + +out: + if (lnb) + OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local)); + if (rnb) + OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote)); + class_export_put(exp); + RETURN(ret); +} diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c new file mode 100644 index 0000000..790659d --- /dev/null +++ b/lustre/obdfilter/filter_log.c @@ -0,0 +1,379 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * linux/fs/obdfilter/filter_log.c + * + * Copyright (c) 
2001-2003 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger + * Author: Phil Schwan + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include + +#include +#include +#include +#include + +#include "filter_internal.h" + +static struct llog_handle *filter_log_create(struct obd_device *obd); + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. */ +static int filter_log_close(struct llog_handle *cathandle, + struct llog_handle *loghandle) +{ + struct llog_object_hdr *llh = loghandle->lgh_hdr; + struct file *file = loghandle->lgh_file; + struct dentry *dparent = NULL, *dchild = NULL; + struct lustre_handle parent_lockh; + struct llog_logid *lgl = &loghandle->lgh_cookie.lgc_lgl; + int rc; + ENTRY; + + /* If we are going to delete this log, grab a ref before we close + * it so we don't have to immediately do another lookup. 
*/ + if (llh->llh_hdr.lth_type != LLOG_CATALOG_MAGIC && llh->llh_count == 0){ + CDEBUG(D_INODE, "deleting log file "LPX64":%x\n", + lgl->lgl_oid, lgl->lgl_ogen); + dparent = filter_parent_lock(loghandle->lgh_obd, S_IFREG, + lgl->lgl_oid,LCK_PW,&parent_lockh); + if (IS_ERR(dparent)) { + rc = PTR_ERR(dparent); + CERROR("error locking parent, orphan log %*s: rc %d\n", + file->f_dentry->d_name.len, + file->f_dentry->d_name.name, rc); + RETURN(rc); + } else { + dchild = dget(file->f_dentry); + llog_delete_log(cathandle, loghandle); + } + } else { + CDEBUG(D_INODE, "closing log file "LPX64":%x\n", + lgl->lgl_oid, lgl->lgl_ogen); + } + + rc = filp_close(file, 0); + + llog_free_handle(loghandle); /* also removes loghandle from list */ + + if (dchild != NULL) { + int err = vfs_unlink(dparent->d_inode, dchild); + if (err) { + CERROR("error unlinking empty log %*s: rc %d\n", + dchild->d_name.len, dchild->d_name.name, err); + if (!rc) + rc = err; + } + f_dput(dchild); + ldlm_lock_decref(&parent_lockh, LCK_PW); + } + RETURN(rc); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
*/ +static struct llog_handle *filter_log_open(struct obd_device *obd, + struct llog_cookie *logcookie) +{ + struct llog_logid *lgl = &logcookie->lgc_lgl; + struct llog_handle *loghandle; + struct dentry *dchild; + int rc; + ENTRY; + + loghandle = llog_alloc_handle(); + if (!loghandle) + RETURN(ERR_PTR(-ENOMEM)); + + dchild = filter_fid2dentry(obd, NULL, S_IFREG, lgl->lgl_oid); + if (IS_ERR(dchild)) + GOTO(out_handle, rc = PTR_ERR(dchild)); + + if (dchild->d_inode == NULL) { + CERROR("logcookie references non-existent object %*s\n", + dchild->d_name.len, dchild->d_name.name); + GOTO(out_dentry, rc = -ENOENT); + } + + if (dchild->d_inode->i_generation != lgl->lgl_ogen) { + CERROR("logcookie for %*s had different generation %x != %x\n", + dchild->d_name.len, dchild->d_name.name, + dchild->d_inode->i_generation, lgl->lgl_ogen); + GOTO(out_dentry, rc = -ESTALE); + } + + /* dentry_open does a dput(dchild) and mntput(mnt) on error */ + mntget(obd->u.filter.fo_vfsmnt); + loghandle->lgh_file = dentry_open(dchild, obd->u.filter.fo_vfsmnt, + O_RDWR); + if (IS_ERR(loghandle->lgh_file)) { + rc = PTR_ERR(loghandle->lgh_file); + CERROR("error opening logfile %*s: rc %d\n", + dchild->d_name.len, dchild->d_name.name, rc); + GOTO(out_dentry, rc); + } + memcpy(&loghandle->lgh_cookie, logcookie, sizeof(*logcookie)); + loghandle->lgh_log_create = filter_log_create; + loghandle->lgh_log_open = filter_log_open; + loghandle->lgh_log_close = filter_log_close; + loghandle->lgh_obd = obd; + RETURN(loghandle); + +out_dentry: + f_dput(dchild); +out_handle: + llog_free_handle(loghandle); + RETURN(ERR_PTR(rc)); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
*/ +static struct llog_handle *filter_log_create(struct obd_device *obd) +{ + struct filter_obd *filter = &obd->u.filter; + struct lustre_handle parent_lockh; + struct dentry *dparent, *dchild; + struct llog_handle *loghandle; + struct file *file; + int err, rc; + obd_id id; + ENTRY; + + loghandle = llog_alloc_handle(); + if (!loghandle) + RETURN(ERR_PTR(-ENOMEM)); + + retry: + id = filter_next_id(filter); + + dparent = filter_parent_lock(obd, S_IFREG, id, LCK_PW, &parent_lockh); + if (IS_ERR(dparent)) + GOTO(out_ctxt, rc = PTR_ERR(dparent)); + + dchild = filter_fid2dentry(obd, dparent, S_IFREG, id); + if (IS_ERR(dchild)) + GOTO(out_lock, rc = PTR_ERR(dchild)); + + if (dchild->d_inode != NULL) { + /* This would only happen if lastobjid was bad on disk */ + CERROR("Serious error: objid %*s already exists; is this " + "filesystem corrupt? I will try to work around it.\n", + dchild->d_name.len, dchild->d_name.name); + f_dput(dchild); + ldlm_lock_decref(&parent_lockh, LCK_PW); + goto retry; + } + + rc = vfs_create(dparent->d_inode, dchild, S_IFREG); + if (rc) { + CERROR("log create failed rc = %d\n", rc); + GOTO(out_child, rc); + } + + rc = filter_update_server_data(filter->fo_rcvd_filp, filter->fo_fsd); + if (rc) { + CERROR("can't write lastobjid but log created: rc %d\n",rc); + GOTO(out_destroy, rc); + } + + /* dentry_open does a dput(dchild) and mntput(mnt) on error */ + mntget(filter->fo_vfsmnt); + file = dentry_open(dchild, filter->fo_vfsmnt, O_RDWR | O_LARGEFILE); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + CERROR("error opening log file "LPX64": rc %d\n", id, rc); + GOTO(out_destroy, rc); + } + ldlm_lock_decref(&parent_lockh, LCK_PW); + + loghandle->lgh_file = file; + loghandle->lgh_cookie.lgc_lgl.lgl_oid = id; + loghandle->lgh_cookie.lgc_lgl.lgl_ogen = dchild->d_inode->i_generation; + loghandle->lgh_log_create = filter_log_create; + loghandle->lgh_log_open = filter_log_open; + loghandle->lgh_log_close = filter_log_close; + loghandle->lgh_obd = obd; + + 
RETURN(loghandle); + +out_destroy: + err = vfs_unlink(dparent->d_inode, dchild); + if (err) + CERROR("error unlinking %*s on error: rc %d\n", + dchild->d_name.len, dchild->d_name.name, err); +out_child: + f_dput(dchild); +out_lock: + ldlm_lock_decref(&parent_lockh, LCK_PW); +out_ctxt: + llog_free_handle(loghandle); + RETURN(ERR_PTR(rc)); +} + +/* This is called from filter_setup() and should be single threaded */ +static struct llog_handle *filter_get_catalog(struct obd_device *obd) +{ + struct filter_obd *filter = &obd->u.filter; + struct filter_server_data *fsd = filter->fo_fsd; + struct obd_run_ctxt saved; + struct llog_handle *cathandle = NULL; + int rc; + ENTRY; + + push_ctxt(&saved, &filter->fo_ctxt, NULL); + if (fsd->fsd_catalog_oid) { + struct llog_cookie catcookie; + + catcookie.lgc_lgl.lgl_oid = le64_to_cpu(fsd->fsd_catalog_oid); + catcookie.lgc_lgl.lgl_ogen = le32_to_cpu(fsd->fsd_catalog_ogen); + cathandle = filter_log_open(obd, &catcookie); + if (IS_ERR(cathandle)) { + CERROR("error opening catalog "LPX64":%x: rc %d\n", + catcookie.lgc_lgl.lgl_oid, + catcookie.lgc_lgl.lgl_ogen, + (int)PTR_ERR(cathandle)); + fsd->fsd_catalog_oid = 0; + fsd->fsd_catalog_ogen = 0; + } + } + + if (!fsd->fsd_catalog_oid) { + struct llog_logid *lgl; + + cathandle = filter_log_create(obd); + if (IS_ERR(cathandle)) { + CERROR("error creating new catalog: rc %d\n", + (int)PTR_ERR(cathandle)); + GOTO(out, cathandle); + } + lgl = &cathandle->lgh_cookie.lgc_lgl; + fsd->fsd_catalog_oid = cpu_to_le64(lgl->lgl_oid); + fsd->fsd_catalog_ogen = cpu_to_le32(lgl->lgl_ogen); + rc = filter_update_server_data(filter->fo_rcvd_filp, fsd); + if (rc) { + CERROR("error writing new catalog to disk: rc %d\n",rc); + GOTO(out_handle, rc); + } + } + + rc = llog_init_catalog(cathandle, &obd->u.filter.fo_mdc_uuid); + if (rc) + GOTO(out_handle, rc); +out: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + RETURN(cathandle); + +out_handle: + filter_log_close(cathandle, cathandle); + cathandle = ERR_PTR(rc); + 
goto out; +} + +static void filter_put_catalog(struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->lgh_list, lgh_list) + filter_log_close(cathandle, loghandle); + + rc = filp_close(cathandle->lgh_file, 0); + if (rc) + CERROR("error closing catalog: rc %d\n", rc); + + llog_free_handle(cathandle); + EXIT; +} + +int filter_log_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, + int num_cookies, struct llog_cookie *logcookies, + int flags) +{ + struct obd_device *obd = class_conn2obd(conn); + struct obd_run_ctxt saved; + int rc; + ENTRY; + + push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + rc = llog_cancel_records(obd->u.filter.fo_catalog, num_cookies, + logcookies); + pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + + RETURN(rc); +} + +int filter_log_op_create(struct llog_handle *cathandle, struct ll_fid *mds_fid, + obd_id oid, obd_count ogen, + struct llog_cookie *logcookie) +{ + struct llog_create_rec *lcr; + int rc; + ENTRY; + + OBD_ALLOC(lcr, sizeof(*lcr)); + if (lcr == NULL) + RETURN(-ENOMEM); + lcr->lcr_hdr.lth_len = lcr->lcr_end_len = sizeof(*lcr); + lcr->lcr_hdr.lth_type = OST_CREATE_REC; + lcr->lcr_fid.id = mds_fid->id; + lcr->lcr_fid.generation = mds_fid->generation; + lcr->lcr_fid.f_type = mds_fid->f_type; + lcr->lcr_oid = oid; + lcr->lcr_ogen = ogen; + + rc = llog_add_record(cathandle, &lcr->lcr_hdr, logcookie); + OBD_FREE(lcr, sizeof(*lcr)); + + if (rc > 0) { + LASSERT(rc == sizeof(*logcookie)); + rc = 0; + } + RETURN(rc); +} + +int filter_log_op_orphan(struct llog_handle *cathandle, obd_id oid, + obd_count ogen, struct llog_cookie *logcookie) +{ + struct llog_orphan_rec *lor; + int rc; + ENTRY; + + OBD_ALLOC(lor, sizeof(*lor)); + if (lor == NULL) + RETURN(-ENOMEM); + lor->lor_hdr.lth_len = lor->lor_end_len = sizeof(*lor); + lor->lor_hdr.lth_type = OST_ORPHAN_REC; + lor->lor_oid = oid; + lor->lor_ogen = ogen; + + rc = llog_add_record(cathandle, 
&lor->lor_hdr, logcookie); + + if (rc > 0) { + LASSERT(rc == sizeof(*logcookie)); + rc = 0; + } + RETURN(rc); +} diff --git a/lustre/obdfilter/filter_san.c b/lustre/obdfilter/filter_san.c new file mode 100644 index 0000000..5345957 --- /dev/null +++ b/lustre/obdfilter/filter_san.c @@ -0,0 +1,130 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * linux/fs/obdfilter/filter_san.c + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include // XXX kill me soon +#include + +#include +#include +#include "filter_internal.h" + +/* sanobd setup methods - use a specific mount option */ +int filter_san_setup(struct obd_device *obd, obd_count len, void *buf) +{ + struct obd_ioctl_data* data = buf; + char *option = NULL; + + if (!data->ioc_inlbuf2) + RETURN(-EINVAL); + + /* for extN/ext3 filesystem, we must mount it with 'writeback' mode */ + if (!strcmp(data->ioc_inlbuf2, "extN")) + option = "data=writeback"; + else if (!strcmp(data->ioc_inlbuf2, "ext3")) + option = "data=writeback,asyncdel"; + else + LBUG(); /* just a reminder */ + + return filter_common_setup(obd, len, buf, option); +} + +int filter_san_preprw(int cmd, struct lustre_handle *conn, int objcount, + struct obd_ioobj *obj, int niocount, + struct niobuf_remote *nb) +{ + struct obd_device *obd; + struct obd_ioobj *o = obj; + struct niobuf_remote *rnb = nb; + int rc = 0; + int i; + ENTRY; + + obd = class_conn2obd(conn); + if (!obd) { + CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n", + conn->cookie); + RETURN(-EINVAL); + } + + for (i = 0; i < objcount; i++, o++) { + struct dentry *dentry; + struct inode *inode; +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + sector_t (*fs_bmap)(struct address_space *, sector_t); +#else + int (*fs_bmap)(struct address_space *, long); +#endif + int j; + + dentry = filter_fid2dentry(obd, NULL, o->ioo_type, o->ioo_id); + if (IS_ERR(dentry)) + GOTO(out, rc = PTR_ERR(dentry)); + inode = dentry->d_inode; + if (!inode) { + CERROR("trying to BRW to non-existent file "LPU64"\n", + o->ioo_id); + f_dput(dentry); + GOTO(out, rc = -ENOENT); + } + fs_bmap = inode->i_mapping->a_ops->bmap; + + for (j = 0; j < o->ioo_bufcnt; j++, rnb++) { + long block; + + block = rnb->offset >> inode->i_blkbits; + + if (cmd == OBD_BRW_READ) { + block = fs_bmap(inode->i_mapping, block); + } else { + loff_t newsize = rnb->offset + rnb->len; + /* fs_prep_san_write will also 
update inode + * size for us: + * (1) new alloced block + * (2) existed block but size extented + */ + /* FIXME We could call fs_prep_san_write() + * only once for all the blocks allocation. + * Now call it once for each block, for + * simplicity. And if error happens, we + * probably need to release previous alloced + * block */ + rc = fs_prep_san_write(obd, inode, &block, + 1, newsize); + if (rc) + break; + } + + rnb->offset = block; + } + f_dput(dentry); + } +out: + RETURN(rc); +} + -- 1.8.3.1