From 480e87e8e7f2a766c07ec24f97fa88763c01965e Mon Sep 17 00:00:00 2001 From: shaver Date: Tue, 15 Jul 2003 08:54:03 +0000 Subject: [PATCH] Land b_unify on b_devel. Highlights include: - intent rework from Peter and Phil - unification of VFS intent strategy for 2.4 and 2.5/2.6 - select 2.5 kernel patches from b_ad - all the whitespace fixes you can imagine. Complete diff for sanity-checking at http://off.net/~shaver/unify-landing.diff. --- lnet/tests/Makefile.mk | 9 + .../patches/ext3_delete_thread_2.4.20_chaos.patch | 477 ++++++ .../patches/invalidate_show_2.4.20_chaos.patch | 112 ++ .../patches/tcp_zero_copy_2.4.20_chaos.patch | 459 ++++++ .../patches/vfs_intent_2.4.20_chaos.patch | 1715 ++++++++++++++++++++ .../pc/ext3_delete_thread_2.4.20_chaos.pc | 5 + .../pc/invalidate_show_2.4.20_chaos.pc | 4 + .../pc/tcp_zero_copy_2.4.20_chaos.pc | 5 + .../kernel_patches/pc/vfs_intent_2.4.20_chaos.pc | 14 + lustre/llite/llite_lib.c | 938 +++++++++++ lustre/mdc/Makefile.mk | 9 + lustre/ost/Makefile.mk | 9 + lustre/portals/tests/Makefile.mk | 9 + 13 files changed, 3765 insertions(+) create mode 100644 lnet/tests/Makefile.mk create mode 100644 lustre/kernel_patches/patches/ext3_delete_thread_2.4.20_chaos.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show_2.4.20_chaos.patch create mode 100644 lustre/kernel_patches/patches/tcp_zero_copy_2.4.20_chaos.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent_2.4.20_chaos.patch create mode 100644 lustre/kernel_patches/pc/ext3_delete_thread_2.4.20_chaos.pc create mode 100644 lustre/kernel_patches/pc/invalidate_show_2.4.20_chaos.pc create mode 100644 lustre/kernel_patches/pc/tcp_zero_copy_2.4.20_chaos.pc create mode 100644 lustre/kernel_patches/pc/vfs_intent_2.4.20_chaos.pc create mode 100644 lustre/llite/llite_lib.c create mode 100644 lustre/mdc/Makefile.mk create mode 100644 lustre/ost/Makefile.mk create mode 100644 lustre/portals/tests/Makefile.mk diff --git a/lnet/tests/Makefile.mk 
b/lnet/tests/Makefile.mk new file mode 100644 index 0000000..751c0a0 --- /dev/null +++ b/lnet/tests/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../Kernelenv + +obj-y += ping_cli.o +obj-y += ping_srv.o diff --git a/lustre/kernel_patches/patches/ext3_delete_thread_2.4.20_chaos.patch b/lustre/kernel_patches/patches/ext3_delete_thread_2.4.20_chaos.patch new file mode 100644 index 0000000..ad873a9 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3_delete_thread_2.4.20_chaos.patch @@ -0,0 +1,477 @@ + fs/ext3/file.c | 4 + fs/ext3/inode.c | 116 ++++++++++++++++++++++ + fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 5 + include/linux/ext3_fs_sb.h | 10 + + 5 files changed, 365 insertions(+) + +--- kernel-2.4.20-6chaos_18_7/fs/ext3/super.c~ext3_delete_thread_2.4.20_chaos 2003-07-12 15:35:26.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/ext3/super.c 2003-07-12 15:36:19.000000000 -0600 +@@ -400,6 +400,220 @@ static void dump_orphan_list(struct supe + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. 
++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ wait_event_interruptible(sbi->s_delete_thread_queue, ++ !list_empty(&sbi->s_delete_list) || ++ !test_opt(sb, ASYNCDEL)); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ } ++ 
spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ ++ if (!test_opt(sb, ASYNCDEL)) ++ return; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ if (sbi->s_delete_list.next == 0) /* thread never started */ ++ return; ++ ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * create a new inode locally and put it on a list for the truncate thread. ++ * We need large parts of the inode struct in order to complete the ++ * truncate and unlink, so we may as well just have a real inode to do it. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. 
++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_delete; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) ++ goto out_delete; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_delete; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ sbi->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (is_bad_inode(new_inode)) { ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug("delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ goto out_delete; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&oei->i_orphan)); ++ ++ nei = EXT3_I(new_inode); ++ /* Ugh. 
We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * in the same order as the on-disk orphan list (badness otherwise). ++ */ ++ nei->i_orphan = oei->i_orphan; ++ nei->i_orphan.next->prev = &nei->i_orphan; ++ nei->i_orphan.prev->next = &nei->i_orphan; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_delete: ++ ext3_delete_inode(old_inode); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -407,6 +621,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -455,7 +670,11 @@ static struct super_operations ext3_sops + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. 
We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + sync_fs: ext3_sync_fs, +@@ -524,6 +743,13 @@ static int parse_options (char * options + clear_opt (*mount_options, XATTR_USER); + else + #endif ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else if (!strcmp(this_char, "noasyncdel")) ++ clear_opt(*mount_options, ASYNCDEL); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1223,6 +1449,7 @@ struct super_block * ext3_read_super (st + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +@@ -1678,6 +1905,9 @@ int ext3_remount (struct super_block * s + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + ++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) ++ ext3_stop_delete_thread(sbi); ++ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + +--- kernel-2.4.20-6chaos_18_7/fs/ext3/inode.c~ext3_delete_thread_2.4.20_chaos 2003-07-12 15:34:44.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/ext3/inode.c 2003-07-12 15:36:19.000000000 -0600 +@@ -2017,6 +2017,122 @@ out_stop: + ext3_journal_stop(handle, inode); + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. 
++ * ++ * During normal filesystem usage, we are always called here with a ++ * transaction already started. The only time ext3_truncate is called ++ * without a started transaction is from ext3_orphan_cleanup(), and we ++ * currently just do a direct truncate in that case. ++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. 
++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 0; ++ ++ /* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. +--- kernel-2.4.20-6chaos_18_7/fs/ext3/file.c~ext3_delete_thread_2.4.20_chaos 2003-07-12 15:34:44.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/ext3/file.c 2003-07-12 15:36:19.000000000 -0600 +@@ -125,7 +125,11 @@ struct file_operations ext3_file_operati + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + setxattr: ext3_setxattr, /* BKL held */ + getxattr: ext3_getxattr, /* BKL held */ +--- kernel-2.4.20-6chaos_18_7/include/linux/ext3_fs.h~ext3_delete_thread_2.4.20_chaos 2003-07-12 15:34:44.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/ext3_fs.h 2003-07-12 15:37:13.000000000 -0600 +@@ -193,6 +193,7 @@ struct ext3_group_desc + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete 
inode */ + + /* + * ioctl commands +@@ -320,6 +321,7 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -695,6 +697,9 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++#ifdef EXT3_DELETE_THREAD ++extern void ext3_truncate_thread(struct inode *inode); ++#endif + extern void ext3_set_inode_flags(struct inode *); + + /* ioctl.c */ +--- kernel-2.4.20-6chaos_18_7/include/linux/ext3_fs_sb.h~ext3_delete_thread_2.4.20_chaos 2003-07-12 15:35:26.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/ext3_fs_sb.h 2003-07-12 15:36:19.000000000 -0600 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 32 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -76,6 +78,14 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/invalidate_show_2.4.20_chaos.patch b/lustre/kernel_patches/patches/invalidate_show_2.4.20_chaos.patch new file mode 100644 index 0000000..7c98c45 --- /dev/null +++ 
b/lustre/kernel_patches/patches/invalidate_show_2.4.20_chaos.patch @@ -0,0 +1,112 @@ + fs/inode.c | 21 ++++++++++++++------- + fs/smbfs/inode.c | 2 +- + fs/super.c | 4 ++-- + include/linux/fs.h | 2 +- + 4 files changed, 18 insertions(+), 11 deletions(-) + +--- kernel-2.4.20-6chaos_18_7/fs/inode.c~invalidate_show_2.4.20_chaos 2003-05-15 21:14:25.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/inode.c 2003-07-12 15:33:08.000000000 -0600 +@@ -604,7 +604,8 @@ static void dispose_list(struct list_hea + /* + * Invalidate all inodes for a device. + */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) ++static int invalidate_list(struct list_head *head, struct super_block * sb, ++ struct list_head * dispose, int show) + { + struct list_head *next; + int busy = 0, count = 0; +@@ -629,6 +630,11 @@ static int invalidate_list(struct list_h + count++; + continue; + } ++ if (show) ++ printk(KERN_ERR ++ "inode busy: dev %s:%lu (%p) mode %o count %u\n", ++ kdevname(sb->s_dev), inode->i_ino, inode, ++ inode->i_mode, atomic_read(&inode->i_count)); + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ +@@ -647,22 +653,23 @@ static int invalidate_list(struct list_h + /** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock ++ * @show: whether we should display any busy inodes found + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. 
+ */ + +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int show) + { + int busy; + LIST_HEAD(throw_away); + + spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); ++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -688,7 +695,7 @@ int invalidate_device(kdev_t dev, int do + * hold). + */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, 0); + drop_super(sb); + } + invalidate_buffers(dev); +--- kernel-2.4.20-6chaos_18_7/fs/super.c~invalidate_show_2.4.20_chaos 2003-05-15 21:14:25.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/super.c 2003-07-12 15:31:35.000000000 -0600 +@@ -943,7 +943,7 @@ void kill_super(struct super_block *sb) + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; +- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ ++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ + if (sop) { + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); +@@ -952,7 +952,7 @@ void kill_super(struct super_block *sb) + } + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk(KERN_ERR "VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. 
Have a nice day...\n"); + } +--- kernel-2.4.20-6chaos_18_7/include/linux/fs.h~invalidate_show_2.4.20_chaos 2003-07-12 15:14:02.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/fs.h 2003-07-12 15:31:35.000000000 -0600 +@@ -1284,7 +1284,7 @@ static inline void mark_buffer_dirty_ino + extern void set_buffer_flushtime(struct buffer_head *); + extern void balance_dirty(void); + extern int check_disk_change(kdev_t); +-extern int invalidate_inodes(struct super_block *); ++extern int invalidate_inodes(struct super_block *, int); + extern int invalidate_device(kdev_t, int); + extern void invalidate_inode_pages(struct inode *); + extern void invalidate_inode_pages2(struct address_space *); +--- kernel-2.4.20-6chaos_18_7/fs/smbfs/inode.c~invalidate_show_2.4.20_chaos 2003-02-14 15:59:13.000000000 -0700 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/smbfs/inode.c 2003-07-12 15:31:35.000000000 -0600 +@@ -167,7 +167,7 @@ smb_invalidate_inodes(struct smb_sb_info + { + VERBOSE("\n"); + shrink_dcache_sb(SB_of(server)); +- invalidate_inodes(SB_of(server)); ++ invalidate_inodes(SB_of(server), 0); + } + + /* + +_ diff --git a/lustre/kernel_patches/patches/tcp_zero_copy_2.4.20_chaos.patch b/lustre/kernel_patches/patches/tcp_zero_copy_2.4.20_chaos.patch new file mode 100644 index 0000000..dfb4de5 --- /dev/null +++ b/lustre/kernel_patches/patches/tcp_zero_copy_2.4.20_chaos.patch @@ -0,0 +1,459 @@ + include/linux/skbuff.h | 30 +++++ + include/net/tcp.h | 5 + net/core/skbuff.c | 25 ++++ + net/ipv4/tcp.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++- + net/netsyms.c | 2 + 5 files changed, 311 insertions(+), 3 deletions(-) + +--- kernel-2.4.20-6chaos_18_7/include/linux/skbuff.h~tcp_zero_copy_2.4.20_chaos 2003-06-24 11:31:17.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/skbuff.h 2003-07-12 15:38:07.000000000 -0600 +@@ -116,6 +116,30 @@ struct skb_frag_struct + __u16 size; + }; + ++/* Support for callback when skb data has been released */ ++typedef 
struct zccd /* Zero Copy Callback Descriptor */ ++{ /* (embed as first member of custom struct) */ ++ atomic_t zccd_count; /* reference count */ ++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ ++} zccd_t; ++ ++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) ++{ ++ atomic_set (&d->zccd_count, 1); ++ d->zccd_destructor = callback; ++} ++ ++static inline void zccd_get (zccd_t *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_count); ++} ++ ++static inline void zccd_put (zccd_t *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_count)) ++ (d->zccd_destructor)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -123,6 +147,12 @@ struct skb_shared_info { + atomic_t dataref; + unsigned int nr_frags; + struct sk_buff *frag_list; ++ zccd_t *zccd; /* zero copy descriptor */ ++ zccd_t *zccd2; /* 2nd zero copy descriptor */ ++ /* NB we expect zero-copy data to be at least 1 packet, so ++ * having 2 zccds means we don't unnecessarily split the packet ++ * where consecutive zero-copy sends abut. 
++ */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +--- kernel-2.4.20-6chaos_18_7/include/net/tcp.h~tcp_zero_copy_2.4.20_chaos 2003-06-24 11:31:17.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/net/tcp.h 2003-07-12 15:38:07.000000000 -0600 +@@ -643,6 +643,8 @@ extern int tcp_v4_tw_remember_stam + + extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -737,6 +739,9 @@ extern int tcp_recvmsg(struct sock *sk + struct msghdr *msg, + int len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); + + extern int tcp_listen_start(struct sock *sk); + +--- kernel-2.4.20-6chaos_18_7/net/netsyms.c~tcp_zero_copy_2.4.20_chaos 2003-05-15 21:15:18.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/net/netsyms.c 2003-07-12 15:38:54.000000000 -0600 +@@ -397,6 +397,8 @@ EXPORT_SYMBOL(sysctl_tcp_wmem); + EXPORT_SYMBOL(sysctl_tcp_ecn); + EXPORT_SYMBOL(tcp_cwnd_application_limited); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); + EXPORT_SYMBOL(sysctl_tcp_low_latency); + + EXPORT_SYMBOL(tcp_write_xmit); +--- kernel-2.4.20-6chaos_18_7/net/core/skbuff.c~tcp_zero_copy_2.4.20_chaos 2003-05-15 21:15:21.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/net/core/skbuff.c 2003-07-12 15:38:07.000000000 -0600 +@@ -208,6 +208,8 @@ struct sk_buff *alloc_skb(unsigned int s + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ skb_shinfo(skb)->zccd2 = NULL; + return skb; + + 
nodata: +@@ -276,6 +278,10 @@ static void skb_release_data(struct sk_b + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -532,6 +538,8 @@ int skb_linearize(struct sk_buff *skb, i + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */ ++ skb_shinfo(skb)->zccd2 = NULL; + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; +@@ -578,6 +586,14 @@ struct sk_buff *pskb_copy(struct sk_buff + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -620,6 +636,8 @@ int pskb_expand_head(struct sk_buff *skb + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ + + if (skb_shared(skb)) + BUG(); +@@ -641,6 +659,11 @@ int pskb_expand_head(struct sk_buff *skb + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? 
*/ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data+nhead) - skb->head; +@@ -655,6 +678,8 @@ int pskb_expand_head(struct sk_buff *skb + skb->nh.raw += off; + skb->cloned = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +--- kernel-2.4.20-6chaos_18_7/net/ipv4/tcp.c~tcp_zero_copy_2.4.20_chaos 2003-05-15 21:15:21.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/net/ipv4/tcp.c 2003-07-12 15:38:07.000000000 -0600 +@@ -747,7 +747,7 @@ do_interrupted: + goto out; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd); + + static inline int + can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +@@ -826,7 +826,8 @@ static int tcp_error(struct sock *sk, in + return err; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; +@@ -874,6 +875,17 @@ new_segment: + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < 
MAX_SKB_FRAGS) { +@@ -884,6 +896,20 @@ new_segment: + goto new_segment; + } + ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; +@@ -947,7 +973,31 @@ ssize_t tcp_sendpage(struct socket *sock + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++#undef TCP_ZC_CSUM_FLAGS ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); ++ + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; +@@ -1771,6 +1821,202 @@ recv_urg: + goto out; + } + ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++ int len, int nonblock) ++{ ++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = sock_rcvtimeo(sk, nonblock); ++ ++ do { ++ struct sk_buff 
* skb; ++ u32 offset; ++ unsigned long used; ++ int exhausted; ++ int eaten; ++ ++ /* Are we at urgent data? Stop if we have read anything. */ ++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) ++ break; ++ ++ /* We need to check signals first, to get correct SIGURG ++ * handling. FIXME: Need to check this doesnt impact 1003.1g ++ * and move it down to the bottom of the loop ++ */ ++ if (signal_pending(current)) { ++ if (copied) ++ break; ++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; ++ break; ++ } ++ ++ /* Next get a buffer. */ ++ ++ skb = skb_peek(&sk->receive_queue); ++ ++ if (skb == NULL) /* nothing ready */ ++ { ++ if (copied) { ++ if (sk->err || ++ sk->state == TCP_CLOSE || ++ (sk->shutdown & RCV_SHUTDOWN) || ++ !timeo || ++ (0)) ++ break; ++ } else { ++ if (sk->done) ++ break; ++ ++ if (sk->err) { ++ copied = sock_error(sk); ++ break; ++ } ++ ++ if (sk->shutdown & RCV_SHUTDOWN) ++ break; ++ ++ if (sk->state == TCP_CLOSE) { ++ if (!sk->done) { ++ /* This occurs when user tries to read ++ * from never connected socket. 
++ */ ++ copied = -ENOTCONN; ++ break; ++ } ++ break; ++ } ++ ++ if (!timeo) { ++ copied = -EAGAIN; ++ break; ++ } ++ } ++ ++ cleanup_rbuf(sk, copied); ++ timeo = tcp_data_wait(sk, timeo); ++ continue; ++ } ++ ++ BUG_TRAP (atomic_read (&skb->users) == 1); ++ ++ exhausted = eaten = 0; ++ ++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; ++ if (skb->h.th->syn) ++ offset--; ++ ++ used = skb->len - offset; ++ ++ if (tp->urg_data) { ++ u32 urg_offset = tp->urg_seq - tp->copied_seq; ++ if (urg_offset < used) { ++ if (!urg_offset) { /* at urgent data */ ++ if (!sk->urginline) { ++ tp->copied_seq++; /* discard the single byte of urgent data */ ++ offset++; ++ used--; ++ } ++ } else /* truncate read */ ++ used = urg_offset; ++ } ++ } ++ ++ BUG_TRAP (used >= 0); ++ if (len < used) ++ used = len; ++ ++ if (used == 0) ++ exhausted = 1; ++ else ++ { ++ if (skb_is_nonlinear (skb)) ++ { ++ int rc = skb_linearize (skb, GFP_KERNEL); ++ ++ printk ("tcp_recvpackets(): linearising: %d\n", rc); ++ ++ if (rc) ++ { ++ if (!copied) ++ copied = rc; ++ break; ++ } ++ } ++ ++ if ((offset + used) == skb->len) /* consuming the whole packet */ ++ { ++ __skb_unlink (skb, &sk->receive_queue); ++ dst_release (skb->dst); ++ skb_orphan (skb); ++ __skb_pull (skb, offset); ++ __skb_queue_tail (packets, skb); ++ exhausted = eaten = 1; ++ } ++ else /* consuming only part of the packet */ ++ { ++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); ++ ++ if (skb2 == NULL) ++ { ++ if (!copied) ++ copied = -ENOMEM; ++ break; ++ } ++ ++ dst_release (skb2->dst); ++ __skb_pull (skb2, offset); ++ __skb_trim (skb2, used); ++ __skb_queue_tail (packets, skb2); ++ } ++ ++ tp->copied_seq += used; ++ copied += used; ++ len -= used; ++ } ++ ++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { ++ tp->urg_data = 0; ++ tcp_fast_path_check(sk, tp); ++ } ++ ++ if (!exhausted) ++ continue; ++ ++ if (skb->h.th->fin) ++ { ++ tp->copied_seq++; ++ if (!eaten) ++ tcp_eat_skb (sk, skb); ++ break; ++ } ++ ++ if (!eaten) ++
tcp_eat_skb (sk, skb); ++ ++ } while (len > 0); ++ ++ out: ++ /* Clean up data we have read: This will do ACK frames. */ ++ cleanup_rbuf(sk, copied); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return copied; ++} ++ + /* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + +_ diff --git a/lustre/kernel_patches/patches/vfs_intent_2.4.20_chaos.patch b/lustre/kernel_patches/patches/vfs_intent_2.4.20_chaos.patch new file mode 100644 index 0000000..7a3b2ab --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent_2.4.20_chaos.patch @@ -0,0 +1,1715 @@ + fs/dcache.c | 19 ++ + fs/exec.c | 15 +- + fs/namei.c | 324 ++++++++++++++++++++++++++++++++++++++-------- + fs/namespace.c | 28 ++- + fs/open.c | 126 +++++++++++++++-- + fs/proc/base.c | 3 + fs/stat.c | 26 ++- + include/linux/dcache.h | 53 +++++++ + include/linux/fs.h | 30 +++- + include/linux/fs_struct.h | 4 + kernel/exit.c | 3 + kernel/fork.c | 3 + kernel/ksyms.c | 1 + 13 files changed, 537 insertions(+), 98 deletions(-) + +--- kernel-2.4.20-6chaos_18_7/fs/exec.c~vfs_intent_2.4.20_chaos 2003-06-19 11:06:09.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/exec.c 2003-07-12 15:14:02.000000000 -0600 +@@ -113,8 +113,9 @@ asmlinkage long sys_uselib(const char * + struct file * file; + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; + +- error = user_path_walk(library, &nd); ++ error = user_path_walk_it(library, &nd, &it); + if (error) + goto out; + +@@ -126,7 +127,8 @@ asmlinkage long sys_uselib(const char * + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -381,8 +383,9 @@ struct file *open_exec(const char *name) + struct inode *inode; + struct file *file; + int err = 0; ++ struct lookup_intent it = 
{ .it_op = IT_OPEN, .it_flags = O_RDONLY }; + +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; +@@ -394,7 +397,8 @@ struct file *open_exec(const char *name) + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(&it); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +@@ -406,6 +410,7 @@ out: + return file; + } + } ++ intent_release(&it); + path_release(&nd); + } + goto out; +@@ -1134,7 +1139,7 @@ int do_coredump(long signr, struct pt_re + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +--- kernel-2.4.20-6chaos_18_7/fs/dcache.c~vfs_intent_2.4.20_chaos 2003-05-15 21:14:24.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/dcache.c 2003-07-12 15:14:02.000000000 -0600 +@@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -839,13 +846,19 @@ void d_delete(struct dentry * dentry) + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ +--- kernel-2.4.20-6chaos_18_7/fs/namespace.c~vfs_intent_2.4.20_chaos 2003-05-15 21:14:25.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/namespace.c 2003-07-12 15:14:02.000000000 -0600 +@@ -99,6 +99,7 @@ static void detach_mnt(struct vfsmount * + { + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; ++ UNPIN(old_nd->dentry, old_nd->mnt, 1); + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt_root; + list_del_init(&mnt->mnt_child); +@@ -110,6 +111,7 @@ static void attach_mnt(struct vfsmount * + { + mnt->mnt_parent = mntget(nd->mnt); + mnt->mnt_mountpoint = dget(nd->dentry); ++ PIN(nd->dentry, nd->mnt, 1); + list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); + list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); + nd->dentry->d_mounted++; +@@ -485,14 +487,17 @@ static int do_loopback(struct nameidata + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int err = mount_is_safe(nd); + if (err) + return err; + if (!old_name || !*old_name) + return -EINVAL; +- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd); +- if (err) ++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it); ++ if (err) { ++ intent_release(&it); + return err; ++ } + + down_write(&current->namespace->sem); + err = -EINVAL; +@@ -515,6 +520,7 @@ static int do_loopback(struct nameidata + } + + up_write(&current->namespace->sem); ++
intent_release(&it); + path_release(&old_nd); + return err; + } +@@ -698,6 +704,7 @@ long do_mount(char * dev_name, char * di + unsigned long flags, void *data_page) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int retval = 0; + int mnt_flags = 0; + +@@ -722,10 +729,11 @@ long do_mount(char * dev_name, char * di + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV); + + /* ... and get the mountpoint */ +- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); +- if (retval) ++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); ++ if (retval) { ++ intent_release(&it); + return retval; +- ++ } + if (flags & MS_REMOUNT) + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + data_page); +@@ -736,6 +744,8 @@ long do_mount(char * dev_name, char * di + else + retval = do_add_mount(&nd, type_page, flags, mnt_flags, + dev_name, data_page); ++ ++ intent_release(&it); + path_release(&nd); + return retval; + } +@@ -901,6 +911,8 @@ asmlinkage long sys_pivot_root(const cha + { + struct vfsmount *tmp; + struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; ++ struct lookup_intent new_it = { .it_op = IT_GETATTR }; ++ struct lookup_intent old_it = { .it_op = IT_GETATTR }; + int error; + + if (!capable(CAP_SYS_ADMIN)) +@@ -908,14 +920,14 @@ asmlinkage long sys_pivot_root(const cha + + lock_kernel(); + +- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd); ++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it); + if (error) + goto out0; + error = -EINVAL; + if (!check_mnt(new_nd.mnt)) + goto out1; + +- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd); ++ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it); + if (error) + goto out1; + +@@ -970,8 +982,10 @@ out2: + up(&old_nd.dentry->d_inode->i_zombie); + up_write(&current->namespace->sem); + 
path_release(&user_nd); ++ intent_release(&old_it); + path_release(&old_nd); + out1: ++ intent_release(&new_it); + path_release(&new_nd); + out0: + unlock_kernel(); +--- kernel-2.4.20-6chaos_18_7/fs/namei.c~vfs_intent_2.4.20_chaos 2003-05-15 21:14:25.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/namei.c 2003-07-12 15:19:01.000000000 -0600 +@@ -94,6 +94,13 @@ + * XEmacs seems to be relying on it... + */ + ++void intent_release(struct lookup_intent *it) ++{ ++ if (it && it->it_op_release) ++ it->it_op_release(it); ++ ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd) + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * dentry = d_lookup(parent, name); + ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); +@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str + * make sure that nobody added the entry to the dcache in the meantime.. 
+ * SMP-safe + */ +-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * result; + struct inode *dir = parent->d_inode; + ++again: ++ + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc + result = ERR_PTR(-ENOMEM); + if (dentry) { + lock_kernel(); ++ if (dir->i_op->lookup_it) ++ result = dir->i_op->lookup_it(dir, dentry, it, flags); ++ else + result = dir->i_op->lookup(dir, dentry); + unlock_kernel(); + if (result) +@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc + dput(result); + result = ERR_PTR(-ENOENT); + } ++ } else if (result->d_op && result->d_op->d_revalidate_it) { ++ if (!result->d_op->d_revalidate_it(result, flags, it) && ++ !d_invalidate(result)) { ++ dput(result); ++ goto again; ++ } + } + return result; + } +@@ -334,7 +362,8 @@ int max_recursive_link = 5; + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. 
+ */ +-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++ struct lookup_intent *it) + { + int err; + if (current->link_count >= max_recursive_link) +@@ -348,10 +377,18 @@ static inline int do_follow_link(struct + current->link_count++; + current->total_link_count++; + UPDATE_ATIME(dentry->d_inode); ++ nd->it = it; + err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(it); ++ path_release(nd); ++ err = -ENOLINK; ++ } + current->link_count--; + return err; + loop: ++ intent_release(it); + path_release(nd); + return -ELOOP; + } +@@ -381,15 +418,26 @@ int follow_up(struct vfsmount **mnt, str + return __follow_up(mnt, dentry); + } + +-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry) ++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry, ++ struct lookup_intent *it) + { + struct vfsmount *mounted; + + spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); + if (mounted) { ++ int opc = 0, mode = 0; + *mnt = mntget(mounted); + spin_unlock(&dcache_lock); ++ if (it) { ++ opc = it->it_op; ++ mode = it->it_mode; ++ } ++ intent_release(it); ++ if (it) { ++ it->it_op = opc; ++ it->it_mode = mode; ++ } + dput(*dentry); + mntput(mounted->mnt_parent); + *dentry = dget(mounted->mnt_root); +@@ -401,7 +449,7 @@ static inline int __follow_down(struct v + + int follow_down(struct vfsmount **mnt, struct dentry **dentry) + { +- return __follow_down(mnt,dentry); ++ return __follow_down(mnt,dentry,NULL); + } + + static inline void follow_dotdot(struct nameidata *nd) +@@ -437,7 +485,7 @@ static inline void follow_dotdot(struct + mntput(nd->mnt); + nd->mnt = parent; + } +- while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry)) ++ while (d_mountpoint(nd->dentry) && 
__follow_down(&nd->mnt, &nd->dentry, NULL)) + ; + } + +@@ -449,7 +497,8 @@ static inline void follow_dotdot(struct + * + * We expect 'base' to be positive and a directory. + */ +-int link_path_walk(const char * name, struct nameidata *nd) ++int link_path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it) + { + struct dentry *dentry; + struct inode *inode; +@@ -526,19 +575,18 @@ int link_path_walk(const char * name, st + break; + } + /* This does the actual lookups.. */ +- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + if (!dentry) { + err = -EWOULDBLOCKIO; + if (atomic) + break; +- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; + } + /* Check mountpoints.. */ +- while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) +- ; ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL)); + + err = -ENOENT; + inode = dentry->d_inode; +@@ -549,7 +597,7 @@ int link_path_walk(const char * name, st + goto out_dput; + + if (inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ err = do_follow_link(dentry, nd, NULL); + dput(dentry); + if (err) + goto return_err; +@@ -565,7 +613,7 @@ int link_path_walk(const char * name, st + nd->dentry = dentry; + } + err = -ENOTDIR; +- if (!inode->i_op->lookup) ++ if (!inode->i_op->lookup && !inode->i_op->lookup_it) + break; + continue; + /* here ends the main loop */ +@@ -592,22 +640,22 @@ last_component: + if (err < 0) + break; + } +- dentry = cached_lookup(nd->dentry, &this, 0); ++ dentry = cached_lookup(nd->dentry, &this, 0, it); + if (!dentry) { + err = -EWOULDBLOCKIO; + if (atomic) + break; +- dentry = real_lookup(nd->dentry, &this, 0); ++ dentry = real_lookup(nd->dentry, &this, 0, it); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; + } +- while 
(d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, it)) + ; + inode = dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) + && inode && inode->i_op && inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ err = do_follow_link(dentry, nd, it); + dput(dentry); + if (err) + goto return_err; +@@ -621,7 +669,8 @@ last_component: + goto no_inode; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; +- if (!inode->i_op || !inode->i_op->lookup) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup_it)) + break; + } + goto return_base; +@@ -645,6 +694,23 @@ return_reval: + * Check the cached dentry for staleness. + */ + dentry = nd->dentry; ++ revalidate_again: ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, NULL); ++ d_invalidate(dentry); ++ dput(dentry); ++ dentry = new; ++ goto revalidate_again; ++ } ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, 0)) { +@@ -658,15 +724,28 @@ out_dput: + dput(dentry); + break; + } ++ if (err) ++ intent_release(it); + path_release(nd); + return_err: + return err; + } + ++int link_path_walk(const char * name, struct nameidata *nd) ++{ ++ return link_path_walk_it(name, nd, NULL); ++} ++ ++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) ++{ ++ current->total_link_count = 0; ++ return link_path_walk_it(name, nd, it); ++} ++ + int path_walk(const char * name, struct nameidata *nd) + { + current->total_link_count = 0; +- return link_path_walk(name, nd); ++ return link_path_walk_it(name, nd, NULL); + } + + /* SMP-safe */ +@@ -751,6 +830,17 @@ 
walk_init_root(const char *name, struct + } + + /* SMP-safe */ ++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ int error = 0; ++ if (path_init(path, flags, nd)) ++ error = path_walk_it(path, nd, it); ++ return error; ++} ++ ++ ++/* SMP-safe */ + int path_lookup(const char *path, unsigned flags, struct nameidata *nd) + { + int error = 0; +@@ -765,6 +855,7 @@ int path_init(const char *name, unsigned + { + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags; ++ nd->it = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(&current->fs->lock); +@@ -779,7 +870,8 @@ int path_init(const char *name, unsigned + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, ++ struct lookup_intent *it) + { + struct dentry * dentry; + struct inode *inode; +@@ -802,13 +894,16 @@ struct dentry * lookup_hash(struct qstr + goto out; + } + +- dentry = cached_lookup(base, name, 0); ++ dentry = cached_lookup(base, name, 0, it); + if (!dentry) { + struct dentry *new = d_alloc(base, name); + dentry = ERR_PTR(-ENOMEM); + if (!new) + goto out; + lock_kernel(); ++ if (inode->i_op->lookup_it) ++ dentry = inode->i_op->lookup_it(inode, new, it, 0); ++ else + dentry = inode->i_op->lookup(inode, new); + unlock_kernel(); + if (!dentry) +@@ -820,6 +915,12 @@ out: + return dentry; + } + ++struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++{ ++ return lookup_hash_it(name, base, NULL); ++} ++ ++ + /* SMP-safe */ + struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) + { +@@ -841,7 +942,7 @@ struct dentry * lookup_one_len(const cha + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return lookup_hash_it(&this, base, NULL); + access: + return ERR_PTR(-EACCES); + } +@@ 
-872,6 +973,23 @@ int __user_walk(const char *name, unsign + return err; + } + ++int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ char *tmp; ++ int err; ++ ++ tmp = getname(name); ++ err = PTR_ERR(tmp); ++ if (!IS_ERR(tmp)) { ++ err = 0; ++ if (path_init(tmp, flags, nd)) ++ err = path_walk_it(tmp, nd, it); ++ putname(tmp); ++ } ++ return err; ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -969,7 +1087,8 @@ static inline int lookup_flags(unsigned + return retval; + } + +-int vfs_create(struct inode *dir, struct dentry *dentry, int mode) ++static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, ++ struct lookup_intent *it) + { + int error; + +@@ -982,12 +1101,15 @@ int vfs_create(struct inode *dir, struct + goto exit_lock; + + error = -EACCES; /* shouldn't it be ENOSYS? */ +- if (!dir->i_op || !dir->i_op->create) ++ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); +- error = dir->i_op->create(dir, dentry, mode); ++ if (dir->i_op->create_it) ++ error = dir->i_op->create_it(dir, dentry, mode, it); ++ else ++ error = dir->i_op->create(dir, dentry, mode); + unlock_kernel(); + exit_lock: + up(&dir->i_zombie); +@@ -996,6 +1118,11 @@ exit_lock: + return error; + } + ++int vfs_create(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ return vfs_create_it(dir, dentry, mode, NULL); ++} ++ + /* + * open_namei() + * +@@ -1010,7 +1137,8 @@ exit_lock: + * for symlinks (where the permissions are checked later). 
+ * SMP-safe + */ +-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) ++int open_namei_it(const char *pathname, int flag, int mode, ++ struct nameidata *nd, struct lookup_intent *it) + { + int acc_mode, error = 0; + struct inode *inode; +@@ -1024,7 +1152,7 @@ int open_namei(const char * pathname, in + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { +- error = path_lookup(pathname, lookup_flags(flag), nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd, it); + if (error) + return error; + dentry = nd->dentry; +@@ -1034,6 +1162,10 @@ int open_namei(const char * pathname, in + /* + * Create - we need to know the parent. + */ ++ if (it) { ++ it->it_mode = mode; ++ it->it_op |= IT_CREAT; ++ } + error = path_lookup(pathname, LOOKUP_PARENT, nd); + if (error) + return error; +@@ -1049,7 +1181,7 @@ int open_namei(const char * pathname, in + + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + + do_last: + error = PTR_ERR(dentry); +@@ -1058,10 +1190,11 @@ do_last: + goto exit; + } + ++ if (it) it->it_mode = mode; /* it is NULL when entered via open_namei() */ + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { +- error = vfs_create(dir->d_inode, dentry, +- mode & ~current->fs->umask); ++ error = vfs_create_it(dir->d_inode, dentry, ++ mode & ~current->fs->umask, it); + up(&dir->d_inode->i_sem); + dput(nd->dentry); + nd->dentry = dentry; +@@ -1086,7 +1219,7 @@ do_last: + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; +- while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry)); ++ while (__follow_down(&nd->mnt,&dentry,it) && d_mountpoint(dentry)); + } + error = -ENOENT; + if (!dentry->d_inode) +@@ -1165,7 +1298,7 @@ ok: + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +@@ -1177,8 +1310,10 @@ ok: + return 0; + + 
exit_dput: ++ intent_release(it); + dput(dentry); + exit: ++ intent_release(it); + path_release(nd); + return error; + +@@ -1197,7 +1332,16 @@ do_link: + * are done. Procfs-like symlinks just set LAST_BIND. + */ + UPDATE_ATIME(dentry->d_inode); ++ nd->it = it; + error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (error) { ++ intent_release(it); ++ } else if (it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(it); ++ path_release(nd); ++ error = -ENOLINK; ++ } + dput(dentry); + if (error) + return error; +@@ -1219,13 +1363,20 @@ do_link: + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); + goto do_last; + } + ++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) ++{ ++ return open_namei_it(pathname, flag, mode, nd, NULL); ++} ++ ++ + /* SMP-safe */ +-static struct dentry *lookup_create(struct nameidata *nd, int is_dir) ++static struct dentry *lookup_create(struct nameidata *nd, int is_dir, ++ struct lookup_intent *it) + { + struct dentry *dentry; + +@@ -1233,7 +1384,7 @@ static struct dentry *lookup_create(stru + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) +@@ -1289,7 +1440,16 @@ asmlinkage long sys_mknod(const char * f + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = 
lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + + mode &= ~current->fs->umask; +@@ -1310,6 +1470,7 @@ asmlinkage long sys_mknod(const char * f + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1357,7 +1518,14 @@ asmlinkage long sys_mkdir(const char * p + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 1); ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_mkdir(nd.dentry->d_inode, dentry, +@@ -1365,6 +1533,7 @@ asmlinkage long sys_mkdir(const char * p + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1465,8 +1634,16 @@ asmlinkage long sys_rmdir(const char * p + error = -EBUSY; + goto exit1; + } ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); +@@ -1524,8 +1701,15 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = 
lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? Because we want correct error value */ +@@ -1592,15 +1776,23 @@ asmlinkage long sys_symlink(const char * + error = path_lookup(to, LOOKUP_PARENT, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1676,7 +1868,14 @@ asmlinkage long sys_link(const char * ol + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +- new_dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); +@@ -1720,7 +1919,7 @@ exit: + * locking]. 
+ */ + int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + struct inode *target; +@@ -1799,7 +1998,7 @@ out_unlock: + } + + int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry) + { + int error; + +@@ -1887,9 +2086,18 @@ static inline int do_rename(const char * + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ unlock_kernel(); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + double_lock(new_dir, old_dir); + +- old_dentry = lookup_hash(&oldnd.last, old_dir); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; +@@ -1905,16 +2113,16 @@ static inline int do_rename(const char * + if (newnd.last.name[newnd.last.len]) + goto exit4; + } +- new_dentry = lookup_hash(&newnd.last, new_dir); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + ++ + lock_kernel(); + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); + unlock_kernel(); +- + dput(new_dentry); + exit4: + dput(old_dentry); +@@ -1965,20 +2173,28 @@ out: + } + + static inline int +-__vfs_follow_link(struct nameidata *nd, const char *link) ++__vfs_follow_link(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) + { + int res = 0; + char *name; + if (IS_ERR(link)) + goto fail; + ++ if (it == NULL) ++ it = nd->it; ++ else if (it != nd->it) ++ printk("it != nd->it: tell phil@clusterfs.com\n"); ++ if (it != NULL) ++ it->it_int_flags |= IT_FL_FOLLOWED; ++ + if (*link == 
'/') { + path_release(nd); + if (!walk_init_root(link, nd)) + /* weird __emul_prefix() stuff did it */ + goto out; + } +- res = link_path_walk(link, nd); ++ res = link_path_walk_it(link, nd, it); + out: + if (current->link_count || res || nd->last_type!=LAST_NORM) + return res; +@@ -2002,7 +2218,13 @@ fail: + + int vfs_follow_link(struct nameidata *nd, const char *link) + { +- return __vfs_follow_link(nd, link); ++ return __vfs_follow_link(nd, link, NULL); ++} ++ ++int vfs_follow_link_it(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) ++{ ++ return __vfs_follow_link(nd, link, it); + } + + /* get the link contents into pagecache */ +@@ -2044,7 +2266,7 @@ int page_follow_link(struct dentry *dent + { + struct page *page = NULL; + char *s = page_getlink(dentry, &page); +- int res = __vfs_follow_link(nd, s); ++ int res = __vfs_follow_link(nd, s, NULL); + if (page) { + kunmap(page); + page_cache_release(page); +--- kernel-2.4.20-6chaos_18_7/fs/open.c~vfs_intent_2.4.20_chaos 2003-06-24 10:11:51.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/open.c 2003-07-12 15:21:35.000000000 -0600 +@@ -19,6 +19,8 @@ + #include + + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) ++extern int path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it); + + int vfs_statfs(struct super_block *sb, struct statfs *buf) + { +@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct + write_unlock(&files->file_lock); + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + struct inode *inode = dentry->d_inode; ++ struct inode_operations *op = dentry->d_inode->i_op; + int error; + struct iattr newattrs; + +@@ -108,7 +111,13 @@ int do_truncate(struct dentry *dentry, l + down(&inode->i_sem); + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; +- error = notify_change(dentry, &newattrs); ++ if 
(called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ } else ++ error = notify_change(dentry, &newattrs); + up(&inode->i_sem); + return error; + } +@@ -118,12 +127,13 @@ static inline long do_sys_truncate(const + struct nameidata nd; + struct inode * inode; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -163,11 +173,13 @@ static inline long do_sys_truncate(const + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ intent_release(&it); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -215,7 +227,7 @@ static inline long do_sys_ftruncate(unsi + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -260,11 +272,13 @@ asmlinkage long sys_utime(char * filenam + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -279,11 +293,25 @@ asmlinkage long sys_utime(char * filenam + goto dput_and_out; + + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= 
ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!times) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; + } ++ + error = notify_change(nd.dentry, &newattrs); + dput_and_out: + path_release(&nd); +@@ -304,12 +332,14 @@ asmlinkage long sys_utimes(char * filena + struct inode * inode; + struct iattr newattrs; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, NULL); + + if (error) + goto out; + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -324,7 +354,20 @@ asmlinkage long sys_utimes(char * filena + newattrs.ia_atime = times[0].tv_sec; + newattrs.ia_mtime = times[1].tv_sec; + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!utimes) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; +@@ -347,6 +390,7 @@ asmlinkage long sys_access(const char * + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; +@@ -364,13 +408,14 @@ asmlinkage long sys_access(const char * + else + current->cap_effective = current->cap_permitted; + +- res = user_path_walk(filename, &nd); ++ res = user_path_walk_it(filename, &nd, &it); + if (!res) { + res = permission(nd.dentry->d_inode, mode); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ intent_release(&it); + path_release(&nd); + } + +@@ -385,8 +430,9 @@ asmlinkage long sys_chdir(const char * f + { + int error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd); ++ error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it); + if (error) + goto out; + +@@ -397,6 +443,7 @@ asmlinkage long sys_chdir(const char * f + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -436,9 +483,10 @@ asmlinkage long sys_chroot(const char * + { + int error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | +- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); + if (error) + goto out; + +@@ -454,6 +502,7 @@ asmlinkage long sys_chroot(const char * + set_fs_altroot(); + error = 0; + dput_and_out: ++ intent_release(&it); + path_release(&nd); + out: + return error; +@@ -508,6 +557,18 @@ asmlinkage long sys_chmod(const char * f + if (IS_RDONLY(inode)) + goto dput_and_out; + ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = 
op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto dput_and_out; +@@ -538,6 +599,20 @@ static int chown_common(struct dentry * + error = -EROFS; + if (IS_RDONLY(inode)) + goto out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; +@@ -642,8 +717,9 @@ struct file *filp_open(const char * file + { + int namei_flags, error; + struct nameidata nd; +- +- flags &= ~O_DIRECT; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = flags }; ++ ++ //flags &= ~O_DIRECT; + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -651,14 +727,15 @@ struct file *filp_open(const char * file + if (namei_flags & O_TRUNC) + namei_flags |= 2; + +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ error = open_namei_it(filename, namei_flags, mode, &nd, &it); ++ if (error) ++ return ERR_PTR(error); + +- return ERR_PTR(error); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); + } + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -695,12 +772,15 @@ struct file *dentry_open(struct dentry * + } + + if (f->f_op && f->f_op->open) { ++ f->f_it = it; + error = f->f_op->open(inode,f); ++ f->f_it = NULL; + if (error) + 
goto cleanup_all; + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + ++ intent_release(it); + return f; + + cleanup_all: +@@ -715,11 +795,17 @@ cleanup_all: + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ return dentry_open_it(dentry, mnt, flags, NULL); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. + */ +--- kernel-2.4.20-6chaos_18_7/fs/stat.c~vfs_intent_2.4.20_chaos 2003-05-15 21:14:25.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/stat.c 2003-07-12 15:29:57.000000000 -0600 +@@ -17,21 +17,23 @@ + * Revalidate the inode. This is required for proper NFS attribute caching. + */ + static __inline__ int +-do_revalidate(struct dentry *dentry) ++do_revalidate(struct dentry *dentry, struct lookup_intent *it) + { + struct inode * inode = dentry->d_inode; +- if (inode->i_op && inode->i_op->revalidate) ++ if (inode->i_op && inode->i_op->revalidate_it) ++ return inode->i_op->revalidate_it(dentry, it); ++ else if (inode->i_op && inode->i_op->revalidate) + return inode->i_op->revalidate(dentry); + return 0; + } + +-static int do_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++static int do_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat, struct lookup_intent *it) + { + int res = 0; + unsigned int blocks, indirect; + struct inode *inode = dentry->d_inode; + +- res = do_revalidate(dentry); ++ res = do_revalidate(dentry, it); + if (res) + return res; + +@@ -104,10 +106,12 @@ int vfs_stat(char *name, struct kstat *s + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd, &it); + if (!error) { +- error = do_getattr(nd.mnt, nd.dentry, stat); ++ error = do_getattr(nd.mnt, nd.dentry, stat, &it); ++ 
intent_release(&it); + path_release(&nd); + } + return error; +@@ -117,10 +121,12 @@ int vfs_lstat(char *name, struct kstat * + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd, &it); + if (!error) { +- error = do_getattr(nd.mnt, nd.dentry, stat); ++ error = do_getattr(nd.mnt, nd.dentry, stat, &it); ++ intent_release(&it); + path_release(&nd); + } + return error; +@@ -132,7 +138,7 @@ int vfs_fstat(unsigned int fd, struct ks + int error = -EBADF; + + if (f) { +- error = do_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ error = do_getattr(f->f_vfsmnt, f->f_dentry, stat, NULL); + fput(f); + } + return error; +@@ -279,7 +285,7 @@ asmlinkage long sys_readlink(const char + + error = -EINVAL; + if (inode->i_op && inode->i_op->readlink && +- !(error = do_revalidate(nd.dentry))) { ++ !(error = do_revalidate(nd.dentry, NULL))) { + UPDATE_ATIME(inode); + error = inode->i_op->readlink(nd.dentry, buf, bufsiz); + } +--- kernel-2.4.20-6chaos_18_7/fs/proc/base.c~vfs_intent_2.4.20_chaos 2003-06-23 06:49:00.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/fs/proc/base.c 2003-07-12 15:14:02.000000000 -0600 +@@ -465,6 +465,9 @@ static int proc_pid_follow_link(struct d + + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; ++ ++ if (nd->it != NULL) ++ nd->it->it_int_flags |= IT_FL_FOLLOWED; + out: + return error; + } +--- kernel-2.4.20-6chaos_18_7/include/linux/dcache.h~vfs_intent_2.4.20_chaos 2003-06-24 11:31:16.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/dcache.h 2003-07-12 15:14:02.000000000 -0600 +@@ -7,6 +7,44 @@ + #include + #include + ++#define IT_OPEN 0x0001 ++#define IT_CREAT 0x0002 ++#define IT_READDIR 0x0004 ++#define IT_GETATTR 0x0008 ++#define IT_LOOKUP 0x0010 ++#define IT_UNLINK 0x0020 ++#define IT_GETXATTR 0x0040 ++#define IT_EXEC 0x0080 ++#define IT_PIN 0x0100 ++ 
++#define IT_FL_LOCKED 0x0001 ++#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ ++ ++#define INTENT_MAGIC 0x19620323 ++ ++struct lookup_intent { ++ int it_op; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_magic; ++ int it_mode; ++ int it_flags; ++ int it_disposition; ++ int it_status; ++ int it_int_flags; ++ __u64 it_lock_handle[2]; ++ int it_lock_mode; ++ void *it_data; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op, int flags) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++ it->it_flags = flags; ++} ++ ++ + /* + * linux/include/linux/dcache.h + * +@@ -96,8 +134,22 @@ struct dentry_operations { + int (*d_delete)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); ++ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); ++ void (*d_pin)(struct dentry *, struct vfsmount * , int); ++ void (*d_unpin)(struct dentry *, struct vfsmount *, int); + }; + ++#define PIN(de,mnt,flag) if (de->d_op && de->d_op->d_pin) \ ++ de->d_op->d_pin(de, mnt, flag); ++#define UNPIN(de,mnt,flag) if (de->d_op && de->d_op->d_unpin) \ ++ de->d_op->d_unpin(de, mnt, flag); ++ ++ ++/* defined in fs/namei.c */ ++extern void intent_release(struct lookup_intent *it); ++/* defined in fs/dcache.c */ ++extern void __d_rehash(struct dentry * entry, int lock); ++ + /* the dentry parameter passed to d_hash and d_compare is the parent + * directory of the entries to be compared. It is used in case these + * functions need any directory specific information for determining +@@ -129,6 +181,7 @@ d_iput: no no yes + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +--- kernel-2.4.20-6chaos_18_7/include/linux/fs.h~vfs_intent_2.4.20_chaos 2003-07-12 15:12:12.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/fs.h 2003-07-12 15:14:02.000000000 -0600 +@@ -337,6 +337,9 @@ extern void set_bh_page(struct buffer_he + #define ATTR_MTIME_SET 256 + #define ATTR_FORCE 512 /* Not a change, but a change it */ + #define ATTR_ATTR_FLAG 1024 ++#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ ++#define ATTR_CTIME_SET 0x2000 + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -574,6 +577,7 @@ struct file { + + /* needed for tty driver, and maybe others */ + void *private_data; ++ struct lookup_intent *f_it; + + /* preallocated helper kiobuf to speedup O_DIRECT */ + struct kiobuf *f_iobuf; +@@ -701,6 +705,7 @@ struct nameidata { + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *it; + }; + + /* +@@ -821,7 +826,8 @@ extern int vfs_symlink(struct inode *, s + extern int vfs_link(struct dentry *, struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); +-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); ++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry); + + /* + * File types +@@ -881,21 +887,32 @@ struct file_operations { + + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); ++ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); + int (*link) (struct dentry 
*,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod_raw) (struct nameidata *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*revalidate) (struct dentry *); ++ int (*revalidate_it) (struct dentry *, struct lookup_intent *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); + int (*setxattr) (struct dentry *, const char *, void *, size_t, int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); +@@ -1093,10 +1110,14 @@ static inline int get_lease(struct inode + + asmlinkage long sys_open(const char *, int, int); + asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern int open_namei_it(const char *filename, int namei_flags, int mode, ++ struct nameidata *nd, struct lookup_intent *it); ++extern struct file *dentry_open_it(struct 
dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + +@@ -1387,6 +1408,7 @@ typedef int (*read_actor_t)(read_descrip + extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + + extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); + extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); +@@ -1398,6 +1420,8 @@ extern struct dentry * lookup_one_len(co + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) ++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) ++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) + + extern void inode_init_once(struct inode *); + extern void iput(struct inode *); +@@ -1497,6 +1521,8 @@ extern struct file_operations generic_ro + + extern int vfs_readlink(struct dentry *, char *, int, const char *); + extern int vfs_follow_link(struct nameidata *, const char *); ++extern int vfs_follow_link_it(struct nameidata *, const char *, ++ struct lookup_intent *it); + extern int page_readlink(struct dentry *, char *, int); + extern int page_follow_link(struct dentry *, struct nameidata *); + extern struct inode_operations page_symlink_inode_operations; +--- kernel-2.4.20-6chaos_18_7/include/linux/fs_struct.h~vfs_intent_2.4.20_chaos 2003-06-24 11:31:16.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/include/linux/fs_struct.h 2003-07-12 
15:14:02.000000000 -0600 +@@ -37,10 +37,12 @@ static inline void set_fs_root(struct fs + write_lock(&fs->lock); + old_root = fs->root; + old_rootmnt = fs->rootmnt; ++ PIN(dentry, mnt, 1); + fs->rootmnt = mntget(mnt); + fs->root = dget(dentry); + write_unlock(&fs->lock); + if (old_root) { ++ UNPIN(old_root, old_rootmnt, 1); + dput(old_root); + mntput(old_rootmnt); + } +@@ -60,10 +62,12 @@ static inline void set_fs_pwd(struct fs_ + write_lock(&fs->lock); + old_pwd = fs->pwd; + old_pwdmnt = fs->pwdmnt; ++ PIN(dentry, mnt, 0); + fs->pwdmnt = mntget(mnt); + fs->pwd = dget(dentry); + write_unlock(&fs->lock); + if (old_pwd) { ++ UNPIN(old_pwd, old_pwdmnt, 0); + dput(old_pwd); + mntput(old_pwdmnt); + } +--- kernel-2.4.20-6chaos_18_7/kernel/ksyms.c~vfs_intent_2.4.20_chaos 2003-07-12 15:12:25.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/kernel/ksyms.c 2003-07-12 15:14:02.000000000 -0600 +@@ -299,6 +299,7 @@ EXPORT_SYMBOL(read_cache_page); + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); +--- kernel-2.4.20-6chaos_18_7/kernel/fork.c~vfs_intent_2.4.20_chaos 2003-06-19 11:06:09.000000000 -0600 ++++ kernel-2.4.20-6chaos_18_7-braam/kernel/fork.c 2003-07-12 15:14:02.000000000 -0600 +@@ -385,10 +385,13 @@ static inline struct fs_struct *__copy_f + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); ++ PIN(old->pwd, old->pwdmnt, 0); ++ PIN(old->root, old->rootmnt, 1); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { ++ PIN(old->altroot, old->altrootmnt, 1); + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { +--- kernel-2.4.20-6chaos_18_7/kernel/exit.c~vfs_intent_2.4.20_chaos 2003-06-19 11:06:09.000000000 -0600 ++++ 
kernel-2.4.20-6chaos_18_7-braam/kernel/exit.c 2003-07-12 15:14:02.000000000 -0600 +@@ -241,11 +241,14 @@ static inline void __put_fs_struct(struc + { + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { ++ UNPIN(fs->pwd, fs->pwdmnt, 0); ++ UNPIN(fs->root, fs->rootmnt, 1); + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { ++ UNPIN(fs->altroot, fs->altrootmnt, 1); + dput(fs->altroot); + mntput(fs->altrootmnt); + } + +_ diff --git a/lustre/kernel_patches/pc/ext3_delete_thread_2.4.20_chaos.pc b/lustre/kernel_patches/pc/ext3_delete_thread_2.4.20_chaos.pc new file mode 100644 index 0000000..a2c3109 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3_delete_thread_2.4.20_chaos.pc @@ -0,0 +1,5 @@ +fs/ext3/super.c +fs/ext3/inode.c +fs/ext3/file.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/invalidate_show_2.4.20_chaos.pc b/lustre/kernel_patches/pc/invalidate_show_2.4.20_chaos.pc new file mode 100644 index 0000000..1d4ed77 --- /dev/null +++ b/lustre/kernel_patches/pc/invalidate_show_2.4.20_chaos.pc @@ -0,0 +1,4 @@ +fs/inode.c +fs/super.c +include/linux/fs.h +fs/smbfs/inode.c diff --git a/lustre/kernel_patches/pc/tcp_zero_copy_2.4.20_chaos.pc b/lustre/kernel_patches/pc/tcp_zero_copy_2.4.20_chaos.pc new file mode 100644 index 0000000..02877c0 --- /dev/null +++ b/lustre/kernel_patches/pc/tcp_zero_copy_2.4.20_chaos.pc @@ -0,0 +1,5 @@ +include/linux/skbuff.h +include/net/tcp.h +net/netsyms.c +net/core/skbuff.c +net/ipv4/tcp.c diff --git a/lustre/kernel_patches/pc/vfs_intent_2.4.20_chaos.pc b/lustre/kernel_patches/pc/vfs_intent_2.4.20_chaos.pc new file mode 100644 index 0000000..f3375a3 --- /dev/null +++ b/lustre/kernel_patches/pc/vfs_intent_2.4.20_chaos.pc @@ -0,0 +1,14 @@ +fs/exec.c +fs/dcache.c +fs/namespace.c +fs/namei.c +fs/nfsd/vfs.c +fs/open.c +fs/stat.c +fs/proc/base.c +include/linux/dcache.h +include/linux/fs.h 
+include/linux/fs_struct.h +kernel/ksyms.c +kernel/fork.c +kernel/exit.c diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c new file mode 100644 index 0000000..d2126db --- /dev/null +++ b/lustre/llite/llite_lib.c @@ -0,0 +1,938 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +kmem_cache_t *ll_file_data_slab; + +extern struct address_space_operations ll_aops; +extern struct address_space_operations ll_dir_aops; +extern struct super_operations ll_super_operations; + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +char *ll_read_opt(const char *opt, char *data) +{ + char *value; + char *retval; + ENTRY; + + CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data); + if (strncmp(opt, data, strlen(opt))) + RETURN(NULL); + if ((value = strchr(data, '=')) == NULL) + RETURN(NULL); + + value++; + OBD_ALLOC(retval, strlen(value) + 1); + if (!retval) { + CERROR("out of memory!\n"); + RETURN(NULL); + } + + memcpy(retval, value, strlen(value)+1); + CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval); + RETURN(retval); +} + +int ll_set_opt(const char *opt, char *data, int fl) +{ + ENTRY; + + CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data); + if (strncmp(opt, data, strlen(opt))) + RETURN(0); + else + RETURN(fl); +} + +void ll_options(char *options, char **ost, char **mds, int *flags) +{ + char *this_char; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + char *opt_ptr = options; +#endif + ENTRY; + + if (!options) { + EXIT; + return; + } + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + for (this_char = strtok (options, ","); + this_char != NULL; + this_char = strtok (NULL, ",")) { +#else + while ((this_char = strsep (&opt_ptr, ",")) != NULL) { +#endif + CDEBUG(D_SUPER, "this_char %s\n", this_char); + if ((!*ost && (*ost = ll_read_opt("osc", this_char)))|| + (!*mds && (*mds = ll_read_opt("mdc", this_char)))|| + (!(*flags & LL_SBI_NOLCK) && + ((*flags) = (*flags) | + ll_set_opt("nolock", this_char, LL_SBI_NOLCK)))) + continue; + } + EXIT; +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + sema_init(&lli->lli_open_sem, 1); + spin_lock_init(&lli->lli_read_extent_lock); 
+ INIT_LIST_HEAD(&lli->lli_read_extents); + lli->lli_flags = 0; + lli->lli_maxbytes = PAGE_CACHE_MAXBYTES; +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + ll_lldo_init(&lli->lli_dirty); + spin_lock_init(&lli->lli_pg_lock); + INIT_LIST_HEAD(&lli->lli_lc_item); + plist_init(&lli->lli_pl_read); + plist_init(&lli->lli_pl_write); + atomic_set(&lli->lli_in_writepages, 0); +#endif +} +/* Set up the superblock for a mount: connect to the MDC and OSC named in the mount data, fetch the root inode from the MDS and install it as sb->s_root. Returns 0 on success or the error of the step that failed. */ +int ll_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *root = NULL; + struct obd_device *obd; + struct ll_sb_info *sbi; + struct obd_export *mdc_export; + char *osc = NULL; + char *mdc = NULL; + int err; + struct ll_fid rootfid; + struct obd_statfs osfs; + struct ptlrpc_request *request = NULL; + struct ptlrpc_connection *mdc_conn; + struct lustre_md md; + class_uuid_t uuid; + + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + OBD_ALLOC(sbi, sizeof(*sbi)); + if (!sbi) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&sbi->ll_conn_chain); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); + sb->u.generic_sbp = sbi; +#else + INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); + spin_lock_init(&sbi->ll_iostats.fis_lock); + ll_s2sbi(sb) = sbi; +#endif + generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + + ll_options(data, &osc, &mdc, &sbi->ll_flags); + + if (!osc) { + CERROR("no osc\n"); + GOTO(out_free, err = -EINVAL); + } + + if (!mdc) { + CERROR("no mdc\n"); + GOTO(out_free, err = -EINVAL); + } + + obd = class_name2obd(mdc); + if (!obd) { + CERROR("MDC %s: not setup or attached\n", mdc); + GOTO(out_free, err = -EINVAL); + } + + err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid); + if (err) { + CERROR("cannot connect to %s: rc = %d\n", mdc, err); + GOTO(out_free, err); + } + + mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection; + + obd = class_name2obd(osc); + if (!obd) { + CERROR("OSC %s: not setup or attached\n", osc); + GOTO(out_mdc, err = -EINVAL); /* err is still 0 here */ + } + + err = obd_connect(&sbi->ll_osc_conn, obd, 
&sbi->ll_sb_uuid); + if (err) { + CERROR("cannot connect to %s: rc = %d\n", osc, err); + GOTO(out_mdc, err); + } + + err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_osc, err); + } + CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id); + sbi->ll_rootino = rootfid.id; + + memset(&osfs, 0, sizeof(osfs)); + mdc_export = class_conn2export(&sbi->ll_mdc_conn); + if (mdc_export == NULL) { + CERROR("null mdc_export\n"); + GOTO(out_osc, err = -EINVAL); /* err is still 0 here */ + } + err = obd_statfs(mdc_export, &osfs); + class_export_put(mdc_export); + sb->s_blocksize = osfs.os_bsize; + sb->s_blocksize_bits = log2(osfs.os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = PAGE_CACHE_MAXBYTES; + + sb->s_op = &ll_super_operations; + + /* make root inode + * XXX: move this to after cbd setup? */ + err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid, + OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request); + if (err) { + CERROR("mdc_getattr failed for root: rc = %d\n", err); + GOTO(out_osc, err); + } + + /* initialize committed transaction callback daemon */ + spin_lock_init(&sbi->ll_commitcbd_lock); + init_waitqueue_head(&sbi->ll_commitcbd_waitq); + init_waitqueue_head(&sbi->ll_commitcbd_ctl_waitq); + sbi->ll_commitcbd_flags = 0; + err = ll_commitcbd_setup(sbi); + if (err) { + CERROR("failed to start commit callback daemon: rc = %d\n",err); + ptlrpc_req_finished (request); + GOTO(out_lliod, err); + } + + err = mdc_req2lustre_md(request, 0, &sbi->ll_osc_conn, &md); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n",err); + ptlrpc_req_finished (request); + GOTO(out_cbd, err); /* commitcbd is running by now */ + } + + LASSERT(sbi->ll_rootino != 0); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + root = iget4(sb, sbi->ll_rootino, NULL, &md); +#else + root = ll_iget(sb, sbi->ll_rootino, &md); +#endif + + ptlrpc_req_finished(request); + + if (root == NULL || is_bad_inode(root)) { + /* XXX might need iput() for bad inode */ + CERROR("lustre_lite: bad iget4 
for root\n"); + GOTO(out_cbd, err = -EBADF); + } + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + /* initialize the pagecache writeback thread */ + err = lliod_start(sbi, root); + if (err) { + CERROR("failed to start lliod: rc = %d\n",err); + GOTO(out_root, err); + } +#endif + sb->s_root = d_alloc_root(root); + + if (proc_lustre_fs_root) { + err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, + osc, mdc); + if (err < 0) + CERROR("could not register mount in /proc/lustre\n"); + } + +out_dev: + if (mdc) + OBD_FREE(mdc, strlen(mdc) + 1); + if (osc) + OBD_FREE(osc, strlen(osc) + 1); + + RETURN(err); + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +out_root: + iput(root); +#endif +out_cbd: + ll_commitcbd_cleanup(sbi); +out_lliod: +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + lliod_stop(sbi); +#endif +out_osc: + obd_disconnect(&sbi->ll_osc_conn, 0); +out_mdc: + obd_disconnect(&sbi->ll_mdc_conn, 0); +out_free: + lprocfs_unregister_mountpoint(sbi); + OBD_FREE(sbi, sizeof(*sbi)); + + goto out_dev; +} /* ll_fill_super */ + +void ll_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn); + struct list_head *tmp, *next; +#else + struct hlist_node *tmp, *next; +#endif + struct ll_fid rootfid; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + list_del(&sbi->ll_conn_chain); + ll_commitcbd_cleanup(sbi); +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + lliod_stop(sbi); +#endif + obd_disconnect(&sbi->ll_osc_conn, 0); + + /* NULL request to force sync on the MDS, and get the last_committed + * value to flush remaining RPCs from the sending queue on client. + * + * XXX This should be an mdc_sync() call to sync the whole MDS fs, + * which we can call for other reasons as well. 
+ */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + if (!obd->obd_no_recov) +#endif + mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); + + lprocfs_unregister_mountpoint(sbi); + if (sbi->ll_proc_root) { + lprocfs_remove(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; + } + + obd_disconnect(&sbi->ll_mdc_conn, 0); + + spin_lock(&dcache_lock); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + list_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) { + struct dentry *dentry = list_entry(tmp, struct dentry, d_hash); + shrink_dcache_parent(dentry); + } +#else + hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) { + struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash); + shrink_dcache_parent(dentry); + } +#endif + spin_unlock(&dcache_lock); + + OBD_FREE(sbi, sizeof(*sbi)); + + EXIT; +} /* ll_put_super */ + +void ll_clear_inode(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + rc = ll_mdc_cancel_unused(&sbi->ll_mdc_conn, inode, + LDLM_FL_WARN | LDLM_FL_NO_CALLBACK, inode); + if (rc < 0) { + CERROR("ll_mdc_cancel_unused: %d\n", rc); + /* XXX FIXME do something dramatic */ + } + + if (atomic_read(&inode->i_count) != 0) + CERROR("clearing in-use inode %lu: count = %d\n", + inode->i_ino, atomic_read(&inode->i_count)); + + if (lli->lli_smd) { + rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, + LDLM_FL_WARN, inode); + if (rc < 0) { + CERROR("obd_cancel_unused: %d\n", rc); + /* XXX FIXME do something dramatic */ + } + obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + lli->lli_smd = NULL; + } + + if (lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + EXIT; +} + +#if 0 +static void ll_delete_inode(struct inode *inode) +{ + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu(%p)\n", 
inode->i_ino, inode); + if (S_ISREG(inode->i_mode)) { + int err; + struct obdo *oa; + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + + /* mcreate with no open */ + if (!lsm) + GOTO(out, 0); + + if (lsm->lsm_object_id == 0) { + CERROR("This really happens\n"); + /* No obdo was ever created */ + GOTO(out, 0); + } + + oa = obdo_alloc(); + if (oa == NULL) + GOTO(out, -ENOMEM); + + oa->o_id = lsm->lsm_object_id; + oa->o_valid = OBD_MD_FLID; + obdo_from_inode(oa, inode, OBD_MD_FLTYPE); + + err = obd_destroy(ll_i2obdconn(inode), oa, lsm, NULL); + obdo_free(oa); + if (err) + CDEBUG(D_INODE, + "inode %lu obd_destroy objid "LPX64" error %d\n", + inode->i_ino, lsm->lsm_object_id, err); + } +out: + clear_inode(inode); + EXIT; +} +#endif + +/* like inode_setattr, but doesn't mark the inode dirty */ +int ll_attr2inode(struct inode *inode, struct iattr *attr, int trunc) +{ + unsigned int ia_valid = attr->ia_valid; + int error = 0; + + if ((ia_valid & ATTR_SIZE) && trunc) { + if (attr->ia_size > ll_file_maxbytes(inode)) { + error = -EFBIG; + goto out; + } + error = vmtruncate(inode, attr->ia_size); + if (error) + goto out; + } else if (ia_valid & ATTR_SIZE) + inode->i_size = attr->ia_size; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + inode->i_mode = attr->ia_mode; + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + inode->i_mode &= ~S_ISGID; + } +out: + return error; +} + +int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int err = 0; + ENTRY; + + /* change incore inode */ + err = ll_attr2inode(inode, attr, do_trunc); + if (err) + RETURN(err); + + /* Don't 
send size changes to MDS to avoid "fast EA" problems, and + * also avoid a pointless RPC (we get file size from OST anyways). + */ + attr->ia_valid &= ~ATTR_SIZE; + if (attr->ia_valid) { + struct mdc_op_data op_data; + + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, NULL, 0, &request); + if (err) + CERROR("mdc_setattr fails: err = %d\n", err); + + ptlrpc_req_finished(request); + if (S_ISREG(inode->i_mode) && ll_i2info(inode)->lli_smd != NULL && (attr->ia_valid & ATTR_MTIME_SET)) { + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; /* non-NULL, checked above */ + struct obdo oa; + int err2; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", + inode->i_ino, attr->ia_mtime); + oa.o_mtime = attr->ia_mtime; +#else + CDEBUG(D_INODE, "set mtime on OST inode %lu to " + LPU64"\n", inode->i_ino, + ll_ts2u64(&attr->ia_mtime)); + oa.o_mtime = ll_ts2u64(&attr->ia_mtime); +#endif + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME; + err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (err2) { + CERROR("obd_setattr fails: rc=%d\n", err2); + if (!err) + err = err2; + } + } + } + + RETURN(err); +} + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't do the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. 
+ */ +#define OST_ATTR (ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME | \ + ATTR_ATIME | ATTR_ATIME_SET | ATTR_SIZE) +int ll_setattr_raw(struct inode *inode, struct iattr *attr) +{ + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; + time_t now = LTIME_S(CURRENT_TIME); + int ia_valid = attr->ia_valid; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_SETATTR); +#endif + + if ((ia_valid & ATTR_SIZE) && attr->ia_size > ll_file_maxbytes(inode)){ + CDEBUG(D_INODE, "file too large %llu > "LPU64"\n", + attr->ia_size, ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + /* We mark all of the fields "set" so the MDS does not re-set them */ + if (ia_valid & ATTR_CTIME) { + attr->ia_ctime = now; + attr->ia_valid |= ATTR_CTIME_SET; + } + if (!(ia_valid & ATTR_ATIME_SET) && (ia_valid & ATTR_ATIME)) { + attr->ia_atime = now; + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(ia_valid & ATTR_MTIME_SET) && (ia_valid & ATTR_MTIME)) { + attr->ia_mtime = now; + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n", + attr->ia_mtime, attr->ia_ctime, now); + if (lsm) + attr->ia_valid &= ~ATTR_SIZE; + + /* If only OST attributes being set on objects, don't do MDS RPC. + * In that case, we need to check permissions and update the local + * inode ourselves so we can call obdo_from_inode() always. */ + if (ia_valid & (lsm ? 
~(OST_ATTR | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) { + struct lustre_md md; + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + rc = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, NULL, 0, &request); + + if (rc) { + ptlrpc_req_finished(request); + if (rc != -EPERM) + CERROR("mdc_setattr fails: err = %d\n", rc); + RETURN(rc); + } + + rc = mdc_req2lustre_md(request, 0, &sbi->ll_osc_conn, &md); + if (rc) { /* md is not valid on any failure */ + ptlrpc_req_finished(request); + CERROR("mdc_req2lustre_md fails: err = %d\n", rc); + RETURN(rc); + } + ll_update_inode(inode, md.body, md.lsm); + ptlrpc_req_finished(request); + + if (!md.lsm || !S_ISREG(inode->i_mode)) { + CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n"); + RETURN(0); + } + } else { + /* The OST doesn't check permissions, but the alternative is + * a gratuitous RPC to the MDS. We already rely on the client + * to do read/write/truncate permission checks, so is mtime OK? + */ + if (ia_valid & (ATTR_MTIME | ATTR_ATIME)) { + /* from sys_utime() */ + if (!(ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))) { + if (current->fsuid != inode->i_uid && + (rc = permission(inode, MAY_WRITE)) != 0) + RETURN(rc); + } + + if ((rc = inode_change_ok(inode, attr))) + RETURN(rc); + } + + /* Won't invoke vmtruncate, as we already cleared ATTR_SIZE */ + inode_setattr(inode, attr); + } + + if (ia_valid & ATTR_SIZE) { + struct ldlm_extent extent = { .start = attr->ia_size, + .end = OBD_OBJECT_EOF }; + struct lustre_handle lockh = { 0 }; + int err; + + /* Writeback uses inode->i_size to determine how far out + * its cached pages go. ll_truncate gets a PW lock, canceling + * our lock, _after_ it has updated i_size. this can confuse + * + * We really need to get our PW lock before we change + * inode->i_size. If we don't we can race with other + * i_size updaters on our node, like ll_file_read. We + * can also race with i_size propagation to other + * nodes through dirtying and writeback of final cached + * pages. 
This last one is especially bad for racing + * o_append users on other nodes. */ + rc = ll_extent_lock_no_validate(NULL, inode, lsm, LCK_PW, + &extent, &lockh); + if (rc != ELDLM_OK) { + if (rc > 0) + RETURN(-ENOLCK); + RETURN(rc); + } + + rc = vmtruncate(inode, attr->ia_size); + if (rc == 0) + set_bit(LLI_F_HAVE_SIZE_LOCK, + &ll_i2info(inode)->lli_flags); + + /* unlock now as we don't mind others file lockers racing with + * the mds updates below? */ + err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); + if (err) { + CERROR("ll_extent_unlock failed: %d\n", err); + if (!rc) + rc = err; + } + } else if (ia_valid & ATTR_MTIME_SET) { + struct obdo oa; + + CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", + inode->i_ino, attr->ia_mtime); + oa.o_id = lsm->lsm_object_id; + oa.o_valid = OBD_MD_FLID; + obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + rc = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (rc) + CERROR("obd_setattr fails: rc=%d\n", rc); + } + RETURN(rc); +} + +int ll_setattr(struct dentry *de, struct iattr *attr) +{ + int rc = inode_change_ok(de->d_inode, attr); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s\n", de->d_name.name); + if (rc) + return rc; + + lprocfs_counter_incr(ll_i2sbi(de->d_inode)->ll_stats, LPROC_LL_SETATTR); + return ll_inode_setattr(de->d_inode, attr, 1); +} + +int ll_statfs(struct super_block *sb, struct kstatfs *sfs) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_export *mdc_exp = class_conn2export(&sbi->ll_mdc_conn); + struct obd_export *osc_exp; + struct obd_statfs osfs; + int rc; + ENTRY; + + if (mdc_exp == NULL) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:\n"); + lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_STAFS); + memset(sfs, 0, sizeof(*sfs)); + rc = obd_statfs(mdc_exp, &osfs); + statfs_unpack(sfs, &osfs); + if (rc) + CERROR("mdc_statfs fails: rc = %d\n", rc); + else + CDEBUG(D_SUPER, "mdc_statfs shows blocks "LPU64"/"LPU64 + " objects "LPU64"/"LPU64"\n", + 
osfs.os_bavail, osfs.os_blocks, + osfs.os_ffree, osfs.os_files); + + /* temporary until mds_statfs returns statfs info for all OSTs */ + if (!rc) { + osc_exp = class_conn2export(&sbi->ll_osc_conn); + if (osc_exp == NULL) + GOTO(out, rc = -EINVAL); + rc = obd_statfs(osc_exp, &osfs); + class_export_put(osc_exp); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + GOTO(out, rc); + } + CDEBUG(D_SUPER, "obd_statfs shows blocks "LPU64"/"LPU64 + " objects "LPU64"/"LPU64"\n", + osfs.os_bavail, osfs.os_blocks, + osfs.os_ffree, osfs.os_files); + + while (osfs.os_blocks > ~0UL) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. + */ + if (osfs.os_ffree < (__u64)sfs->f_ffree) { + sfs->f_files = (sfs->f_files - sfs->f_ffree) + + osfs.os_ffree; + sfs->f_ffree = osfs.os_ffree; + } + } + +out: + class_export_put(mdc_exp); + RETURN(rc); +} + +void dump_lsm(int level, struct lov_stripe_md *lsm) +{ + CDEBUG(level, "objid "LPX64", maxbytes "LPX64", magic %#08x, " + "stripe_size %#08x, offset %u, stripe_count %u\n", + lsm->lsm_object_id, lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_offset, + lsm->lsm_stripe_count); +} + +void ll_update_inode(struct inode *inode, struct mds_body *body, + struct lov_stripe_md *lsm) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if (lli->lli_smd == NULL) { + lli->lli_smd = lsm; + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES) + lli->lli_maxbytes = PAGE_CACHE_MAXBYTES; + } else { + if (memcmp(lli->lli_smd, lsm, sizeof(*lsm))) { + CERROR("lsm mismatch for 
inode %ld\n", + inode->i_ino); + CERROR("lli_smd:\n"); + dump_lsm(D_ERROR, lli->lli_smd); + CERROR("lsm:\n"); + dump_lsm(D_ERROR, lsm); + LBUG(); + } + } + } + + if (body->valid & OBD_MD_FLID) + inode->i_ino = body->ino; + if (body->valid & OBD_MD_FLATIME) + LTIME_S(inode->i_atime) = body->atime; + if (body->valid & OBD_MD_FLMTIME) + LTIME_S(inode->i_mtime) = body->mtime; + if (body->valid & OBD_MD_FLCTIME) + LTIME_S(inode->i_ctime) = body->ctime; + if (body->valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); + if (body->valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT); + if (body->valid & OBD_MD_FLUID) + inode->i_uid = body->uid; + if (body->valid & OBD_MD_FLGID) + inode->i_gid = body->gid; + if (body->valid & OBD_MD_FLFLAGS) + inode->i_flags = body->flags; + if (body->valid & OBD_MD_FLNLINK) + inode->i_nlink = body->nlink; + if (body->valid & OBD_MD_FLGENER) + inode->i_generation = body->generation; + if (body->valid & OBD_MD_FLRDEV) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + inode->i_rdev = body->rdev; +#else + inode->i_rdev = to_kdev_t(body->rdev); +#endif + if (body->valid & OBD_MD_FLSIZE) + inode->i_size = body->size; + if (body->valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->blocks; +} + +void ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + + ll_lli_init(lli); + + LASSERT(!lli->lli_smd); + + /* core attributes from the MDS first */ + ll_update_inode(inode, md->body, md->lsm); + + /* OIDEBUG(inode); */ + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ll_file_inode_operations; + inode->i_fop = &ll_file_operations; + inode->i_mapping->a_ops = &ll_aops; + EXIT; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + 
inode->i_mapping->a_ops = &ll_dir_aops; + EXIT; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + EXIT; + } else { + inode->i_op = &ll_special_inode_operations; +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + init_special_inode(inode, inode->i_mode, + kdev_t_to_nr(inode->i_rdev)); +#else + init_special_inode(inode, inode->i_mode, inode->i_rdev); +#endif + EXIT; + } +} + +int it_disposition(struct lookup_intent *it, int flag) +{ + return it->it_disposition & flag; +} + +void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->it_disposition |= flag; +} + +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data ioc_data = { 0 }; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:\n"); + + obd = class_conn2obd(&sbi->ll_mdc_conn); + obd->obd_no_recov = 1; + obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_mdc_conn, sizeof ioc_data, + &ioc_data, NULL); + + obd = class_conn2obd(&sbi->ll_osc_conn); + obd->obd_no_recov = 1; + obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_osc_conn, sizeof ioc_data, + &ioc_data, NULL); + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just invalidate the requests, + * schedule, and hope. + */ + schedule(); + + EXIT; +} + + diff --git a/lustre/mdc/Makefile.mk b/lustre/mdc/Makefile.mk new file mode 100644 index 0000000..b12e5fc --- /dev/null +++ b/lustre/mdc/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2003 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../portals/Kernelenv + +obj-y += mdc.o +mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o diff --git a/lustre/ost/Makefile.mk b/lustre/ost/Makefile.mk new file mode 100644 index 0000000..08c7dae --- /dev/null +++ b/lustre/ost/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2003 Cluster File Systems, Inc. 
+# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../portals/Kernelenv + +obj-y += ost.o +ost-objs := ost_handler.o lproc_ost.o diff --git a/lustre/portals/tests/Makefile.mk b/lustre/portals/tests/Makefile.mk new file mode 100644 index 0000000..751c0a0 --- /dev/null +++ b/lustre/portals/tests/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../Kernelenv + +obj-y += ping_cli.o +obj-y += ping_srv.o -- 1.8.3.1