From f6ff1b923d5790d46fffdd05eb14d1ebad4216e9 Mon Sep 17 00:00:00 2001 From: pschwan Date: Fri, 31 Jan 2003 21:37:51 +0000 Subject: [PATCH] Merge b_intent into b_md: - New kernel patch (version 9) - DLM hooks to revalidate locked data, once the lock is granted (604) - Further MDS reorganization, particularly of the open and o_creat paths --- .../kernel_patches/patches/invalidate_show.patch | 125 ++++++++++++ .../kernel_patches/patches/iod-rmap-exports.patch | 64 ++++++ lustre/kernel_patches/pc/iod-rmap-exports.pc | 6 + lustre/mds/mds_open.c | 224 +++++++++++++++++++++ lustre/tests/open_delay.c | 25 +++ 5 files changed, 444 insertions(+) create mode 100644 lustre/kernel_patches/patches/invalidate_show.patch create mode 100644 lustre/kernel_patches/patches/iod-rmap-exports.patch create mode 100644 lustre/kernel_patches/pc/iod-rmap-exports.pc create mode 100644 lustre/mds/mds_open.c create mode 100644 lustre/tests/open_delay.c diff --git a/lustre/kernel_patches/patches/invalidate_show.patch b/lustre/kernel_patches/patches/invalidate_show.patch new file mode 100644 index 0000000..7e27e3a --- /dev/null +++ b/lustre/kernel_patches/patches/invalidate_show.patch @@ -0,0 +1,125 @@ +--- lum/fs/inode.c Sat Oct 19 11:42:42 2002 ++++ linux-2.4.18-uml35-ext3online/fs/inode.c Mon Oct 14 00:41:20 2002 +@@ -606,7 +553,8 @@ static void dispose_list(struct list_hea + /* + * Invalidate all inodes for a device. 
+ */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) ++static int invalidate_list(struct list_head *head, struct super_block * sb, ++ struct list_head * dispose, int show) + { + struct list_head *next; + int busy = 0, count = 0; +@@ -631,6 +579,10 @@ static int invalidate_list(struct list_h + count++; + continue; + } ++ if (show) ++ printk(KERN_ERR "inode busy: %s: %lu (count %d)\n", ++ kdevname(sb->s_dev), inode->i_ino, ++ atomic_read(&inode->i_count)); + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ +@@ -649,22 +601,23 @@ static int invalidate_list(struct list_h + /** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock ++ * @show: whether we should display any busy inodes found + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. 
+ */ + +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int show) + { + int busy; + LIST_HEAD(throw_away); + + spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); ++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -672,7 +625,7 @@ int invalidate_inodes(struct super_block + return busy; + } + +-int invalidate_device(kdev_t dev, int do_sync) ++int invalidate_device(kdev_t dev, int do_sync, int show) + { + struct super_block *sb; + int res; +@@ -690,7 +643,7 @@ int invalidate_device(kdev_t dev, int do + * hold). 
+ */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, show); + drop_super(sb); + } + invalidate_buffers(dev); +--- lum/fs/devfs/base.c.orig Sat Oct 19 11:42:16 2002 ++++ lum/fs/devfs/base.c Wed Oct 30 17:12:33 2002 +@@ -2448,7 +2448,7 @@ + retval = 1; + printk (KERN_DEBUG "VFS: Disk change detected on device %s\n", + kdevname (dev) ); +- if ( invalidate_device (dev, 0) ) ++ if ( invalidate_device (dev, 0, 1) ) + printk (KERN_WARNING "VFS: busy inodes on changed media..\n"); + /* Ugly hack to disable messages about unable to read partition table */ + tmp = warn_no_part; +--- lum/fs/super.c.orig Sat Oct 19 11:42:42 2002 ++++ lum/fs/super.c Wed Oct 30 17:16:55 2002 +@@ -936,7 +936,7 @@ + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; +- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ ++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ + if (sop) { + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); +@@ -945,7 +945,7 @@ + } + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk(KERN_ERR "VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. 
Have a nice day...\n"); + } +--- lum/fs/block_dev.c.orig Sat Oct 19 11:42:16 2002 ++++ lum/fs/block_dev.c Wed Oct 30 17:18:15 2002 +@@ -533,7 +533,7 @@ + if (!bdops->check_media_change(dev)) + return 0; + +- if (invalidate_device(dev, 0)) ++ if (invalidate_device(dev, 0, 1)) + printk("VFS: busy inodes on changed media.\n"); + + if (bdops->revalidate) +--- lum/include/linux/fs.h Wed Oct 30 17:10:42 2002 ++++ lum/include/linux/fs.h.orig Tue Oct 22 23:15:00 2002 +@@ -1261,8 +1261,8 @@ + extern void set_buffer_flushtime(struct buffer_head *); + extern void balance_dirty(void); + extern int check_disk_change(kdev_t); +-extern int invalidate_inodes(struct super_block *); +-extern int invalidate_device(kdev_t, int); ++extern int invalidate_inodes(struct super_block *, int); ++extern int invalidate_device(kdev_t, int, int); + extern void invalidate_inode_pages(struct inode *); + extern void invalidate_inode_pages2(struct address_space *); + extern void invalidate_inode_buffers(struct inode *); diff --git a/lustre/kernel_patches/patches/iod-rmap-exports.patch b/lustre/kernel_patches/patches/iod-rmap-exports.patch new file mode 100644 index 0000000..00eba97 --- /dev/null +++ b/lustre/kernel_patches/patches/iod-rmap-exports.patch @@ -0,0 +1,64 @@ +--- linux-chaos/fs/inode.c.b_io_export Wed Jan 29 16:56:15 2003 ++++ linux-chaos/fs/inode.c Wed Jan 29 16:56:27 2003 +@@ -66,7 +66,8 @@ + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++EXPORT_SYMBOL(inode_lock); + + /* + * Statistics gathering.. 
+--- linux-chaos/fs/Makefile.b_io_export Wed Jan 29 16:56:45 2003 ++++ linux-chaos/fs/Makefile Wed Jan 29 16:56:53 2003 +@@ -7,7 +7,7 @@ + + O_TARGET := fs.o + +-export-objs := filesystems.o open.o dcache.o buffer.o ++export-objs := filesystems.o open.o dcache.o buffer.o inode.o + mod-subdirs := nls + + obj-y := open.o read_write.o devices.o file_table.o buffer.o \ +--- linux-chaos/mm/filemap.c.b_io_export Wed Jan 29 16:50:39 2003 ++++ linux-chaos/mm/filemap.c Wed Jan 29 16:51:11 2003 +@@ -65,6 +65,7 @@ + * pagecache_lock + */ + spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; ++EXPORT_SYMBOL(pagemap_lru_lock_cacheline); + + #define CLUSTER_PAGES (1 << page_cluster) + #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) +--- linux-chaos/mm/vmscan.c.b_io_export Wed Jan 29 16:51:58 2003 ++++ linux-chaos/mm/vmscan.c Wed Jan 29 16:55:16 2003 +@@ -839,6 +839,7 @@ + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); + } ++EXPORT_SYMBOL(wakeup_kswapd); + + static void wakeup_memwaiters(void) + { +--- linux-chaos/mm/Makefile.b_io_export Wed Jan 29 16:52:46 2003 ++++ linux-chaos/mm/Makefile Wed Jan 29 16:54:23 2003 +@@ -9,7 +9,7 @@ + + O_TARGET := mm.o + +-export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o ++export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o vmscan.o + + obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ + vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ +--- linux-chaos/mm/page_alloc.c.b_io_export Wed Jan 29 17:00:32 2003 ++++ linux-chaos/mm/page_alloc.c Wed Jan 29 17:01:31 2003 +@@ -31,6 +31,7 @@ + int nr_inactive_dirty_pages; + int nr_inactive_clean_pages; + pg_data_t *pgdat_list; ++EXPORT_SYMBOL(pgdat_list); + + /* + * The zone_table array is used to look up the address of the diff --git a/lustre/kernel_patches/pc/iod-rmap-exports.pc b/lustre/kernel_patches/pc/iod-rmap-exports.pc new file mode 100644 index 0000000..1218f55 --- /dev/null +++ 
b/lustre/kernel_patches/pc/iod-rmap-exports.pc @@ -0,0 +1,6 @@ +fs/inode.c +fs/Makefile +mm/filemap.c +mm/vmscan.c +mm/Makefile +mm/page_alloc.c diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c new file mode 100644 index 0000000..f4bac4a --- /dev/null +++ b/lustre/mds/mds_open.c @@ -0,0 +1,224 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/mds/mds_open.c + * Lustre Metadata Server (mds) open request handler + * + * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger + * Author: Phil Schwan + * Author: Mike Shaver + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_MDS + +#include +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#include +#endif +#include +#include +#include +#include + +extern kmem_cache_t *mds_file_cache; +extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req); +extern void mds_start_transno(struct mds_obd *mds); +extern int mds_finish_transno(struct mds_obd *mds, void *handle, + struct ptlrpc_request *req, int rc); + +int mds_open(struct mds_update_record *rec, int offset, + struct ptlrpc_request *req) +{ + struct mds_obd *mds = mds_req2mds(req); + struct obd_device *obd = req->rq_export->exp_obd; + struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0); + struct obd_ucred uc; + struct obd_run_ctxt saved; + struct lustre_handle lockh; + int lock_mode; + struct file *file; + struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + struct dentry *dchild, *parent; + struct inode *dir; + struct mds_export_data *med; + struct mds_file_data *mfd = NULL; + struct vfsmount *mnt = mds->mds_vfsmnt; + __u32 flags; + struct list_head *tmp; + int rc = 0; + ENTRY; + +#warning replay of open needs to be redone + /* was this animal open already and the client lost the reply? 
*/ + /* XXX need some way to detect a reopen, to avoid locked list walks */ + med = &req->rq_export->exp_mds_data; +#if 0 + spin_lock(&med->med_open_lock); + list_for_each(tmp, &med->med_open_head) { + mfd = list_entry(tmp, typeof(*mfd), mfd_list); + if (!memcmp(&mfd->mfd_clienthandle, &body->handle, + sizeof(mfd->mfd_clienthandle)) && + body->fid1.id == mfd->mfd_file->f_dentry->d_inode->i_ino) { + dchild = mfd->mfd_file->f_dentry; + spin_unlock(&med->med_open_lock); + CERROR("Re opening "LPD64"\n", body->fid1.id); + GOTO(out_pack, rc = 0); + } + } + spin_unlock(&med->med_open_lock); +#endif + rep->lock_policy_res1 |= IT_OPEN_LOOKUP; + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK)) { + CERROR("test case OBD_FAIL_MDS_OPEN_PACK\n"); + req->rq_status = -ENOMEM; + RETURN(-ENOMEM); + } + + lock_mode = (rec->ur_flags & O_CREAT) ? LCK_PW : LCK_PR; + parent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, lock_mode, + &lockh); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + CERROR("parent lookup error %d\n", rc); + LBUG(); + RETURN(rc); + } + dir = parent->d_inode; + + down(&dir->i_sem); + dchild = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3), + parent, req->rq_reqmsg->buflens[3] - 1); + if (IS_ERR(dchild)) { + up(&dir->i_sem); + GOTO(out_unlock, rc = PTR_ERR(dchild)); + } + + if (dchild->d_inode) + rep->lock_policy_res1 |= IT_OPEN_POS; + else + rep->lock_policy_res1 |= IT_OPEN_NEG; + + /* Positive dentry: O_CREAT|O_EXCL fails with -EEXIST; else create it */ + if (dchild->d_inode) { + up(&dir->i_sem); + if ((rec->ur_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) { + mds_pack_inode2fid(&body->fid1, dchild->d_inode); + mds_pack_inode2body(body, dchild->d_inode); + if (S_ISREG(dchild->d_inode->i_mode)) + rc = mds_pack_md(obd, req->rq_repmsg, 3, body, + dchild->d_inode); + if (rc == 0) + rc = -EEXIST; + GOTO(out_ldput, rc); + } + } else if ((rec->ur_flags & O_CREAT) && !dchild->d_inode) { + int err; + void *handle; + mds_start_transno(mds); + rep->lock_policy_res1 |= IT_OPEN_CREATE; + handle = 
fsfilt_start(obd, dir, FSFILT_OP_CREATE); + if (IS_ERR(handle)) { + rc = PTR_ERR(handle); + mds_finish_transno(mds, handle, req, rc); + up(&dir->i_sem); + GOTO(out_ldput, rc); + } + rc = vfs_create(dir, dchild, rec->ur_mode); + up(&dir->i_sem); + rc = mds_finish_transno(mds, handle, req, rc); + err = fsfilt_commit(obd, dir, handle); + if (rc || err) { + CERROR("error on commit: err = %d\n", err); + if (!rc) + rc = err; + GOTO(out_ldput, rc); + } + } else if (!dchild->d_inode) { + up(&dir->i_sem); + GOTO(out_ldput, rc = 0); + } + + /* + * The child exists now (found or just created): pack it into the reply. + */ + mds_pack_inode2fid(&body->fid1, dchild->d_inode); + mds_pack_inode2body(body, dchild->d_inode); + + if (!S_ISREG(dchild->d_inode->i_mode)) + GOTO(out_ldput, rc = 0); + + rc = mds_pack_md(obd, req->rq_repmsg, 3, body, dchild->d_inode); + if (rc) { + CERROR("failure to get EA for %lu\n", dchild->d_inode->i_ino); + GOTO(out_ldput, req->rq_status = rc); + } + + rep->lock_policy_res1 |= IT_OPEN_OPEN; + mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL); + if (!mfd) { + CERROR("mds: out of memory\n"); + GOTO(out_ldput, rc = req->rq_status = -ENOMEM); + } + + flags = rec->ur_flags; + /* dentry_open does a dput(de) and mntput(mnt) on error */ + mntget(mnt); + file = dentry_open(dchild, mnt, flags & ~O_DIRECT & ~O_TRUNC); + if (IS_ERR(file)) + GOTO(out_unlock, rc = req->rq_status = PTR_ERR(file)); + + file->private_data = mfd; + mfd->mfd_file = file; + get_random_bytes(&mfd->mfd_servercookie, sizeof(mfd->mfd_servercookie)); + spin_lock(&med->med_open_lock); + list_add(&mfd->mfd_list, &med->med_open_head); + spin_unlock(&med->med_open_lock); + + out_unlock: + l_dput(parent); + ldlm_lock_decref(&lockh, lock_mode); + if (rc && rc != -EEXIST && mfd != NULL) { + kmem_cache_free(mds_file_cache, mfd); + mfd = NULL; + } + if (rc) + RETURN(rc); + + out_pack: + if (mfd) { + body->handle.addr = (__u64)(unsigned long)mfd; + body->handle.cookie = mfd->mfd_servercookie; + CDEBUG(D_INODE, "file %p: mfd %p, cookie "LPX64"\n", + 
mfd->mfd_file, mfd, mfd->mfd_servercookie); + } + RETURN(0); + + out_ldput: + l_dput(dchild); + goto out_unlock; +} diff --git a/lustre/tests/open_delay.c b/lustre/tests/open_delay.c new file mode 100644 index 0000000..2f418846 --- /dev/null +++ b/lustre/tests/open_delay.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + int fd; + + if (argc != 2) { + printf("Usage %s \n", argv[0]); + exit(1); + } + + fd = open(argv[1], O_RDONLY | O_LOV_DELAY_CREATE); + if (fd == -1) { + printf("Error opening %s\n", argv[1]); + exit(1); + } + + return 0; +} -- 1.8.3.1