From 221dcd6b54dd3efe75955a7cb7bc774c173f7c19 Mon Sep 17 00:00:00 2001 From: nic Date: Fri, 11 Feb 2005 23:42:08 +0000 Subject: [PATCH] land b1_4_mmap on b1_4 (20050211_1829) --- lustre/include/linux/lustre_compat25.h | 16 + lustre/include/linux/lustre_dlm.h | 5 + lustre/include/linux/lustre_lite.h | 3 +- lustre/include/linux/obd.h | 2 + lustre/include/linux/obd_class.h | 12 + .../patches/export-filemap_populate.patch | 25 + lustre/kernel_patches/series/2.6-suse-lnxi.series | 1 + lustre/kernel_patches/series/2.6-suse.series | 1 + lustre/kernel_patches/series/vanilla-2.4.24 | 1 + lustre/ldlm/ldlm_lock.c | 3 +- lustre/ldlm/ldlm_lockd.c | 1 + lustre/ldlm/ldlm_request.c | 49 ++ lustre/llite/Makefile.in | 4 +- lustre/llite/Makefile.mk | 2 +- lustre/llite/file.c | 68 ++- lustre/llite/llite_internal.h | 23 + lustre/llite/llite_lib.c | 1 + lustre/llite/llite_mmap.c | 602 +++++++++++++++++++++ lustre/llite/rw.c | 45 +- lustre/llite/rw24.c | 52 +- lustre/llite/rw26.c | 48 +- lustre/lov/lov_obd.c | 37 ++ lustre/obdclass/lprocfs_status.c | 1 + lustre/osc/osc_request.c | 23 +- lustre/tests/Makefile.am | 1 + lustre/tests/mmap_sanity.c | 397 +++++++------- lustre/tests/sanity.sh | 35 ++ lustre/tests/sanityN.sh | 7 +- 28 files changed, 1137 insertions(+), 328 deletions(-) create mode 100644 lustre/kernel_patches/patches/export-filemap_populate.patch create mode 100644 lustre/llite/llite_mmap.c diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index d3770b6..51e8054 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -215,8 +215,24 @@ static inline void cond_resched(void) #define __set_page_ll_data(page, llap) page->private = (unsigned long)llap #define __clear_page_ll_data(page) page->private = 0 #define PageWriteback(page) 0 +#define set_page_writeback(page) #define end_page_writeback(page) +static inline int mapping_mapped(struct address_space *mapping) +{ + if (mapping->i_mmap_shared) + 
return 1; + if (mapping->i_mmap) + return 1; + return 0; +} + +#ifdef ZAP_PAGE_RANGE_VMA +#define ll_zap_page_range(vma, addr, len) zap_page_range(vma, addr, len) +#else +#define ll_zap_page_range(vma, addr, len) zap_page_range(vma->vm_mm, addr, len) +#endif + #endif /* end of 2.4 compat macros */ #ifdef HAVE_PAGE_LIST diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index 08dd922..bf4e9e2 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -95,6 +95,9 @@ typedef enum { * list. */ #define LDLM_FL_KMS_IGNORE 0x200000 +/* Don't drop lock covering mmapped file in LRU */ +#define LDLM_FL_NO_LRU 0x400000 + /* The blocking callback is overloaded to perform two functions. These flags * indicate which operation should be performed. */ #define LDLM_CB_BLOCKING 1 @@ -536,6 +539,8 @@ int ldlm_cli_convert(struct lustre_handle *, int new_mode, int *flags); int ldlm_cli_cancel(struct lustre_handle *lockh); int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *, int flags, void *opaque); +int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *, + int join); /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. 
*/ diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index b2adffd..97600ea 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -76,10 +76,11 @@ struct ll_inode_info { __u64 lli_io_epoch; unsigned long lli_flags; - /* this lock protects s_d_w and p_w_ll */ + /* this lock protects s_d_w and p_w_ll and mmap_cnt */ spinlock_t lli_lock; int lli_send_done_writing; struct list_head lli_pending_write_llaps; + atomic_t lli_mmap_cnt; struct list_head lli_close_item; diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 3b20425..b0d2685 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -658,6 +658,8 @@ struct obd_ops { __u32 mode, struct lustre_handle *); int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *, int flags, void *opaque); + int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *, + int join); int (*o_san_preprw)(int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, int niocount, diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index 4e7a8e5..9b393f0 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -866,6 +866,18 @@ static inline int obd_cancel_unused(struct obd_export *exp, RETURN(rc); } +static inline int obd_join_lru(struct obd_export *exp, + struct lov_stripe_md *ea, int join) +{ + int rc; + ENTRY; + + EXP_CHECK_OP(exp, join_lru); + OBD_COUNTER_INCREMENT(exp->exp_obd, join_lru); + + rc = OBP(exp->exp_obd, join_lru)(exp, ea, join); + RETURN(rc); +} static inline int obd_san_preprw(int cmd, struct obd_export *exp, struct obdo *oa, diff --git a/lustre/kernel_patches/patches/export-filemap_populate.patch b/lustre/kernel_patches/patches/export-filemap_populate.patch new file mode 100644 index 0000000..8f78a79 --- /dev/null +++ b/lustre/kernel_patches/patches/export-filemap_populate.patch @@ -0,0 +1,25 @@ +Index: 
linux-2.6.7/mm/filemap.c +=================================================================== +--- linux-2.6.7.orig/mm/filemap.c 2004-11-15 12:02:35.000000000 +0800 ++++ linux-2.6.7/mm/filemap.c 2004-11-15 12:04:38.000000000 +0800 +@@ -1409,6 +1409,7 @@ + + return 0; + } ++EXPORT_SYMBOL_GPL(filemap_populate); + + static struct vm_operations_struct generic_file_vm_ops = { + .nopage = filemap_nopage, +Index: linux-2.6.7/include/linux/mm.h +=================================================================== +--- linux-2.6.7.orig/include/linux/mm.h 2004-11-15 12:02:43.000000000 +0800 ++++ linux-2.6.7/include/linux/mm.h 2004-11-15 12:04:23.000000000 +0800 +@@ -661,6 +661,8 @@ + + /* generic vm_area_ops exported for stackable file systems */ + struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); ++int filemap_populate(struct vm_area_struct *, unsigned long, unsigned long, ++ pgprot_t, unsigned long, int); + + /* mm/page-writeback.c */ + int write_one_page(struct page *page, int wait); diff --git a/lustre/kernel_patches/series/2.6-suse-lnxi.series b/lustre/kernel_patches/series/2.6-suse-lnxi.series index 4e4adf5..5669eb8 100644 --- a/lustre/kernel_patches/series/2.6-suse-lnxi.series +++ b/lustre/kernel_patches/series/2.6-suse-lnxi.series @@ -4,4 +4,5 @@ bluesmoke-2.6-suse-lnxi.patch mtd-2.6-suse-lnxi.patch perfctr-2.6-suse-lnxi.patch kexec-2.6-suse-lnxi.patch +export-filemap_populate.patch grab_cache_page_nowait_gfp-2.6-suse.patch diff --git a/lustre/kernel_patches/series/2.6-suse.series b/lustre/kernel_patches/series/2.6-suse.series index a30d9f1..d7a9e7e 100644 --- a/lustre/kernel_patches/series/2.6-suse.series +++ b/lustre/kernel_patches/series/2.6-suse.series @@ -13,3 +13,4 @@ header-guards-2.6-suse.patch md_path_lookup-2.6-suse.patch ext3-super-ntohl.patch export-show_task-2.6-vanilla.patch +export-filemap_populate.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.24 b/lustre/kernel_patches/series/vanilla-2.4.24 index 735db03..379e4cb 
100644 --- a/lustre/kernel_patches/series/vanilla-2.4.24 +++ b/lustre/kernel_patches/series/vanilla-2.4.24 @@ -41,3 +41,4 @@ ext3-mballoc-2.4.24.patch export_num_siblings.patch ext3-nlinks-2.4.24.patch export-show_task-2.4-vanilla.patch +export-zap-page-range.patch diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 2168f1f2..9e3add6 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -482,7 +482,8 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) if (ldlm_bl_to_thread(ns, NULL, lock) != 0) ldlm_handle_bl_callback(ns, NULL, lock); } else if (ns->ns_client == LDLM_NAMESPACE_CLIENT && - !lock->l_readers && !lock->l_writers) { + !lock->l_readers && !lock->l_writers && + !(lock->l_flags & LDLM_FL_NO_LRU)) { /* If this is a client-side namespace and this was the last * reference, put it on the LRU. */ LASSERT(list_empty(&lock->l_lru)); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 0647f98..b3e11ce0 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1513,6 +1513,7 @@ EXPORT_SYMBOL(ldlm_cli_convert); EXPORT_SYMBOL(ldlm_cli_enqueue); EXPORT_SYMBOL(ldlm_cli_cancel); EXPORT_SYMBOL(ldlm_cli_cancel_unused); +EXPORT_SYMBOL(ldlm_cli_join_lru); EXPORT_SYMBOL(ldlm_replay_locks); EXPORT_SYMBOL(ldlm_resource_foreach); EXPORT_SYMBOL(ldlm_namespace_foreach); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index ed7e842..eec23ba 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -766,6 +766,55 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, RETURN(ELDLM_OK); } +/* join/split resource locks to/from lru list */ +int ldlm_cli_join_lru(struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, int join) +{ + struct ldlm_resource *res; + struct ldlm_lock *lock, *n; + int count = 0; + ENTRY; + + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT); + + res = ldlm_resource_get(ns, NULL, *res_id, LDLM_EXTENT, 0); + if (res == NULL) + 
RETURN(count); + LASSERT(res->lr_type == LDLM_EXTENT); + + l_lock(&ns->ns_lock); + if (!join) + goto split; + + list_for_each_entry_safe (lock, n, &res->lr_granted, l_res_link) { + if (list_empty(&lock->l_lru) && + !lock->l_readers && !lock->l_writers && + !(lock->l_flags & LDLM_FL_LOCAL) && + !(lock->l_flags & LDLM_FL_CBPENDING)) { + LASSERT(ns->ns_nr_unused >= 0); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + ns->ns_nr_unused++; + lock->l_flags &= ~LDLM_FL_NO_LRU; + LDLM_DEBUG(lock, "join lock to lru"); + count++; + } + } + goto unlock; +split: + list_for_each_entry_safe (lock, n, &ns->ns_unused_list, l_lru) { + if (lock->l_resource == res) { + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_NO_LRU; + LDLM_DEBUG(lock, "split lock from lru"); + count++; + } + } +unlock: + l_unlock(&ns->ns_lock); + ldlm_resource_putref(res); + RETURN(count); +} + /* Lock iterators. */ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 9492120..4daad42 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -1,5 +1,5 @@ MODULES := llite -llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o +llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o ifeq ($(PATCHLEVEL),4) llite-objs += rw24.o super.o @@ -7,4 +7,4 @@ else llite-objs += rw26.o super25.o endif -@INCLUDE_RULES@ \ No newline at end of file +@INCLUDE_RULES@ diff --git a/lustre/llite/Makefile.mk b/lustre/llite/Makefile.mk index 06dd10e..dabbd9e 100644 --- a/lustre/llite/Makefile.mk +++ b/lustre/llite/Makefile.mk @@ -8,4 +8,4 @@ include $(src)/../portals/Kernelenv obj-y += llite.o llite-objs := llite_lib.o dcache.o super.o rw.o \ super25.o file.o dir.o symlink.o namei.o lproc_llite.o \ - rw26.o llite_nfs.o llite_close.o special.o + rw26.o llite_nfs.o 
llite_close.o special.o llite_mmap.o diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 801f3d7..1763524 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -376,6 +376,21 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu " "count: %lu skip: %lu end: %lu%s\n", start, start % count, count, skip, end, discard ? " (DISCARDING)" : ""); + + /* walk through the vmas on the inode and tear down mmaped pages that + * intersect with the lock. this stops immediately if there are no + * mmap()ed regions of the file. This is not efficient at all and + * should be short lived. We'll associate mmap()ed pages with the lock + * and will be able to find them directly */ + for (i = start; i <= end; i += (j + skip)) { + j = min(count - (i % count), end - i + 1); + LASSERT(j > 0); + LASSERT(inode->i_mapping); + if (ll_teardown_mmaps(inode->i_mapping, + (__u64)i << PAGE_CACHE_SHIFT, + ((__u64)(i+j) << PAGE_CACHE_SHIFT) - 1) ) + break; + } /* this is the simplistic implementation of page eviction at * cancelation. It is careful to get races with other page @@ -680,6 +695,10 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, LASSERT(lockh->cookie == 0); + /* don't drop the mmapped file to LRU */ + if (mapping_mapped(inode->i_mapping)) + ast_flags |= LDLM_FL_NO_LRU; + /* XXX phil: can we do this? won't it screw the file size up? 
*/ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || (sbi->ll_flags & LL_SBI_NOLCK)) @@ -737,12 +756,11 @@ int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { - struct ll_file_data *fd = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - struct lustre_handle lockh = { 0 }; - ldlm_policy_data_t policy; + struct ll_lock_tree tree; + struct ll_lock_tree_node *node; int rc; ssize_t retval; __u64 kms; @@ -760,11 +778,14 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, if (!lsm) RETURN(0); - - policy.l_extent.start = *ppos; - policy.l_extent.end = *ppos + count - 1; - - rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, 0); + + node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, + LCK_PR); + if (IS_ERR(node)) /* OOM: never hand an ERR_PTR to ll_tree_lock() */ + RETURN(PTR_ERR(node)); + tree.lt_fd = filp->private_data; + rc = ll_tree_lock(&tree, node, buf, count, + filp->f_flags & O_NONBLOCK ?
LDLM_FL_BLOCK_NOWAIT :0); if (rc != 0) RETURN(rc); @@ -791,7 +810,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, retval = generic_file_read(filp, buf, count, ppos); out: - ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); + ll_tree_unlock(&tree); RETURN(retval); } @@ -801,11 +820,10 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { - struct ll_file_data *fd = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - struct lustre_handle lockh = { 0 }; - ldlm_policy_data_t policy; + struct ll_lock_tree tree; + struct ll_lock_tree_node *node; loff_t maxbytes = ll_file_maxbytes(inode); ssize_t retval; int rc; @@ -825,16 +843,18 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, RETURN(-EBADF); LASSERT(lsm); - - if (file->f_flags & O_APPEND) { - policy.l_extent.start = 0; - policy.l_extent.end = OBD_OBJECT_EOF; - } else { - policy.l_extent.start = *ppos; - policy.l_extent.end = *ppos + count - 1; - } - - rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, 0); + + if (file->f_flags & O_APPEND) + node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW); + else + node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, + LCK_PW); + if (IS_ERR(node)) + RETURN(PTR_ERR(node)); + + tree.lt_fd = file->private_data; + rc = ll_tree_lock(&tree, node, buf, count, + file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0); if (rc != 0) RETURN(rc); @@ -859,7 +879,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, retval = generic_file_write(file, buf, count, ppos); out: - ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh); + ll_tree_unlock(&tree); lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES, retval > 0 ? 
retval : 0); RETURN(retval); @@ -1410,7 +1430,7 @@ struct file_operations ll_file_operations = { .ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, - .mmap = generic_file_mmap, + .mmap = ll_file_mmap, .llseek = ll_file_seek, #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) .sendfile = generic_file_sendfile, diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index d218e69..7640446 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -292,6 +292,29 @@ void ll_queue_done_writing(struct inode *inode); void ll_close_thread_shutdown(struct ll_close_queue *lcq); int ll_close_thread_start(struct ll_close_queue **lcq_ret); +/* llite/llite_mmap.c */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +typedef struct rb_root rb_root_t; +typedef struct rb_node rb_node_t; +#endif + +struct ll_lock_tree_node; +struct ll_lock_tree { + rb_root_t lt_root; + struct list_head lt_locked_list; + struct ll_file_data *lt_fd; +}; + +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); +int ll_file_mmap(struct file * file, struct vm_area_struct * vma); +struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, + __u64 end, ldlm_mode_t mode); +int ll_tree_lock(struct ll_lock_tree *tree, + struct ll_lock_tree_node *first_node, + const char *buf, size_t count, int ast_flags); +int ll_tree_unlock(struct ll_lock_tree *tree); + + #define LL_SBI_NOLCK 0x1 #define LL_MAX_BLKSIZE (4UL * 1024 * 1024) diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 913f64a..deadf5b 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -148,6 +148,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) devno = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid, strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid)); + /* s_dev is also used in lt_compare() to compare two fs */ sb->s_dev = devno; obd = class_name2obd(osc); 
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c new file mode 100644 index 0000000..9aab20a --- /dev/null +++ b/lustre/llite/llite_mmap.c @@ -0,0 +1,602 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +#endif + + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "llite_internal.h" +#include + +#define VMA_DEBUG(vma, fmt, arg...) 
\ + CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) " \ + "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end, \ + vma->vm_pgoff, vma->vm_file->f_dentry->d_inode, \ + vma->vm_file->f_dentry->d_inode->i_ino, \ + vma->vm_file->f_dentry->d_iname, ## arg); \ + + +struct ll_lock_tree_node { + rb_node_t lt_node; + struct list_head lt_locked_item; + __u64 lt_oid; + ldlm_policy_data_t lt_policy; + struct lustre_handle lt_lockh; + ldlm_mode_t lt_mode; + struct inode *lt_inode; +}; + +__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms); +int lt_get_mmap_locks(struct ll_lock_tree *tree, + unsigned long addr, size_t count); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, + int *type); +#else + +struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, + int unused); +#endif + +struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, + __u64 end, ldlm_mode_t mode) +{ + struct ll_lock_tree_node *node; + + OBD_ALLOC(node, sizeof(*node)); + if (node == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + node->lt_inode = inode; + node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id; + node->lt_policy.l_extent.start = start; + node->lt_policy.l_extent.end = end; + memset(&node->lt_lockh, 0, sizeof(node->lt_lockh)); + INIT_LIST_HEAD(&node->lt_locked_item); + node->lt_mode = mode; + + return node; +} + +int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two) +{ + /* To avoid multiple fs deadlock */ + if (one->lt_inode->i_sb->s_dev < two->lt_inode->i_sb->s_dev) + return -1; + if (one->lt_inode->i_sb->s_dev > two->lt_inode->i_sb->s_dev) + return 1; + + if (one->lt_oid < two->lt_oid) + return -1; + if (one->lt_oid > two->lt_oid) + return 1; + + if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start) + return -1; + if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end) + return 1; + + return 0; /* they are the 
same object and overlap */ +} + +static void lt_merge(struct ll_lock_tree_node *dst, + struct ll_lock_tree_node *src) +{ + dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start, + src->lt_policy.l_extent.start); + dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end, + src->lt_policy.l_extent.end); + + /* XXX could be a real call to the dlm to find superset modes */ + if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW) + dst->lt_mode = LCK_PW; +} + +static void lt_insert(struct ll_lock_tree *tree, + struct ll_lock_tree_node *node) +{ + struct ll_lock_tree_node *walk; + rb_node_t **p, *parent; + ENTRY; + +restart: + p = &tree->lt_root.rb_node; + parent = NULL; + while (*p) { + parent = *p; + walk = rb_entry(parent, struct ll_lock_tree_node, lt_node); + switch (lt_compare(node, walk)) { + case -1: + p = &(*p)->rb_left; + break; + case 1: + p = &(*p)->rb_right; + break; + case 0: + lt_merge(node, walk); + rb_erase(&walk->lt_node, &tree->lt_root); + OBD_FREE(walk, sizeof(*walk)); + goto restart; + break; + default: + LBUG(); + break; + } + } + rb_link_node(&node->lt_node, parent, p); + rb_insert_color(&node->lt_node, &tree->lt_root); + EXIT; +} + +static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree) +{ + rb_node_t *rbnode; + struct ll_lock_tree_node *node = NULL; + + for ( rbnode = tree->lt_root.rb_node; rbnode != NULL; + rbnode = rbnode->rb_left) { + if (rbnode->rb_left == NULL) { + node = rb_entry(rbnode, struct ll_lock_tree_node, + lt_node); + break; + } + } + RETURN(node); +} + +int ll_tree_unlock(struct ll_lock_tree *tree) +{ + struct ll_lock_tree_node *node; + struct list_head *pos, *n; + struct inode *inode; + int rc = 0; + ENTRY; + + list_for_each_safe(pos, n, &tree->lt_locked_list) { + node = list_entry(pos, struct ll_lock_tree_node, + lt_locked_item); + + inode = node->lt_inode; + rc = ll_extent_unlock(tree->lt_fd, inode, + ll_i2info(inode)->lli_smd, node->lt_mode, + &node->lt_lockh); + if (rc != 0) { + /* XXX 
better message */ + CERROR("couldn't unlock %d\n", rc); + } + list_del(&node->lt_locked_item); + OBD_FREE(node, sizeof(*node)); + } + + while ((node = lt_least_node(tree))) { + rb_erase(&node->lt_node, &tree->lt_root); + OBD_FREE(node, sizeof(*node)); + } + + RETURN(rc); +} + +int ll_tree_lock(struct ll_lock_tree *tree, + struct ll_lock_tree_node *first_node, + const char *buf, size_t count, int ast_flags) +{ + struct ll_lock_tree_node *node; + int rc = 0; + ENTRY; + + tree->lt_root.rb_node = NULL; + INIT_LIST_HEAD(&tree->lt_locked_list); + if (first_node != NULL) + lt_insert(tree, first_node); + + /* To avoid such subtle deadlock case: client1 try to read file1 to + * mmapped file2, on the same time, client2 try to read file2 to + * mmapped file1.*/ + rc = lt_get_mmap_locks(tree, (unsigned long)buf, count); + if (rc) + GOTO(out, rc); + + while ((node = lt_least_node(tree))) { + struct inode *inode = node->lt_inode; + rc = ll_extent_lock(tree->lt_fd, inode, + ll_i2info(inode)->lli_smd, node->lt_mode, + &node->lt_policy, &node->lt_lockh, + ast_flags); + if (rc != 0) + GOTO(out, rc); + + rb_erase(&node->lt_node, &tree->lt_root); + list_add_tail(&node->lt_locked_item, &tree->lt_locked_list); + } + RETURN(rc); +out: + ll_tree_unlock(tree); + RETURN(rc); +} + +static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma) +{ + /* we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return LCK_PW; + return LCK_PR; +} + +static void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, + size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & PAGE_CACHE_MASK) + + ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT); /* shift in 64 bits, as file_to_user() does */ + policy->l_extent.end = (policy->l_extent.start + count - 1) | + (PAGE_CACHE_SIZE - 1); +} + +static struct vm_area_struct * our_vma(unsigned long addr, size_t count) +{ +
struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *ret = NULL; + ENTRY; + + spin_lock(&mm->page_table_lock); + for(vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + spin_unlock(&mm->page_table_lock); + RETURN(ret); +} + +int lt_get_mmap_locks(struct ll_lock_tree *tree, + unsigned long addr, size_t count) +{ + struct vm_area_struct *vma; + struct ll_lock_tree_node *node; + ldlm_policy_data_t policy; + struct inode *inode; + ENTRY; + + if (count == 0) + RETURN(0); + + /* we need to look up vmas on page aligned addresses */ + count += addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + while ((vma = our_vma(addr, count)) != NULL) { + LASSERT(vma->vm_file); + + inode = vma->vm_file->f_dentry->d_inode; + policy_from_vma(&policy, vma, addr, count); + node = ll_node_from_inode(inode, policy.l_extent.start, + policy.l_extent.end, + mode_from_vma(vma)); + if (IS_ERR(node)) { + CERROR("not enough mem for lock_tree_node!\n"); + RETURN(-ENOMEM); + } + lt_insert(tree, node); + + if (vma->vm_end - addr >= count) + break; + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + RETURN(0); +} + +/* FIXME: there is a pagefault race goes as follow (only 2.4): + * 1. A user process on node A accesses a portion of a mapped file, + * resulting in a page fault. The pagefault handler invokes the + * ll_nopage function, which reads the page into memory. + * 2. A user process on node B writes to the same portion of the file + * (either via mmap or write()), that cause node A to cancel the + * lock and truncate the page. + * 3. Node A then executes the rest of do_no_page(), entering the + * now-invalid page into the PTEs. + * + * Make the whole do_no_page as a hook to cover both the page cache + * and page mapping installing with dlm lock would eliminate this race. 
+ * + * In 2.6, the truncate_count of address_space can cover this race. + */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, + int *type) +#else +struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, + int unused) +#endif +{ + struct file *filp = vma->vm_file; + struct ll_file_data *fd = filp->private_data; + struct inode *inode = filp->f_dentry->d_inode; + struct lustre_handle lockh = { 0 }; + ldlm_policy_data_t policy; + ldlm_mode_t mode; + struct page *page = NULL; + __u64 kms, old_mtime; + unsigned long pgoff, size, rand_read, seq_read; + int rc = 0; + ENTRY; + + if (ll_i2info(inode)->lli_smd == NULL) { + CERROR("No lsm on fault?\n"); + RETURN(NULL); + } + + /* start and end the lock on the first and last bytes in the page */ + policy_from_vma(&policy, vma, address, PAGE_CACHE_SIZE); + + CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n", + vma, inode->i_ino, policy.l_extent.start, + policy.l_extent.end); + + mode = mode_from_vma(vma); + old_mtime = LTIME_S(inode->i_mtime); + + rc = ll_extent_lock(fd, inode, ll_i2info(inode)->lli_smd, mode, &policy, + &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU); + if (rc != 0) + RETURN(NULL); + + if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime) + CWARN("binary changed. inode %lu\n", inode->i_ino); + + /* XXX change inode size without i_sem hold! there is a race condition + * with truncate path. (see ll_extent_lock) */ + kms = lov_merge_size(ll_i2info(inode)->lli_smd, 1); + pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; + size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + if (pgoff >= size) + ll_glimpse_size(inode); + else + inode->i_size = kms; + + /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. 
+ */ + rand_read = vma->vm_flags & VM_RAND_READ; + seq_read = vma->vm_flags & VM_SEQ_READ; + vma->vm_flags &= ~ VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + page = filemap_nopage(vma, address, type); +#else + page = filemap_nopage(vma, address, unused); +#endif + vma->vm_flags &= ~VM_RAND_READ; + vma->vm_flags |= (rand_read | seq_read); + + ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh); + RETURN(page); +} + +/* To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count by lli_mmap_cnt. + * ll_vm_open(): when first vma is linked, split locks from lru. + * ll_vm_close(): when last vma is unlinked, join all this file's locks to lru. + * + * XXX we don't check the if the region of vma/lock for performance. + */ +static void ll_vm_open(struct vm_area_struct * vma) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + LASSERT(vma->vm_file); + + spin_lock(&lli->lli_lock); + LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0); + + atomic_inc(&lli->lli_mmap_cnt); + if (atomic_read(&lli->lli_mmap_cnt) == 1) { + struct lov_stripe_md *lsm = lli->lli_smd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int count; + + spin_unlock(&lli->lli_lock); + count = obd_join_lru(sbi->ll_osc_exp, lsm, 0); + VMA_DEBUG(vma, "split %d unused locks from lru", count); + } else { + spin_unlock(&lli->lli_lock); + } + +} + +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + LASSERT(vma->vm_file); + + spin_lock(&lli->lli_lock); + LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0); + + atomic_dec(&lli->lli_mmap_cnt); + if (atomic_read(&lli->lli_mmap_cnt) == 0) { + struct lov_stripe_md *lsm = lli->lli_smd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + int count; + + spin_unlock(&lli->lli_lock); + count 
= obd_join_lru(sbi->ll_osc_exp, lsm, 1); + VMA_DEBUG(vma, "join %d unused locks to lru", count); + } else { + spin_unlock(&lli->lli_lock); + } +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +static int ll_populate(struct vm_area_struct *area, unsigned long address, + unsigned long len, pgprot_t prot, unsigned long pgoff, + int nonblock) +{ + int rc = 0; + ENTRY; + + /* always set nonblock as true to avoid page read ahead */ + rc = filemap_populate(area, address, len, prot, pgoff, 1); + RETURN(rc); +} +#endif + +/* return the user space pointer that maps to a file offset via a vma */ +static inline unsigned long file_to_user(struct vm_area_struct *vma, + __u64 byte) +{ + return vma->vm_start + + (byte - ((__u64)vma->vm_pgoff << PAGE_SHIFT)); + +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +/* [first, last] are the byte offsets affected. + * vm_{start, end} are user addresses of the first byte of the mapping and + * the next byte beyond it + * vm_pgoff is the page index of the first byte in the mapping */ +static void teardown_vmas(struct vm_area_struct *vma, __u64 first, + __u64 last) +{ + unsigned long address, len; + for (; vma ; vma = vma->vm_next_share) { + if (last >> PAGE_SHIFT < vma->vm_pgoff) + continue; + if (first >> PAGE_SHIFT >= (vma->vm_pgoff + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) + continue; + + /* XXX in case of unmap the cow pages of a running file, + * don't unmap these private writeable mapping here! + * though that will break private mappping a little. + * + * the clean way is to check the mapping of every page + * and just unmap the non-cow pages, just like + * unmap_mapping_range() with even_cow=0 in kernel 2.6. 
+ */ + if (!(vma->vm_flags & VM_SHARED) && + (vma->vm_flags & VM_WRITE)) + continue; + + address = max((unsigned long)vma->vm_start, + file_to_user(vma, first)); + len = min((unsigned long)vma->vm_end, + file_to_user(vma, last) + 1) - address; + + VMA_DEBUG(vma, "zapping vma [first="LPU64" last="LPU64" " + "address=%ld len=%ld]\n", first, last, address, len); + LASSERT(len > 0); + ll_zap_page_range(vma, address, len); + } +} +#endif + +/* XXX put nice comment here. talk about __free_pte -> dirty pages and + * nopage's reference passing to the pte */ +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, + __u64 last) +{ + int rc = -ENOENT; + ENTRY; + + LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + if (mapping_mapped(mapping)) { + rc = 0; + unmap_mapping_range(mapping, first + PAGE_SIZE - 1, + last - first + 1, 0); + } +#else + spin_lock(&mapping->i_shared_lock); + if (mapping->i_mmap != NULL) { + rc = 0; + teardown_vmas(mapping->i_mmap, first, last); + } + if (mapping->i_mmap_shared != NULL) { + rc = 0; + teardown_vmas(mapping->i_mmap_shared, first, last); + } + spin_unlock(&mapping->i_shared_lock); +#endif + RETURN(rc); +} + +static struct vm_operations_struct ll_file_vm_ops = { + .nopage = ll_nopage, + .open = ll_vm_open, + .close = ll_vm_close, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + .populate = ll_populate, +#endif +}; + +int ll_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + int rc; + ENTRY; + + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + rc = ll_glimpse_size(file->f_dentry->d_inode); + } + + RETURN(rc); +} diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 6b68ea5..cd34804 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1078,6 +1078,49 @@ out_unlock: spin_unlock(&sbi->ll_lock); return; } +int ll_writepage(struct page 
*page) +{ + struct inode *inode = page->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_export *exp; + struct ll_async_page *llap; + int rc = 0; + ENTRY; + + LASSERT(!PageDirty(page)); + LASSERT(PageLocked(page)); + + exp = ll_i2obdexp(inode); + if (exp == NULL) + GOTO(out, rc = -EINVAL); + + llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE); + if (IS_ERR(llap)) + GOTO(out, rc = PTR_ERR(llap)); + + page_cache_get(page); + if (llap->llap_write_queued) { + LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); + rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL, + llap->llap_cookie, + ASYNC_READY | ASYNC_URGENT); + } else { + rc = queue_or_sync_write(exp, inode, llap, + PAGE_SIZE, ASYNC_READY | ASYNC_URGENT); + } + if (rc) + page_cache_release(page); +out: + if (rc) { + if (!lli->lli_async_rc) + lli->lli_async_rc = rc; + /* re-dirty page on error so it retries write */ + SetPageDirty(page); + ClearPageLaunder(page); + unlock_page(page); + } + RETURN(rc); +} /* * for now we do our readpage the same on both 2.4 and 2.5. The kernel's @@ -1141,12 +1184,10 @@ int ll_readpage(struct file *filp, struct page *page) } if (rc == 0) { -#if 0 CWARN("ino %lu page %lu (%llu) not covered by " "a lock (mmap?). 
check debug logs.\n", inode->i_ino, page->index, (long long)page->index << PAGE_CACHE_SHIFT); -#endif } rc = ll_issue_page_read(exp, llap, oig, 0); diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index 6ad6dcd..aa8e708 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -49,56 +49,6 @@ #include "llite_internal.h" #include -static int ll_writepage_24(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_export *exp; - struct ll_async_page *llap; - int rc = 0; - ENTRY; - - LASSERT(!PageDirty(page)); - LASSERT(PageLocked(page)); - - exp = ll_i2obdexp(inode); - if (exp == NULL) - GOTO(out, rc = -EINVAL); - - llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE); - if (IS_ERR(llap)) - GOTO(out, rc = PTR_ERR(llap)); - - page_cache_get(page); - if (llap->llap_write_queued) { - LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); - rc = obd_set_async_flags(exp, lli->lli_smd, NULL, - llap->llap_cookie, - ASYNC_READY | ASYNC_URGENT); - } else { - llap->llap_write_queued = 1; - rc = obd_queue_async_io(exp, lli->lli_smd, NULL, - llap->llap_cookie, OBD_BRW_WRITE, 0, 0, - 0, ASYNC_READY | ASYNC_URGENT); - if (rc == 0) - LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n"); - else - llap->llap_write_queued = 0; - } - if (rc) - page_cache_release(page); -out: - if (rc) { - if (!lli->lli_async_rc) - lli->lli_async_rc = rc; - /* re-dirty page on error so it retries write */ - SetPageDirty(page); - ClearPageLaunder(page); - unlock_page(page); - } - RETURN(rc); -} - static int ll_direct_IO_24(int rw, #ifdef HAVE_DIO_FILE struct file *file, @@ -194,7 +144,7 @@ static int ll_max_readahead(struct inode *inode) struct address_space_operations ll_aops = { .readpage = ll_readpage, .direct_IO = ll_direct_IO_24, - .writepage = ll_writepage_24, + .writepage = ll_writepage, .prepare_write = ll_prepare_write, .commit_write = ll_commit_write, .removepage = ll_removepage, diff --git a/lustre/llite/rw26.c 
b/lustre/llite/rw26.c index 409fbee..07b0d45 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -53,53 +53,7 @@ static int ll_writepage_26(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_export *exp; - struct ll_async_page *llap; - int rc; - ENTRY; - - LASSERT(!PageDirty(page)); - LASSERT(PageLocked(page)); - - exp = ll_i2obdexp(inode); - if (exp == NULL) - GOTO(out, rc = -EINVAL); - - llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE); - if (IS_ERR(llap)) - GOTO(out, rc = PTR_ERR(llap)); - - page_cache_get(page); - if (llap->llap_write_queued) { - LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); - rc = obd_set_async_flags(exp, lli->lli_smd, NULL, - llap->llap_cookie, - ASYNC_READY | ASYNC_URGENT); - } else { - llap->llap_write_queued = 1; - rc = obd_queue_async_io(exp, lli->lli_smd, NULL, - llap->llap_cookie, OBD_BRW_WRITE, 0, 0, - 0, ASYNC_READY | ASYNC_URGENT); - if (rc == 0) - LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n"); - else - llap->llap_write_queued = 0; - } - if (rc) - page_cache_release(page); -out: - if (rc) { - if (!lli->lli_async_rc) - lli->lli_async_rc = rc; - /* re-dirty page on error so it retries write */ - SetPageDirty(page); - unlock_page(page); - } else { - set_page_writeback(page); - } - RETURN(rc); + return ll_writepage(page); } /* It is safe to not check anything in invalidatepage/releasepage below diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index e942a02..dc6ce4f 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -1377,6 +1377,42 @@ static int lov_cancel_unused(struct obd_export *exp, RETURN(rc); } +static int lov_join_lru(struct obd_export *exp, + struct lov_stripe_md *lsm, int join) +{ + struct lov_obd *lov; + struct lov_oinfo *loi; + int i, count = 0; + ENTRY; + + ASSERT_LSM_MAGIC(lsm); + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + for (i = 0,loi = 
lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { + struct lov_stripe_md submd; + int rc = 0; + + if (lov->tgts[loi->loi_ost_idx].active == 0) + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + + submd.lsm_object_id = loi->loi_id; + submd.lsm_stripe_count = 0; + rc = obd_join_lru(lov->tgts[loi->loi_ost_idx].ltd_exp, + &submd, join); + if (rc < 0) { + CERROR("join lru failed. objid: "LPX64" subobj: "LPX64 + " ostidx: %d rc: %d\n", lsm->lsm_object_id, + loi->loi_id, loi->loi_ost_idx, rc); + return rc; + } else { + count += rc; + } + } + RETURN(count); +} + #define LOV_U64_MAX ((__u64)~0ULL) #define LOV_SUM_MAX(tot, add) \ do { \ @@ -1803,6 +1839,7 @@ struct obd_ops lov_obd_ops = { .o_change_cbdata = lov_change_cbdata, .o_cancel = lov_cancel, .o_cancel_unused = lov_cancel_unused, + .o_join_lru = lov_join_lru, .o_iocontrol = lov_iocontrol, .o_get_info = lov_get_info, .o_set_info = lov_set_info, diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 996dbdf..9ab5fb4 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -645,6 +645,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata); LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel); LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, join_lru); LPROCFS_OBD_OP_INIT(num_private_stats, stats, san_preprw); LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index f2493b4..3a69c1c 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2415,7 +2415,8 @@ static int sanosc_brw(int cmd, struct obd_export *exp, struct obdo *oa, #endif #endif -static void osc_set_data_with_check(struct lustre_handle *lockh, void *data) +static void 
osc_set_data_with_check(struct lustre_handle *lockh, void *data, + int flags) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); @@ -2439,6 +2440,7 @@ static void osc_set_data_with_check(struct lustre_handle *lockh, void *data) } #endif lock->l_ast_data = data; + lock->l_flags |= (flags & LDLM_FL_NO_LRU); l_unlock(&lock->l_resource->lr_namespace->ns_lock); LDLM_LOCK_PUT(lock); } @@ -2479,7 +2481,7 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode, lockh); if (rc == 1) { - osc_set_data_with_check(lockh, data); + osc_set_data_with_check(lockh, data, *flags); if (*flags & LDLM_FL_HAS_INTENT) { /* I would like to be able to ASSERT here that rss <= * kms, but I can't, for reasons which are explained in @@ -2510,7 +2512,7 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, * lock_match. I want a second opinion. */ ldlm_lock_addref(lockh, LCK_PR); ldlm_lock_decref(lockh, LCK_PW); - osc_set_data_with_check(lockh, data); + osc_set_data_with_check(lockh, data, *flags); RETURN(ELDLM_OK); } } @@ -2576,7 +2578,7 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, policy, mode, lockh); if (rc) { //if (!(*flags & LDLM_FL_TEST_LOCK)) - osc_set_data_with_check(lockh, data); + osc_set_data_with_check(lockh, data, *flags); RETURN(rc); } /* If we're trying to read, we also search for an existing PW lock. The @@ -2589,7 +2591,7 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, /* FIXME: This is not incredibly elegant, but it might * be more elegant than adding another parameter to * lock_match. I want a second opinion. 
*/ - osc_set_data_with_check(lockh, data); + osc_set_data_with_check(lockh, data, *flags); ldlm_lock_addref(lockh, LCK_PR); ldlm_lock_decref(lockh, LCK_PW); } @@ -2617,6 +2619,15 @@ static int osc_cancel_unused(struct obd_export *exp, opaque); } +static int osc_join_lru(struct obd_export *exp, + struct lov_stripe_md *lsm, int join) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + + return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join); +} + static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, unsigned long max_age) { @@ -3117,6 +3128,7 @@ struct obd_ops osc_obd_ops = { .o_change_cbdata = osc_change_cbdata, .o_cancel = osc_cancel, .o_cancel_unused = osc_cancel_unused, + .o_join_lru = osc_join_lru, .o_iocontrol = osc_iocontrol, .o_get_info = osc_get_info, .o_set_info = osc_set_info, @@ -3148,6 +3160,7 @@ struct obd_ops sanosc_obd_ops = { .o_change_cbdata = osc_change_cbdata, .o_cancel = osc_cancel, .o_cancel_unused = osc_cancel_unused, + .o_join_lru = osc_join_lru, .o_iocontrol = osc_iocontrol, .o_import_event = osc_import_event, .o_llog_init = osc_llog_init, diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index fa9d6f4..c042fe2 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -32,6 +32,7 @@ endif bin_PROGRAMS = mcreate munlink endif # TESTS +mmap_sanity_SOURCES= mmap_sanity.c stat_SOURCES = stat.c stat_fs.h # mkdirdeep_LDADD=-L$(top_builddir)/portals/utils -lptlctl $(LIBREADLINE) diff --git a/lustre/tests/mmap_sanity.c b/lustre/tests/mmap_sanity.c index 3fd0b0e..5a61806 100644 --- a/lustre/tests/mmap_sanity.c +++ b/lustre/tests/mmap_sanity.c @@ -13,148 +13,25 @@ #include #include #include +#include -char *dir = NULL, *node = NULL, *dir2 = NULL; +char *dir = NULL, *dir2 = NULL; long page_size; char mmap_sanity[256]; static void usage(void) { - printf("Usage: mmap_sanity -d dir [-n node | -m dir2]\n"); + printf("Usage: mmap_sanity -d dir 
[-m dir2]\n"); printf(" dir lustre mount point\n"); - printf(" node another client\n"); printf(" dir2 another mount point\n"); exit(127); } -#define MMAP_NOTIFY_PORT 7676 -static int mmap_notify(char *target, char *str, int delay) -{ - unsigned short port = MMAP_NOTIFY_PORT; - int socket_type = SOCK_DGRAM; - struct sockaddr_in server; - struct hostent *hp; - int len, sockfd, rc = 0; - - if (target == NULL) - return 0; - - sockfd = socket(AF_INET, socket_type, 0); - if (sockfd < 0) { - perror("socket()"); - return errno; - } - - if ((hp = gethostbyname(target)) == NULL) { - perror(target); - rc = errno; - goto out_close; - } - - memset(&server,0,sizeof(server)); - memcpy(&(server.sin_addr), hp->h_addr, hp->h_length); - server.sin_family = AF_INET; - server.sin_port = htons(port); - - len = sizeof(server); - if (delay) - sleep(delay); - - rc = sendto(sockfd, str, strlen(str), 0, - (struct sockaddr *)&server, len); - if (rc < 0) { - perror("sendto()"); - rc = errno; - } else - rc = 0; - -out_close: - close(sockfd); - return rc; -} - -static int mmap_wait(char *str, int timeout) -{ - unsigned short port = MMAP_NOTIFY_PORT; - int socket_type = SOCK_DGRAM; - struct sockaddr_in local, from; - char host[256]; - struct hostent *hp; - fd_set rfds; - struct timeval tv; - int sockfd, rc = 0; - - if (dir2 != NULL) - return 0; - - memset(host, 0, sizeof(host)); - if (gethostname(host, sizeof(host))) { - perror("gethostname()"); - return errno; - } - - if ((hp = gethostbyname(host)) == NULL) { - perror(host); - return errno; - } - - local.sin_family = AF_INET; - memcpy(&(local.sin_addr), hp->h_addr, hp->h_length); - local.sin_port = htons(port); - - sockfd = socket(AF_INET, socket_type, 0); - if (sockfd < 0) { - perror("socket()"); - return errno; - } - - rc = bind(sockfd, (struct sockaddr *)&local, sizeof(local)); - if (rc < 0) { - perror("bind()"); - rc = errno; - goto out_close; - } - - FD_ZERO(&rfds); - FD_SET(sockfd, &rfds); - tv.tv_sec = timeout ? 
timeout : 5; - tv.tv_usec = 0; - - rc = select(sockfd + 1, &rfds, NULL, NULL, &tv); - if (rc) { /* got data */ - char buffer[1024]; - int fromlen =sizeof(from); - - memset(buffer, 0, sizeof(buffer)); - rc = recvfrom(sockfd, buffer, sizeof(buffer), 0, - (struct sockaddr *)&from, &fromlen); - if (rc <= 0) { - perror("recvfrom()"); - rc = errno; - goto out_close; - } - rc = 0; - - if (strncmp(str, buffer, strlen(str)) != 0) { - fprintf(stderr, "expected string mismatch!\n"); - rc = EINVAL; - } - } else { /* timeout */ - fprintf(stderr, "timeout!\n"); - rc = ETIME; - } - -out_close: - close(sockfd); - return rc; -} - static int remote_tst(int tc, char *mnt); -static int mmap_run(char *host, int tc) +static int mmap_run(int tc) { pid_t child; - char nodearg[256], command[256]; int rc = 0; child = fork(); @@ -166,17 +43,13 @@ static int mmap_run(char *host, int tc) if (dir2 != NULL) { rc = remote_tst(tc, dir2); } else { - sprintf(nodearg, "-w %s", node); - sprintf(command, "%s -d %s -n %s -c %d", - mmap_sanity, dir, host, tc); - rc = execlp("pdsh", "pdsh", "-S", nodearg, command, NULL); - if (rc) - perror("execlp()"); + rc = EINVAL; + fprintf(stderr, "invalid argument!\n"); } _exit(rc); } -static int mmap_initialize(char *myself, int tc) +static int mmap_initialize(char *myself) { char buf[1024], *file; int fdr, fdw, count, rc = 0; @@ -186,8 +59,6 @@ static int mmap_initialize(char *myself, int tc) perror("sysconf(_SC_PAGESIZE)"); return errno; } - if (tc) - return 0; /* copy myself to lustre for another client */ fdr = open(myself, O_RDONLY); @@ -230,10 +101,8 @@ static int mmap_initialize(char *myself, int tc) return rc; } -static void mmap_finalize(int tc) +static void mmap_finalize() { - if (tc) - return; unlink(mmap_sanity); } @@ -332,7 +201,7 @@ out_close: /* cocurrent mmap operations on two nodes */ static int mmap_tst3(char *mnt) { - char *ptr, mmap_file[256], host[256]; + char *ptr, mmap_file[256]; int region, fd, rc = 0; region = page_size * 100; @@ -357,19 
+226,11 @@ static int mmap_tst3(char *mnt) goto out_close; } - if (gethostname(host, sizeof(host))) { - perror("gethostname()"); - rc = errno; - goto out_unmap; - } - - rc = mmap_run(host, 3); + rc = mmap_run(3); if (rc) goto out_unmap; - rc = mmap_wait("mmap done", 10); memset(ptr, 'a', region); - sleep(2); /* wait for remote test finish */ out_unmap: munmap(ptr, region); @@ -400,14 +261,8 @@ static int remote_tst3(char *mnt) goto out_close; } memset(ptr, 'b', region); - - rc = mmap_notify(node, "mmap done", 1); - if (rc) - goto out_unmap; - memset(ptr, 'c', region); -out_unmap: munmap(ptr, region); out_close: close(fd); @@ -418,7 +273,7 @@ out_close: * client2 write to file_4b from mmap()ed file_4a. */ static int mmap_tst4(char *mnt) { - char *ptr, filea[256], fileb[256], host[256]; + char *ptr, filea[256], fileb[256]; int region, fdr, fdw, rc = 0; region = page_size * 100; @@ -456,17 +311,7 @@ static int mmap_tst4(char *mnt) goto out_close; } - if (gethostname(host, sizeof(host))) { - perror("gethostname()"); - rc = errno; - goto out_unmap; - } - - rc = mmap_run(host, 4); - if (rc) - goto out_unmap; - - rc = mmap_wait("mmap done", 10); + rc = mmap_run(4); if (rc) goto out_unmap; @@ -521,10 +366,6 @@ static int remote_tst4(char *mnt) goto out_close; } - rc = mmap_notify(node, "mmap done", 1); - if (rc) - goto out_unmap; - memset(ptr, '2', region); rc = write(fdw, ptr, region); @@ -534,7 +375,6 @@ static int remote_tst4(char *mnt) } else rc = 0; -out_unmap: munmap(ptr, region); out_close: if (fdr >= 0) @@ -544,6 +384,188 @@ out_close: return rc; } +static int cancel_lru_locks(char *prefix) +{ + char cmd[256], line[1024]; + FILE *file; + pid_t child; + int len = 1024, rc = 0; + + child = fork(); + if (child < 0) + return errno; + else if (child) { + int status; + rc = waitpid(child, &status, WNOHANG); + if (rc == child) + rc = 0; + return rc; + } + + if (prefix) + sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/%s_*/lru_size", prefix); + else + sprintf(cmd, "ls 
/proc/fs/lustre/ldlm/namespaces/*/lru_size"); + + file = popen(cmd, "r"); + if (file == NULL) { + perror("popen()"); + return errno; + } + + while (fgets(line, len, file)) { + FILE *f; + + if (!strlen(line)) + continue; + /* trim newline character */ + *(line + strlen(line) - 1) = '\0'; + f = fopen(line, "w"); + if (f == NULL) { + perror("fopen()"); + rc = errno; + break; + } + rc = fwrite("clear", strlen("clear") + 1, 1, f); + if (rc < 1) { + perror("fwrite()"); + rc = errno; + fclose(f); + break; + } + fclose(f); + } + + pclose(file); + _exit(rc); +} + +/* don't deadlock while reading/writing a file to/from a buffer that is + * mmapped from that same file */ +static int mmap_tst5(char *mnt) +{ + char *ptr, mmap_file[256]; + int region, fd, off, rc = 0; + + region = page_size * 40; + off = page_size * 10; + sprintf(mmap_file, "%s/%s", mnt, "mmap_file5"); + + if (unlink(mmap_file) && errno != ENOENT) { + perror("unlink()"); + return errno; + } + + fd = open(mmap_file, O_CREAT|O_RDWR, 0600); + if (fd < 0) { + perror(mmap_file); + return errno; + } + ftruncate(fd, region); + + ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + perror("mmap()"); + rc = errno; + goto out_close; + } + memset(ptr, 'a', region); + + /* cancel unused locks */ + cancel_lru_locks("OSC"); + if (rc) + goto out_unmap; + + /* the read/write region of the file and the buffer should overlap */ + rc = read(fd, ptr + off, off * 2); + if (rc != off * 2) { + perror("read()"); + rc = errno; + goto out_unmap; + } + rc = write(fd, ptr + off, off * 2); + if (rc != off * 2) { + perror("write()"); + rc = errno; + } + rc = 0; +out_unmap: + munmap(ptr, region); +out_close: + close(fd); + unlink(mmap_file); + return rc; +} + +/* mmap write to a file from client1 then mmap read from client2 */ +static int mmap_tst6(char *mnt) +{ + char mmap_file[256], mmap_file2[256]; + char *ptr = NULL, *ptr2 = NULL; + int fd = 0, fd2 = 0, rc = 0; + + sprintf(mmap_file, "%s/%s", mnt, "mmap_file6"); +
sprintf(mmap_file2, "%s/%s", dir2, "mmap_file6"); + if (unlink(mmap_file) && errno != ENOENT) { + perror("unlink()"); + return errno; + } + + fd = open(mmap_file, O_CREAT|O_RDWR, 0600); + if (fd < 0) { + perror(mmap_file); + return errno; + } + ftruncate(fd, page_size); + + fd2 = open(mmap_file2, O_RDWR, 0600); + if (fd2 < 0) { + perror(mmap_file2); + goto out; + } + + ptr = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + perror("mmap()"); + rc = errno; + goto out; + } + + ptr2 = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, 0); + if (ptr2 == MAP_FAILED) { + perror("mmap()"); + rc = errno; + goto out; + } + + cancel_lru_locks("OSC"); + if (rc) + goto out; + + memcpy(ptr, "blah", strlen("blah")); + if (strncmp(ptr, ptr2, strlen("blah"))) { + fprintf(stderr, "client2 mmap mismatch!\n"); + rc = EFAULT; + goto out; + } + memcpy(ptr2, "foo", strlen("foo")); + if (strncmp(ptr, ptr2, strlen("foo"))) { + fprintf(stderr, "client1 mmap mismatch!\n"); + rc = EFAULT; + } +out: + if (ptr2) + munmap(ptr2, page_size); + if (ptr) + munmap(ptr, page_size); + if (fd2 > 0) + close(fd2); + if (fd > 0) + close(fd); + unlink(mmap_file); + return rc; +} + static int remote_tst(int tc, char *mnt) { int rc = 0; @@ -554,8 +576,6 @@ static int remote_tst(int tc, char *mnt) case 4: rc = remote_tst4(mnt); break; - case 1: - case 2: default: fprintf(stderr, "wrong test case number %d\n", tc); rc = EINVAL; @@ -577,6 +597,10 @@ struct test_case tests[] = { { 3, "mmap test3: cocurrent mmap ops on two nodes", mmap_tst3, 2 }, { 4, "mmap test4: c1 write to f1 from mmaped f2, " "c2 write to f1 from mmaped f1", mmap_tst4, 2 }, + { 5, "mmap test5: read/write file to/from the buffer " + "which mmaped to just this file", mmap_tst5, 1 }, + { 6, "mmap test6: check mmap write/read content on two nodes", + mmap_tst6, 2 }, { 0, NULL, 0, 0 } }; @@ -584,10 +608,10 @@ int main(int argc, char **argv) { extern char *optarg; struct test_case *test; - int c, 
rc = 0, tc = 0; + int c, rc = 0; for(;;) { - c = getopt(argc, argv, "d:n:c:m:"); + c = getopt(argc, argv, "d:m:"); if ( c == -1 ) break; @@ -595,12 +619,6 @@ int main(int argc, char **argv) case 'd': dir = optarg; break; - case 'n': - node = optarg; - break; - case 'c': - tc = atoi(optarg); - break; case 'm': dir2 = optarg; break; @@ -613,23 +631,16 @@ int main(int argc, char **argv) if (dir == NULL) usage(); - if (dir2 != NULL && node != NULL) - usage(); - if (mmap_initialize(argv[0], tc) != 0) { + if (mmap_initialize(argv[0]) != 0) { fprintf(stderr, "mmap_initialize failed!\n"); return EINVAL; } - if (tc) { - rc = remote_tst(tc, dir); - goto out; - } - for (test = tests; test->tc; test++) { char *rs = "skip"; rc = 0; - if (test->node_cnt == 1 || node != NULL || dir2 != NULL) { + if (test->node_cnt == 1 || dir2 != NULL) { rc = test->test_fn(dir); rs = rc ? "fail" : "pass"; } @@ -637,7 +648,7 @@ int main(int argc, char **argv) if (rc) break; } -out: - mmap_finalize(tc); + + mmap_finalize(); return rc; } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index efb0f44..699c2f2 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -2298,6 +2298,41 @@ test_69() { } run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======" +test_71() { + cp `which dbench` $DIR + + [ ! -f $DIR/dbench ] && echo "dbench not installed, skip this test" && return 0 + + TGT=$DIR/client.txt + SRC=${SRC:-/usr/lib/dbench/client.txt} + [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT + SRC=/usr/lib/dbench/client_plain.txt + [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT + + echo "copying necessary lib to $DIR" + if [ -d /lib64 ]; then + mkdir $DIR/lib64 + cp /lib64/libc* $DIR/lib64 + cp /lib64/ld-* $DIR/lib64 + else + mkdir $DIR/lib + cp /lib/libc* $DIR/lib + cp /lib/ld-* $DIR/lib + fi + + echo "chroot $DIR /dbench -c client.txt 2" + chroot $DIR /dbench -c client.txt 2 + RC=$? 
+ + rm -f $DIR/dbench + rm -f $TGT + rm -fr $DIR/lib + rm -fr $DIR/lib64 + + return $RC +} +run_test 71 "Running dbench on lustre (don't segment fault) ====" + # on the LLNL clusters, runas will still pick up root's $TMP settings, # which will not be writable for the runas user, and then you get a CVS # error message with a corrupt path string (CVS bug) and panic. diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 47d7bba..d2135ca 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -346,7 +346,7 @@ test_15() { # bug 974 - ENOSPC run_test 15 "test out-of-space with multiple writers ===========" test_16() { - fsx -R -W -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile + fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile } run_test 16 "2500 iterations of dual-mount fsx =================" @@ -373,6 +373,11 @@ run_test 17 "resource creation/LVB creation race ===============" test_18() { ./mmap_sanity -d $MOUNT1 -m $MOUNT2 +} +run_test 18 "mmap sanity check =================================" + +test_18() { + ./mmap_sanity -d $MOUNT1 -m $MOUNT2 sync; sleep 1; sync } #run_test 18 "mmap sanity check =================================" -- 1.8.3.1