From: niu
Date: Wed, 25 Aug 2004 06:47:16 +0000 (+0000)
Subject: - land b1_2_mmap onto b1_2 (20040825_1413)
X-Git-Tag: v1_8_0_110~486^5~169
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=9c787cce5b72c5886e787d8228a29b8e73bcec9b;p=fs%2Flustre-release.git

- land b1_2_mmap onto b1_2 (20040825_1413)
---

diff --git a/lnet/archdep.m4 b/lnet/archdep.m4
index 27704bd..94fa984 100644
--- a/lnet/archdep.m4
+++ b/lnet/archdep.m4
@@ -436,6 +436,16 @@ if test x$enable_modules != xno ; then
 	AC_MSG_RESULT([no])
 ])
 
+	# --------- zap_page_range(vma) --------------------------------
+	AC_MSG_CHECKING([if zap_page_range with vma parameter])
+	ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`"
+	if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then
+		AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter])
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+	fi
+
 	# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
 	# This needs to run after we've defined the KCPPFLAGS

diff --git a/lnet/include/linux/libcfs.h b/lnet/include/linux/libcfs.h
index acf4045..cad7a69 100644
--- a/lnet/include/linux/libcfs.h
+++ b/lnet/include/linux/libcfs.h
@@ -90,6 +90,7 @@ struct ptldebug_header {
 #define D_RPCTRACE    0x00100000 /* for distributed debugging */
 #define D_VFSTRACE    0x00200000
 #define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP        0x00800000
 
 #ifdef __KERNEL__
 # include                       /* THREAD_SIZE */

diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c
index e546aaf..dce196f 100644
--- a/lnet/utils/debug.c
+++ b/lnet/utils/debug.c
@@ -74,7 +74,7 @@ static const char *portal_debug_masks[] =
         {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
          "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
          "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace",
-         "reada", NULL};
+         "reada", "mmap", NULL};
 
 struct debug_daemon_cmd {
         char *cmd;

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 22dd2fb..f344fea 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -19,6 +19,7 @@
        - replace some LBUG about llog ops with error handling (3841)
        - don't match INVALID dentries from d_lookup and spin (3784)
        - hold dcache_lock while marking dentries INVALID and hashing (4255)
+       - basic mmap support (3918)
        * miscellania
        - add libwrap support for the TCP acceptor (3996)
        - add /proc/sys/portals/routes for non-root route listing (3994)

diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h
index ede6646..56e36e9 100644
--- a/lustre/include/linux/lustre_compat25.h
+++ b/lustre/include/linux/lustre_compat25.h
@@ -212,6 +212,17 @@ static inline void cond_resched(void)
 #define PageWriteback(page) 0
 #define end_page_writeback(page)
 
+static inline int mapping_mapped(struct address_space *mapping)
+{
+        return mapping->i_mmap_shared ?
+                1 : 0;
+}
+
+#ifdef ZAP_PAGE_RANGE_VMA
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma, addr, len)
+#else
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma->vm_mm, addr, len)
+#endif
+
 #endif /* end of 2.4 compat macros */
 
 #ifdef HAVE_PAGE_LIST

diff --git a/lustre/kernel_patches/patches/export-zap-page-range.patch b/lustre/kernel_patches/patches/export-zap-page-range.patch
new file mode 100644
index 0000000..9b9d48f
--- /dev/null
+++ b/lustre/kernel_patches/patches/export-zap-page-range.patch
@@ -0,0 +1,12 @@
+Index: linux-2.4.24-l36mmap/mm/memory.c
+===================================================================
+--- linux-2.4.24-l36mmap.orig/mm/memory.c	2004-05-27 17:44:13.000000000 -0700
++++ linux-2.4.24-l36mmap/mm/memory.c	2004-05-27 17:45:07.000000000 -0700
+@@ -411,6 +411,7 @@
+ 	mm->rss = 0;
+ 	spin_unlock(&mm->page_table_lock);
+ }
++EXPORT_SYMBOL_GPL(zap_page_range);
+ 
+ /*
+  * Do a quick page-table lookup for a single page.

diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20
index d11bec0..fa7a583 100644
--- a/lustre/kernel_patches/series/vanilla-2.4.20
+++ b/lustre/kernel_patches/series/vanilla-2.4.20
@@ -52,3 +52,4 @@ gfp_memalloc-2.4.22.patch
 procfs-ndynamic-2.4.patch
 linux-2.4.20-filemap.patch
 ext3-truncate-buffer-head.patch
+export-zap-page-range.patch

diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in
index 9492120..4daad42 100644
--- a/lustre/llite/Makefile.in
+++ b/lustre/llite/Makefile.in
@@ -1,5 +1,5 @@
 MODULES := llite
-llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o
+llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o
 
 ifeq ($(PATCHLEVEL),4)
 llite-objs += rw24.o super.o
@@ -7,4 +7,4 @@ else
 llite-objs += rw26.o super25.o
 endif
 
-@INCLUDE_RULES@
\ No newline at end of file
+@INCLUDE_RULES@

diff --git a/lustre/llite/Makefile.mk b/lustre/llite/Makefile.mk
index 06dd10e..dabbd9e 100644
--- a/lustre/llite/Makefile.mk
+++ b/lustre/llite/Makefile.mk
@@ -8,4 +8,4 @@ include $(src)/../portals/Kernelenv
 obj-y += llite.o
 llite-objs := llite_lib.o dcache.o super.o rw.o \
 	super25.o file.o dir.o symlink.o namei.o lproc_llite.o \
-	rw26.o llite_nfs.o llite_close.o special.o
+	rw26.o llite_nfs.o llite_close.o special.o llite_mmap.o

diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index bdac6d1..8a497ca 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -361,7 +361,7 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
         if (end < tmpex.l_extent.end >> PAGE_CACHE_SHIFT)
                 end = ~0;
 
-        i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+        i = inode->i_size ? (inode->i_size - 1) >> PAGE_CACHE_SHIFT : 0;
         if (i < end)
                 end = i;
@@ -369,6 +369,19 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
                "count: %lu skip: %lu end: %lu%s\n", start, start % count,
                count, skip, end, discard ? " (DISCARDING)" : "");
 
+        /* walk through the vmas on the inode and tear down mmaped pages that
+         * intersect with the lock.  this stops immediately if there are no
+         * mmap()ed regions of the file.  This is not efficient at all and
+         * should be short lived.
+         * We'll associate mmap()ed pages with the lock and will be able to
+         * find them directly */
+        for (i = start; i <= end; i += (j + skip)) {
+                j = min(count - (i % count), end - i + 1);
+                LASSERT(inode->i_mapping);
+                if (ll_teardown_mmaps(inode->i_mapping, i << PAGE_CACHE_SHIFT,
+                                      ((i+j) << PAGE_CACHE_SHIFT) - 1))
+                        break;
+        }
+
         /* this is the simplistic implementation of page eviction at
          * cancelation. It is careful to get races with other page
          * lockers handled correctly. fixes from bug 20 will make it
@@ -722,12 +735,11 @@ int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
                             loff_t *ppos)
 {
-        struct ll_file_data *fd = filp->private_data;
         struct inode *inode = filp->f_dentry->d_inode;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
-        struct lustre_handle lockh = { 0 };
-        ldlm_policy_data_t policy;
+        struct ll_lock_tree tree;
+        struct ll_lock_tree_node *node;
         int rc;
         ssize_t retval;
         __u64 kms;
@@ -746,10 +758,13 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         if (!lsm)
                 RETURN(0);
 
-        policy.l_extent.start = *ppos;
-        policy.l_extent.end = *ppos + count - 1;
+        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+                                  LCK_PR);
+
+        tree.lt_fd = filp->private_data;
 
-        rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, 0);
+        rc = ll_tree_lock(&tree, node, inode, buf, count,
+                          filp->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT : 0);
         if (rc != 0)
                 RETURN(rc);
@@ -776,7 +791,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         retval = generic_file_read(filp, buf, count, ppos);
 
  out:
-        ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+        ll_tree_unlock(&tree, inode);
         RETURN(retval);
 }
@@ -786,11 +801,10 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
                              loff_t *ppos)
 {
-        struct ll_file_data *fd = file->private_data;
         struct inode *inode = file->f_dentry->d_inode;
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-        struct lustre_handle lockh = { 0 };
-        ldlm_policy_data_t policy;
+        struct ll_lock_tree tree;
+        struct ll_lock_tree_node *node;
         loff_t maxbytes = ll_file_maxbytes(inode);
         ssize_t retval;
         int rc;
@@ -811,15 +825,18 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
 
         LASSERT(lsm);
 
-        if (file->f_flags & O_APPEND) {
-                policy.l_extent.start = 0;
-                policy.l_extent.end = OBD_OBJECT_EOF;
-        } else {
-                policy.l_extent.start = *ppos;
-                policy.l_extent.end = *ppos + count - 1;
-        }
+        if (file->f_flags & O_APPEND)
+                node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW);
+        else
+                node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+                                          LCK_PW);
+
+        if (IS_ERR(node))
+                RETURN(PTR_ERR(node));
+
+        tree.lt_fd = file->private_data;
 
-        rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, 0);
+        rc = ll_tree_lock(&tree, node, inode, buf, count,
+                          file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT : 0);
         if (rc != 0)
                 RETURN(rc);
@@ -844,7 +861,8 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
         retval = generic_file_write(file, buf, count, ppos);
 
  out:
-        ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
+        ll_tree_unlock(&tree, inode);
+        /* serialize with mmap/munmap/mremap */
         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
                             retval > 0 ?
                             retval : 0);
         RETURN(retval);
@@ -1370,7 +1388,7 @@ struct file_operations ll_file_operations = {
         .ioctl          = ll_file_ioctl,
         .open           = ll_file_open,
         .release        = ll_file_release,
-        .mmap           = generic_file_mmap,
+        .mmap           = ll_file_mmap,
         .llseek         = ll_file_seek,
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
         .sendfile       = generic_file_sendfile,

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index c4b3f87..8fcce14 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -165,6 +165,7 @@ void ll_prepare_mdc_op_data(struct mdc_op_data *,
 /* llite/rw.c */
 int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
 int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page);
 void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa);
 void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc);
 void ll_removepage(struct page *page);
@@ -268,6 +269,28 @@ void ll_queue_done_writing(struct inode *inode);
 void ll_close_thread_shutdown(struct ll_close_queue *lcq);
 int ll_close_thread_start(struct ll_close_queue **lcq_ret);
 
+/* llite/llite_mmap.c */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+typedef struct rb_root rb_root_t;
+typedef struct rb_node rb_node_t;
+#endif
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+        rb_root_t                       lt_root;
+        struct list_head                lt_locked_list;
+        struct ll_file_data             *lt_fd;
+};
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first,
+                      __u64 last);
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma);
+struct ll_lock_tree_node *ll_node_from_inode(struct inode *inode, __u64 start,
+                                             __u64 end, ldlm_mode_t mode);
+int ll_tree_lock(struct ll_lock_tree *tree,
+                 struct ll_lock_tree_node *first_node, struct inode *inode,
+                 const char *buf, size_t count, int ast_flags);
+int ll_tree_unlock(struct ll_lock_tree *tree, struct inode *inode);
+
 #define LL_SBI_NOLCK            0x1
 #define LL_SBI_READAHEAD        0x2

diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
new file mode 100644
index 0000000..9e34556
--- /dev/null
+++ b/lustre/llite/llite_mmap.c
@@ -0,0 +1,482 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include 
+#endif
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include 
+#include 
+#include "llite_internal.h"
+#include 
+
+struct ll_lock_tree_node {
+        rb_node_t               lt_node;
+        struct list_head        lt_locked_item;
+        __u64                   lt_oid;
+        ldlm_policy_data_t      lt_policy;
+        struct lustre_handle    lt_lockh;
+        ldlm_mode_t             lt_mode;
+};
+
+__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms);
+int lt_get_mmap_locks(struct ll_lock_tree *tree, struct inode *inode,
+                      unsigned long addr, size_t count);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int *type);
+#else
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int unused);
+#endif
+
+struct ll_lock_tree_node *ll_node_from_inode(struct inode *inode, __u64 start,
+                                             __u64 end, ldlm_mode_t mode)
+{
+        struct ll_lock_tree_node *node;
+
+        OBD_ALLOC(node, sizeof(*node));
+        if (node == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id;
+        node->lt_policy.l_extent.start = start;
+        node->lt_policy.l_extent.end = end;
+        memset(&node->lt_lockh, 0, sizeof(node->lt_lockh));
+        INIT_LIST_HEAD(&node->lt_locked_item);
+        node->lt_mode = mode;
+
+        return node;
+}
+
+int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two)
+{
+        if (one->lt_oid < two->lt_oid)
+                return -1;
+        if (one->lt_oid > two->lt_oid)
+                return 1;
+
+        if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start)
+                return -1;
+        if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end)
+                return 1;
+
+        return 0; /* they are the same object and overlap */
+}
+
+static void lt_merge(struct ll_lock_tree_node *dst,
+                     struct ll_lock_tree_node *src)
+{
+        dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start,
+                                            src->lt_policy.l_extent.start);
+        dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end,
+                                          src->lt_policy.l_extent.end);
+
+        /* XXX could be a real call to the dlm to find superset modes */
+        if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW)
+                dst->lt_mode = LCK_PW;
+}
+
+static void lt_insert(struct ll_lock_tree *tree,
+                      struct ll_lock_tree_node *node)
+{
+        struct ll_lock_tree_node *walk;
+        rb_node_t **p, *parent;
+        ENTRY;
+
+restart:
+        p = &tree->lt_root.rb_node;
+        parent = NULL;
+        while (*p) {
+                parent = *p;
+                walk = rb_entry(parent, struct ll_lock_tree_node, lt_node);
+                switch (lt_compare(node, walk)) {
+                case -1:
+                        p = &(*p)->rb_left;
+                        break;
+                case 1:
+                        p = &(*p)->rb_right;
+                        break;
+                case 0:
+                        lt_merge(node, walk);
+                        rb_erase(&walk->lt_node, &tree->lt_root);
+                        OBD_FREE(walk, sizeof(*walk));
+                        goto restart;
+                        break;
+                default:
+                        LBUG();
+                        break;
+                }
+        }
+        rb_link_node(&node->lt_node, parent, p);
+        rb_insert_color(&node->lt_node, &tree->lt_root);
+        EXIT;
+}
+
+static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree)
+{
+        rb_node_t *rbnode;
+        struct ll_lock_tree_node *node = NULL;
+
+        for (rbnode = tree->lt_root.rb_node; rbnode != NULL;
+             rbnode = rbnode->rb_left) {
+                if (rbnode->rb_left == NULL) {
+                        node = rb_entry(rbnode, struct ll_lock_tree_node,
+                                        lt_node);
+                        break;
+                }
+        }
+        RETURN(node);
+}
+
+int ll_tree_unlock(struct ll_lock_tree *tree, struct inode *inode)
+{
+        struct ll_lock_tree_node *node;
+        struct list_head *pos, *n;
+        int rc = 0;
+        ENTRY;
+
+        list_for_each_safe(pos, n, &tree->lt_locked_list) {
+                node = list_entry(pos, struct ll_lock_tree_node,
+                                  lt_locked_item);
+
+                rc = ll_extent_unlock(tree->lt_fd, inode,
+                                      ll_i2info(inode)->lli_smd, node->lt_mode,
+                                      &node->lt_lockh);
+                if (rc != 0) {
+                        /* XXX better message */
+                        CERROR("couldn't unlock %d\n", rc);
+                }
+                list_del(&node->lt_locked_item);
+                OBD_FREE(node, sizeof(*node));
+        }
+
+        while ((node = lt_least_node(tree))) {
+                rb_erase(&node->lt_node, &tree->lt_root);
+                OBD_FREE(node, sizeof(*node));
+        }
+
+        RETURN(rc);
+}
+
+int ll_tree_lock(struct ll_lock_tree *tree,
+                 struct ll_lock_tree_node *first_node, struct inode *inode,
+                 const char *buf, size_t count, int ast_flags)
+{
+        struct ll_lock_tree_node *node;
+        int rc = 0;
+        ENTRY;
+
+        tree->lt_root.rb_node = NULL;
+        INIT_LIST_HEAD(&tree->lt_locked_list);
+        if (first_node != NULL)
+                lt_insert(tree, first_node);
+
+        if (mapping_mapped(inode->i_mapping)) {
+                rc = lt_get_mmap_locks(tree, inode, (unsigned long)buf, count);
+                if (rc)
+                        GOTO(out, rc);
+        }
+
+        while ((node = lt_least_node(tree))) {
+                rc = ll_extent_lock(tree->lt_fd, inode,
+                                    ll_i2info(inode)->lli_smd, node->lt_mode,
+                                    &node->lt_policy, &node->lt_lockh,
+                                    ast_flags);
+                if (rc != 0)
+                        GOTO(out, rc);
+
+                rb_erase(&node->lt_node, &tree->lt_root);
+                list_add_tail(&node->lt_locked_item, &tree->lt_locked_list);
+        }
+        RETURN(rc);
+out:
+        ll_tree_unlock(tree, inode);
+        RETURN(rc);
+}
+
+static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma)
+{
+        /* we only want to hold PW locks if the mmap() can generate
+         * writes back to the file and that only happens in shared
+         * writable vmas */
+        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+                return LCK_PW;
+        return LCK_PR;
+}
+
+static void policy_from_vma(ldlm_policy_data_t *policy,
+                            struct vm_area_struct *vma, unsigned long addr,
+                            size_t count)
+{
+        policy->l_extent.start = ((addr - vma->vm_start) & PAGE_CACHE_MASK) +
+                                 (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+        policy->l_extent.end = (policy->l_extent.start + count - 1) |
+                               (PAGE_CACHE_SIZE - 1);
+}
+
+static struct vm_area_struct *our_vma(unsigned long addr, size_t count)
+{
+        struct mm_struct *mm = current->mm;
+        struct vm_area_struct *vma, *ret = NULL;
+        ENTRY;
+
+        spin_lock(&mm->page_table_lock);
+        for (vma = find_vma(mm, addr);
+             vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+                if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage) {
+                        ret = vma;
+                        break;
+                }
+        }
+        spin_unlock(&mm->page_table_lock);
+        RETURN(ret);
+}
+
+int lt_get_mmap_locks(struct ll_lock_tree *tree, struct inode *inode,
+                      unsigned long addr, size_t count)
+{
+        struct vm_area_struct *vma;
+        struct ll_lock_tree_node *node;
+        ldlm_policy_data_t policy;
+        ENTRY;
+
+        if (count == 0)
+                RETURN(0);
+
+        /* we need to look up vmas on page aligned addresses */
+        count += addr & (PAGE_SIZE - 1);
+        addr -= addr & (PAGE_SIZE - 1);
+
+        while ((vma = our_vma(addr, count)) != NULL) {
+
+                policy_from_vma(&policy, vma, addr, count);
+                node = ll_node_from_inode(inode, policy.l_extent.start,
+                                          policy.l_extent.end,
+                                          mode_from_vma(vma));
+                if (IS_ERR(node)) {
+                        CERROR("not enough mem for lock_tree_node!\n");
+                        RETURN(-ENOMEM);
+                }
+                lt_insert(tree, node);
+
+                if (vma->vm_end - addr >= count)
+                        break;
+                count -= vma->vm_end - addr;
+                addr = vma->vm_end;
+        }
+        RETURN(0);
+}
+
+/* FIXME: there is a pagefault race that goes as follows:
+ * 1. A user process on node A accesses a portion of a mapped file,
+ *    resulting in a page fault.
+ *    The pagefault handler invokes the ll_nopage function, which reads
+ *    the page into memory.
+ * 2. A user process on node B writes to the same portion of the file
+ *    (either via mmap or write()), causing node A to cancel the
+ *    lock and truncate the page.
+ * 3. Node A then executes the rest of do_no_page(), entering the
+ *    now-invalid page into the PTEs.
+ *
+ * Making the whole of do_no_page() a hook that installs both the page
+ * cache entry and the page mapping under the DLM lock would eliminate
+ * this race.
+ */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int *type)
+#else
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int unused)
+#endif
+{
+        struct file *filp = vma->vm_file;
+        struct ll_file_data *fd = filp->private_data;
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct lustre_handle lockh = { 0 };
+        ldlm_policy_data_t policy;
+        ldlm_mode_t mode;
+        struct page *page;
+        __u64 kms;
+        unsigned long pgoff, size, rand_read, seq_read;
+        int rc = 0;
+        ENTRY;
+
+        if (ll_i2info(inode)->lli_smd == NULL) {
+                CERROR("No lsm on fault?\n");
+                RETURN(NULL);
+        }
+
+        /* start and end the lock on the first and last bytes in the page */
+        policy_from_vma(&policy, vma, address, PAGE_CACHE_SIZE);
+
+        CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
+               vma, inode->i_ino, policy.l_extent.start,
+               policy.l_extent.end);
+
+        mode = mode_from_vma(vma);
+
+        rc = ll_extent_lock(fd, inode, ll_i2info(inode)->lli_smd, mode, &policy,
+                            &lockh, LDLM_FL_CBPENDING);
+        if (rc != 0)
+                RETURN(NULL);
+
+        /* XXX change inode size without i_sem hold! there is a race condition
+         *     with truncate path. (see ll_extent_lock) */
+        kms = lov_merge_size(ll_i2info(inode)->lli_smd, 1);
+        pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+        size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+        if (pgoff >= size)
+                ll_glimpse_size(inode);
+        else
+                inode->i_size = kms;
+
+        /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+         * the kernel will not read other pages not covered by ldlm in
+         * filemap_nopage. we do our readahead in ll_readpage.
+         */
+        rand_read = vma->vm_flags & VM_RAND_READ;
+        seq_read = vma->vm_flags & VM_SEQ_READ;
+        vma->vm_flags &= ~VM_SEQ_READ;
+        vma->vm_flags |= VM_RAND_READ;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+        page = filemap_nopage(vma, address, type);
+#else
+        page = filemap_nopage(vma, address, unused);
+#endif
+        vma->vm_flags &= ~VM_RAND_READ;
+        vma->vm_flags |= (rand_read | seq_read);
+
+        ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+        RETURN(page);
+}
+
+/* return the user space pointer that maps to a file offset via a vma */
+static inline unsigned long file_to_user(struct vm_area_struct *vma,
+                                         __u64 byte)
+{
+        return vma->vm_start +
+               (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT));
+}
+
+#define VMA_DEBUG(vma, fmt, arg...)                                          \
+        CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld): " fmt,       \
+               vma, vma->vm_start, vma->vm_end, vma->vm_pgoff, ## arg);
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+/* [first, last] are the byte offsets affected.
+ * vm_{start, end} are user addresses of the first byte of the mapping and
+ * the next byte beyond it
+ * vm_pgoff is the page index of the first byte in the mapping */
+static void teardown_vmas(struct vm_area_struct *vma, __u64 first,
+                          __u64 last)
+{
+        unsigned long address, len;
+        for (; vma; vma = vma->vm_next_share) {
+                if (last >> PAGE_CACHE_SHIFT < vma->vm_pgoff)
+                        continue;
+                if (first >> PAGE_CACHE_SHIFT > (vma->vm_pgoff +
+                    ((vma->vm_end - vma->vm_start) >> PAGE_CACHE_SHIFT)))
+                        continue;
+
+                address = max((unsigned long)vma->vm_start,
+                              file_to_user(vma, first));
+                len = min((unsigned long)vma->vm_end,
+                          file_to_user(vma, last) + 1) - address;
+
+                VMA_DEBUG(vma, "zapping vma [address=%ld len=%ld]\n",
+                          address, len);
+                LASSERT(vma->vm_mm);
+                ll_zap_page_range(vma, address, len);
+        }
+}
+#endif
+
+/* XXX put nice comment here. talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first,
+                      __u64 last)
+{
+        int rc = -ENOENT;
+        ENTRY;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+        if (mapping_mapped(mapping)) {
+                rc = 0;
+                unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
+                                    last - first + 1, 1);
+        }
+#else
+        spin_lock(&mapping->i_shared_lock);
+        if (mapping->i_mmap != NULL) {
+                rc = 0;
+                teardown_vmas(mapping->i_mmap, first, last);
+        }
+        if (mapping->i_mmap_shared != NULL) {
+                rc = 0;
+                teardown_vmas(mapping->i_mmap_shared, first, last);
+        }
+        spin_unlock(&mapping->i_shared_lock);
+#endif
+        RETURN(rc);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+        .nopage = ll_nopage,
+};
+
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        int rc;
+        ENTRY;
+
+        rc = generic_file_mmap(file, vma);
+        if (rc == 0)
+                vma->vm_ops = &ll_file_vm_ops;
+
+        RETURN(rc);
+}

diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 4e09d2f..83252cc 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -390,6 +390,57 @@ struct ll_async_page *llap_from_page(struct page *page)
         RETURN(llap);
 }
 
+static int queue_or_sync_write(struct obd_export *exp,
+                               struct lov_stripe_md *lsm,
+                               struct ll_async_page *llap,
+                               unsigned to,
+                               obd_flag async_flags)
+{
+        struct obd_io_group *oig;
+        int rc;
+        ENTRY;
+
+        /* _make_ready only sees llap once we've unlocked the page */
+        llap->llap_write_queued = 1;
+        rc = obd_queue_async_io(exp, lsm, NULL, llap->llap_cookie,
+                                OBD_BRW_WRITE, 0, 0, 0, async_flags);
+        if (rc == 0) {
+                LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n");
+                //llap_write_pending(inode, llap);
+                GOTO(out, 0);
+        }
+
+        llap->llap_write_queued = 0;
+
+        rc = oig_init(&oig);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = obd_queue_group_io(exp, lsm, NULL, oig, llap->llap_cookie,
+                                OBD_BRW_WRITE, 0, to, 0, ASYNC_READY |
+                                ASYNC_URGENT | ASYNC_COUNT_STABLE |
+                                ASYNC_GROUP_SYNC);
+        if (rc)
+                GOTO(free_oig, rc);
+
+        rc = obd_trigger_group_io(exp, lsm, NULL, oig);
+        if (rc)
+                GOTO(free_oig, rc);
+
+        rc = oig_wait(oig);
+
+        if (!rc && async_flags & ASYNC_READY)
+                unlock_page(llap->llap_page);
+
+        LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n",
+                       rc);
+
+free_oig:
+        oig_release(oig);
+out:
+        RETURN(rc);
+}
+
 void lov_increase_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
                       obd_off size);
 /* update our write count to account for i_size increases that may have
@@ -429,39 +480,11 @@ int ll_commit_write(struct file *file, struct page *page, unsigned from,
                 exp = ll_i2obdexp(inode);
                 if (exp == NULL)
                         RETURN(-EINVAL);
-
-                /* _make_ready only sees llap once we've unlocked
-                 * the page */
-                llap->llap_write_queued = 1;
-                rc = obd_queue_async_io(exp, lsm, NULL, llap->llap_cookie,
-                                        OBD_BRW_WRITE, 0, 0, 0, 0);
-                if (rc != 0) { /* async failed, try sync.. */
-                        struct obd_io_group *oig;
-                        rc = oig_init(&oig);
-                        if (rc)
-                                GOTO(out, rc);
-
-                        llap->llap_write_queued = 0;
-                        rc = obd_queue_group_io(exp, lsm, NULL, oig,
-                                                llap->llap_cookie,
-                                                OBD_BRW_WRITE, 0, to, 0,
-                                                ASYNC_READY | ASYNC_URGENT |
-                                                ASYNC_COUNT_STABLE |
-                                                ASYNC_GROUP_SYNC);
-
-                        if (rc)
-                                GOTO(free_oig, rc);
-
-                        rc = obd_trigger_group_io(exp, lsm, NULL, oig);
-                        if (rc)
-                                GOTO(free_oig, rc);
-
-                        rc = oig_wait(oig);
-free_oig:
-                        oig_release(oig);
+
+                rc = queue_or_sync_write(exp, ll_i2info(inode)->lli_smd, llap,
+                                         to, 0);
+                if (rc)
                         GOTO(out, rc);
-                }
-                LL_CDEBUG_PAGE(D_PAGE, page, "write queued\n");
-                //llap_write_pending(inode, llap);
         } else {
                 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                                      LPROC_LL_DIRTY_HITS);
@@ -506,6 +529,44 @@ static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
         spin_unlock(&sbi->ll_lock);
 }
 
+int ll_writepage(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct obd_export *exp;
+        struct ll_async_page *llap;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(!PageDirty(page));
+        LASSERT(PageLocked(page));
+
+        exp = ll_i2obdexp(inode);
+        if (exp == NULL)
+                GOTO(out, rc = -EINVAL);
+
+        llap = llap_from_page(page);
+        if (IS_ERR(llap))
+                GOTO(out, rc = PTR_ERR(llap));
+
+        page_cache_get(page);
+        if (llap->llap_write_queued) {
+                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
+                rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
+                                         llap->llap_cookie,
+                                         ASYNC_READY | ASYNC_URGENT);
+        } else {
+                rc = queue_or_sync_write(exp, ll_i2info(inode)->lli_smd, llap,
+                                         PAGE_SIZE, ASYNC_READY |
+                                         ASYNC_URGENT);
+        }
+        if (rc)
+                page_cache_release(page);
+out:
+        if (rc)
+                unlock_page(page);
+        RETURN(rc);
+}
+
 /* called for each page in a completed rpc.*/
 void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
 {
@@ -957,17 +1018,10 @@ int ll_readpage(struct file *filp, struct page *page)
         }
 
         if (rc == 0) {
-                static unsigned long next_print;
-                CDEBUG(D_INODE, "ino %lu page %lu (%llu) didn't match a lock\n",
-                       inode->i_ino, page->index,
-                       (long long)page->index << PAGE_CACHE_SHIFT);
-                if (0 && time_after(jiffies, next_print)) {
-                        CWARN("ino %lu page %lu (%llu) not covered by "
-                              "a lock (mmap?). check debug logs.\n",
-                              inode->i_ino, page->index,
-                              (long long)page->index << PAGE_CACHE_SHIFT);
-                        next_print = jiffies + 30 * HZ;
-                }
+                CWARN("ino %lu page %lu (%llu) not covered by "
+                      "a lock (mmap?). "
+                      "check debug logs.\n",
+                      inode->i_ino, page->index,
+                      (long long)page->index << PAGE_CACHE_SHIFT);
         }
 
         rc = ll_issue_page_read(exp, llap, oig, 0);

diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c
index 8a3099f..fc3cab1 100644
--- a/lustre/llite/rw24.c
+++ b/lustre/llite/rw24.c
@@ -49,49 +49,6 @@
 #include "llite_internal.h"
 #include 
 
-static int ll_writepage_24(struct page *page)
-{
-        struct inode *inode = page->mapping->host;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        int rc = 0;
-        ENTRY;
-
-        LASSERT(!PageDirty(page));
-        LASSERT(PageLocked(page));
-
-        exp = ll_i2obdexp(inode);
-        if (exp == NULL)
-                GOTO(out, rc = -EINVAL);
-
-        llap = llap_from_page(page);
-        if (IS_ERR(llap))
-                GOTO(out, rc = PTR_ERR(llap));
-
-        page_cache_get(page);
-        if (llap->llap_write_queued) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
-                rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
-                                         llap->llap_cookie,
-                                         ASYNC_READY | ASYNC_URGENT);
-        } else {
-                llap->llap_write_queued = 1;
-                rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
-                                        llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
-                                        0, ASYNC_READY | ASYNC_URGENT);
-                if (rc == 0)
-                        LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
-                else
-                        llap->llap_write_queued = 0;
-        }
-        if (rc)
-                page_cache_release(page);
-out:
-        if (rc)
-                unlock_page(page);
-        RETURN(rc);
-}
-
 static int ll_direct_IO_24(int rw,
 #ifdef HAVE_DIO_FILE
                            struct file *file,
@@ -179,7 +136,7 @@ static int ll_direct_IO_24(int rw,
 struct address_space_operations ll_aops = {
         .readpage       = ll_readpage,
         .direct_IO      = ll_direct_IO_24,
-        .writepage      = ll_writepage_24,
+        .writepage      = ll_writepage,
         .prepare_write  = ll_prepare_write,
         .commit_write   = ll_commit_write,
         .removepage     = ll_removepage,

diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index 71964de..53bde80 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -51,51 +51,6 @@
 #include "llite_internal.h"
 #include 
 
-static int ll_writepage_26(struct page *page, struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        int rc;
-        ENTRY;
-
-        LASSERT(!PageDirty(page));
-        LASSERT(PageLocked(page));
-
-        exp = ll_i2obdexp(inode);
-        if (exp == NULL)
-                GOTO(out, rc = -EINVAL);
-
-        llap = llap_from_page(page);
-        if (IS_ERR(llap))
-                GOTO(out, rc = PTR_ERR(llap));
-
-        page_cache_get(page);
-        if (llap->llap_write_queued) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
-                rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
-                                         llap->llap_cookie,
-                                         ASYNC_READY | ASYNC_URGENT);
-        } else {
-                llap->llap_write_queued = 1;
-                rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
-                                        llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
-                                        0, ASYNC_READY | ASYNC_URGENT);
-                if (rc == 0)
-                        LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
-                else
-                        llap->llap_write_queued = 0;
-        }
-        if (rc)
-                page_cache_release(page);
-out:
-        if (rc)
-                unlock_page(page);
-        else
-                set_page_writeback(page);
-        RETURN(rc);
-}
-
 /* It is safe to not check anything in invalidatepage/releasepage below
    because they are run with page locked and all our io is happening with
    locked page too */
@@ -117,7 +72,7 @@ struct address_space_operations ll_aops = {
         .readpage       = ll_readpage,
 //        .readpages      = ll_readpages,
 //        .direct_IO      = ll_direct_IO_26,
-        .writepage      = ll_writepage_26,
+        .writepage      = ll_writepage,
         .writepages     = generic_writepages,
         .set_page_dirty = __set_page_dirty_nobuffers,
         .sync_page      = NULL,

diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4
index 27704bd..94fa984 100644
--- a/lustre/portals/archdep.m4
+++ b/lustre/portals/archdep.m4
@@ -436,6 +436,16 @@ if test x$enable_modules != xno ; then
 	AC_MSG_RESULT([no])
 ])
 
+	# --------- zap_page_range(vma) --------------------------------
+	AC_MSG_CHECKING([if zap_page_range with vma parameter])
+	ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`"
+	if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then
+		AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter])
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+	fi
+
 	# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
 	# This needs to run after we've defined the KCPPFLAGS

diff --git a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h
index acf4045..cad7a69 100644
--- a/lustre/portals/include/linux/libcfs.h
+++ b/lustre/portals/include/linux/libcfs.h
@@ -90,6 +90,7 @@ struct ptldebug_header {
 #define D_RPCTRACE    0x00100000 /* for distributed debugging */
 #define D_VFSTRACE    0x00200000
 #define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP        0x00800000
 
 #ifdef __KERNEL__
 # include                       /* THREAD_SIZE */

diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c
index e546aaf..dce196f 100644
--- a/lustre/portals/utils/debug.c
+++ b/lustre/portals/utils/debug.c
@@ -74,7 +74,7 @@ static const char *portal_debug_masks[] =
         {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
          "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
          "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace",
-         "reada", NULL};
+         "reada", "mmap", NULL};
 
 struct debug_daemon_cmd {
         char *cmd;

diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore
index bc148be..3eb90ab 100644
--- a/lustre/tests/.cvsignore
+++ b/lustre/tests/.cvsignore
@@ -65,3 +65,5 @@ ll_dirstripe_verify
 openfilleddirunlink
 copy_attr
 rename_many
+memhog
+mmap_sanity

diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am
index bb3368d..166755f 100644
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -24,10 +24,12 @@ noinst_PROGRAMS += wantedi statone runas openfile getdents mkdirdeep o_directory
 noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod
 noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany rmdirmany
 noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2
+noinst_PROGRAMS += mmap_sanity
 # noinst_PROGRAMS += ldaptest copy_attr
 bin_PROGRAMS = mcreate munlink
 endif # TESTS
 
+mmap_sanity_SOURCES= mmap_sanity.c
 stat_SOURCES = stat.c stat_fs.h
 mkdirdeep_LDADD=-L$(top_builddir)/portals/utils -lptlctl $(LIBREADLINE)
 #write_append_truncate_CC=mpicc

diff --git a/lustre/tests/mmap_sanity.c b/lustre/tests/mmap_sanity.c
new file mode 100644
index 0000000..3fd0b0e
--- /dev/null
+++ b/lustre/tests/mmap_sanity.c
@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+char *dir = NULL, *node = NULL, *dir2 = NULL;
+long page_size;
+char mmap_sanity[256];
+
+
+static void usage(void)
+{
+        printf("Usage: mmap_sanity -d dir [-n node | -m dir2]\n");
+        printf("       dir      lustre mount point\n");
+        printf("       node     another client\n");
+        printf("       dir2     another mount point\n");
+        exit(127);
+}
+
+#define MMAP_NOTIFY_PORT        7676
+static int mmap_notify(char *target, char *str, int delay)
+{
+        unsigned short port =
+                MMAP_NOTIFY_PORT;
+        int socket_type = SOCK_DGRAM;
+        struct sockaddr_in server;
+        struct hostent *hp;
+        int len, sockfd, rc = 0;
+
+        if (target == NULL)
+                return 0;
+
+        sockfd = socket(AF_INET, socket_type, 0);
+        if (sockfd < 0) {
+                perror("socket()");
+                return errno;
+        }
+
+        if ((hp = gethostbyname(target)) == NULL) {
+                perror(target);
+                rc = errno;
+                goto out_close;
+        }
+
+        memset(&server, 0, sizeof(server));
+        memcpy(&(server.sin_addr), hp->h_addr, hp->h_length);
+        server.sin_family = AF_INET;
+        server.sin_port = htons(port);
+
+        len = sizeof(server);
+        if (delay)
+                sleep(delay);
+
+        rc = sendto(sockfd, str, strlen(str), 0,
+                    (struct sockaddr *)&server, len);
+        if (rc < 0) {
+                perror("sendto()");
+                rc = errno;
+        } else
+                rc = 0;
+
+out_close:
+        close(sockfd);
+        return rc;
+}
+
+static int mmap_wait(char *str, int timeout)
+{
+        unsigned short port = MMAP_NOTIFY_PORT;
+        int socket_type = SOCK_DGRAM;
+        struct sockaddr_in local, from;
+        char host[256];
+        struct hostent *hp;
+        fd_set rfds;
+        struct timeval tv;
+        int sockfd, rc = 0;
+
+        if (dir2 != NULL)
+                return 0;
+
+        memset(host, 0, sizeof(host));
+        if (gethostname(host, sizeof(host))) {
+                perror("gethostname()");
+                return errno;
+        }
+
+        if ((hp = gethostbyname(host)) == NULL) {
+                perror(host);
+                return errno;
+        }
+
+        local.sin_family = AF_INET;
+        memcpy(&(local.sin_addr), hp->h_addr, hp->h_length);
+        local.sin_port = htons(port);
+
+        sockfd = socket(AF_INET, socket_type, 0);
+        if (sockfd < 0) {
+                perror("socket()");
+                return errno;
+        }
+
+        rc = bind(sockfd, (struct sockaddr *)&local, sizeof(local));
+        if (rc < 0) {
+                perror("bind()");
+                rc = errno;
+                goto out_close;
+        }
+
+        FD_ZERO(&rfds);
+        FD_SET(sockfd, &rfds);
+        tv.tv_sec = timeout ? timeout : 5;
+        tv.tv_usec = 0;
+
+        rc = select(sockfd + 1, &rfds, NULL, NULL, &tv);
+        if (rc) { /* got data */
+                char buffer[1024];
+                int fromlen = sizeof(from);
+
+                memset(buffer, 0, sizeof(buffer));
+                rc = recvfrom(sockfd, buffer, sizeof(buffer), 0,
+                              (struct sockaddr *)&from, &fromlen);
+                if (rc <= 0) {
+                        perror("recvfrom()");
+                        rc = errno;
+                        goto out_close;
+                }
+                rc = 0;
+
+                if (strncmp(str, buffer, strlen(str)) != 0) {
+                        fprintf(stderr, "expected string mismatch!\n");
+                        rc = EINVAL;
+                }
+        } else { /* timeout */
+                fprintf(stderr, "timeout!\n");
+                rc = ETIME;
+        }
+
+out_close:
+        close(sockfd);
+        return rc;
+}
+
+static int remote_tst(int tc, char *mnt);
+static int mmap_run(char *host, int tc)
+{
+        pid_t child;
+        char nodearg[256], command[256];
+        int rc = 0;
+
+        child = fork();
+        if (child < 0)
+                return errno;
+        else if (child)
+                return 0;
+
+        if (dir2 != NULL) {
+                rc = remote_tst(tc, dir2);
+        } else {
+                sprintf(nodearg, "-w %s", node);
+                sprintf(command, "%s -d %s -n %s -c %d",
+                        mmap_sanity, dir, host, tc);
+                rc = execlp("pdsh", "pdsh", "-S", nodearg, command, NULL);
+                if (rc)
+                        perror("execlp()");
+        }
+        _exit(rc);
+}
+
+static int mmap_initialize(char *myself, int tc)
+{
+        char buf[1024], *file;
+        int fdr, fdw, count, rc = 0;
+
+        page_size = sysconf(_SC_PAGESIZE);
+        if (page_size == -1) {
+                perror("sysconf(_SC_PAGESIZE)");
+                return errno;
+        }
+        if (tc)
+                return 0;
+
+        /* copy myself to lustre for another client */
+        fdr = open(myself, O_RDONLY);
+        if (fdr < 0) {
+                perror(myself);
+                return EINVAL;
+        }
+        file = strrchr(myself, '/');
+        if (file == NULL) {
+                fprintf(stderr, "can't get test filename\n");
+                close(fdr);
+                return EINVAL;
+        }
+        file++;
+        sprintf(mmap_sanity, "%s/%s", dir, file);
+
+        fdw = open(mmap_sanity, O_CREAT|O_WRONLY, 0777);
+        if (fdw < 0) {
+                perror(mmap_sanity);
+                close(fdr);
+                return
+                        EINVAL;
+        }
+        while ((count = read(fdr, buf, sizeof(buf))) != 0) {
+                int writes;
+
+                if (count < 0) {
+                        perror("read()");
+                        rc = errno;
+                        break;
+                }
+                writes = write(fdw, buf, count);
+                if (writes != count) {
+                        perror("write()");
+                        rc = errno;
+                        break;
+                }
+        }
+        close(fdr);
+        close(fdw);
+        return rc;
+}
+
+static void mmap_finalize(int tc)
+{
+        if (tc)
+                return;
+        unlink(mmap_sanity);
+}
+
+/* basic mmap operation on single node */
+static int mmap_tst1(char *mnt)
+{
+        char *ptr, mmap_file[256];
+        int region, fd, rc = 0;
+
+        region = page_size * 10;
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file1");
+
+        if (unlink(mmap_file) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        ftruncate(fd, region);
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+        memset(ptr, 'a', region);
+
+        munmap(ptr, region);
+out_close:
+        close(fd);
+        unlink(mmap_file);
+        return rc;
+}
+
+/* MAP_PRIVATE creates a copy-on-write mmap */
+static int mmap_tst2(char *mnt)
+{
+        char *ptr, mmap_file[256], buf[256];
+        int fd, rc = 0;
+
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file2");
+
+        if (unlink(mmap_file) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        ftruncate(fd, page_size);
+
+        ptr = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+        memcpy(ptr, "blah", strlen("blah"));
+
+        munmap(ptr, page_size);
+out_close:
+        close(fd);
+        if (rc)
+                return rc;
+
+        fd = open(mmap_file, O_RDONLY);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        rc = read(fd, buf, sizeof(buf));
+        if (rc < 0) {
+                perror("read()");
+                rc = errno;
+                goto out_close;
+        }
+        rc = 0;
+
+        if (strncmp("blah", buf, strlen("blah")) == 0) {
+                fprintf(stderr, "mmap write back with MAP_PRIVATE!\n");
+                rc = EFAULT;
+        }
+        close(fd);
+        unlink(mmap_file);
+        return rc;
+}
+
+/* concurrent mmap operations on two nodes */
+static int mmap_tst3(char *mnt)
+{
+        char *ptr, mmap_file[256], host[256];
+        int region, fd, rc = 0;
+
+        region = page_size * 100;
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file3");
+
+        if (unlink(mmap_file) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        ftruncate(fd, region);
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+
+        if (gethostname(host, sizeof(host))) {
+                perror("gethostname()");
+                rc = errno;
+                goto out_unmap;
+        }
+
+        rc = mmap_run(host, 3);
+        if (rc)
+                goto out_unmap;
+
+        rc = mmap_wait("mmap done", 10);
+        memset(ptr, 'a', region);
+
+        sleep(2);       /* wait for remote test finish */
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        close(fd);
+        unlink(mmap_file);
+        return rc;
+}
+
+static int remote_tst3(char *mnt)
+{
+        char *ptr, mmap_file[256];
+        int region, fd, rc = 0;
+
+        region = page_size * 100;
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file3");
+
+        fd = open(mmap_file, O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+        memset(ptr, 'b', region);
+
+        rc = mmap_notify(node, "mmap done", 1);
+        if (rc)
+                goto out_unmap;
+
+        memset(ptr, 'c', region);
+
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        close(fd);
+        return rc;
+}
+
+/* client1 writes to file_4a from mmap()ed file_4b;
+ * client2 writes to file_4b from mmap()ed file_4a. */
+static int mmap_tst4(char *mnt)
+{
+        char *ptr, filea[256], fileb[256], host[256];
+        int region, fdr, fdw, rc = 0;
+
+        region = page_size * 100;
+        sprintf(filea, "%s/%s", mnt, "mmap_file_4a");
+        sprintf(fileb, "%s/%s", mnt, "mmap_file_4b");
+
+        if (unlink(filea) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+        if (unlink(fileb) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fdr = fdw = -1;
+        fdr = open(fileb, O_CREAT|O_RDWR, 0600);
+        if (fdr < 0) {
+                perror(fileb);
+                return errno;
+        }
+        ftruncate(fdr, region);
+        fdw = open(filea, O_CREAT|O_RDWR, 0600);
+        if (fdw < 0) {
+                perror(filea);
+                rc = errno;
+                goto out_close;
+        }
+        ftruncate(fdw, region);
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fdr, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+
+        if (gethostname(host, sizeof(host))) {
+                perror("gethostname()");
+                rc = errno;
+                goto out_unmap;
+        }
+
+        rc = mmap_run(host, 4);
+        if (rc)
+                goto out_unmap;
+
+        rc = mmap_wait("mmap done", 10);
+        if (rc)
+                goto out_unmap;
+
+        memset(ptr, '1', region);
+
+        rc = write(fdw, ptr, region);
+        if (rc <= 0) {
+                perror("write()");
+                rc = errno;
+        } else
+                rc = 0;
+
+        sleep(2);       /* wait for remote test finish */
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        if (fdr >= 0)
+                close(fdr);
+        if (fdw >= 0)
+                close(fdw);
+        unlink(filea);
+        unlink(fileb);
+        return rc;
+}
+
+static int remote_tst4(char *mnt)
+{
+        char *ptr, filea[256], fileb[256];
+        int region, fdr, fdw, rc = 0;
+
+        region = page_size * 100;
+        sprintf(filea, "%s/%s", mnt, "mmap_file_4a");
+        sprintf(fileb, "%s/%s", mnt, "mmap_file_4b");
+
+        fdr = fdw = -1;
+        fdr = open(filea, O_RDWR, 0600);
+        if (fdr < 0) {
+                perror(filea);
+                return errno;
+        }
+        fdw = open(fileb, O_RDWR, 0600);
+        if (fdw < 0) {
+                perror(fileb);
+                rc = errno;
+                goto out_close;
+        }
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fdr, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+
+        rc = mmap_notify(node, "mmap done", 1);
+        if (rc)
+                goto out_unmap;
+
+        memset(ptr, '2', region);
+
+        rc = write(fdw, ptr, region);
+        if (rc <= 0) {
+                perror("write()");
+                rc = errno;
+        } else
+                rc = 0;
+
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        if (fdr >= 0)
+                close(fdr);
+        if (fdw >= 0)
+                close(fdw);
+        return rc;
+}
+
+static int remote_tst(int tc, char *mnt)
+{
+        int rc = 0;
+
+        switch (tc) {
+        case 3:
+                rc = remote_tst3(mnt);
+                break;
+        case 4:
+                rc = remote_tst4(mnt);
+                break;
+        case 1:
+        case 2:
+        default:
+                fprintf(stderr, "wrong test case number %d\n", tc);
+                rc = EINVAL;
+                break;
+        }
+        return rc;
+}
+
+struct test_case {
+        int     tc;                     /* test case number */
+        char    *desc;                  /* test description */
+        int     (*test_fn)(char *mnt);  /* test function */
+        int     node_cnt;               /* node count */
+};
+
+struct test_case tests[] = {
+        { 1, "mmap test1: basic mmap operation", mmap_tst1, 1 },
+        { 2, "mmap test2: MAP_PRIVATE not write back", mmap_tst2, 1 },
+        { 3, "mmap test3: concurrent mmap ops on two nodes", mmap_tst3, 2 },
+        { 4, "mmap test4: c1 writes to f1 from mmapped f2, "
+             "c2 writes to f2 from mmapped f1", mmap_tst4, 2 },
+        { 0, NULL, 0, 0 }
+};
+
+int main(int argc, char **argv)
+{
+        extern char *optarg;
+        struct test_case *test;
+        int c, rc = 0, tc = 0;
+
+        for (;;) {
+                c = getopt(argc, argv, "d:n:c:m:");
+                if (c == -1)
+                        break;
+
+                switch (c) {
+                case 'd':
+                        dir = optarg;
+                        break;
+                case 'n':
+                        node = optarg;
+                        break;
+                case 'c':
+                        tc = atoi(optarg);
+                        break;
+                case 'm':
+                        dir2 = optarg;
+                        break;
+                default:
+                case '?':
+                        usage();
+                        break;
+                }
+        }
+
+        if (dir == NULL)
+                usage();
+        if (dir2 != NULL && node != NULL)
+                usage();
+
+        if (mmap_initialize(argv[0], tc) != 0) {
+                fprintf(stderr, "mmap_initialize failed!\n");
+                return EINVAL;
+        }
+
+        if (tc) {
+                rc = remote_tst(tc, dir);
+                goto out;
+        }
+
+        for (test = tests; test->tc; test++) {
+                char *rs = "skip";
+                rc = 0;
+                if (test->node_cnt == 1 || node != NULL || dir2 != NULL) {
+                        rc = test->test_fn(dir);
+                        rs = rc ? "fail" : "pass";
+                }
+                fprintf(stderr, "%s (%s)\n", test->desc, rs);
+                if (rc)
+                        break;
+        }
+out:
+        mmap_finalize(tc);
+        return rc;
+}

diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh
index c3e0a80..cc578b4 100644
--- a/lustre/tests/sanityN.sh
+++ b/lustre/tests/sanityN.sh
@@ -4,7 +4,7 @@ set -e
 
 ONLY=${ONLY:-"$*"}
 # bug number for skipped test:  1768 3192
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b 14c"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
@@ -334,7 +334,7 @@ test_15() {	# bug 974 - ENOSPC
 run_test 15 "test out-of-space with multiple writers ==========="
 
 test_16() {
-	fsx -R -W -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
+	fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
 }
 run_test 16 "2500 iterations of dual-mount fsx ================="
 
@@ -359,6 +359,11 @@ test_17() { # bug 3513, 3667
 }
 run_test 17 "resource creation/LVB creation race ==============="
 
+test_18() {
+	./mmap_sanity -d $MOUNT1 -m $MOUNT2
+}
+run_test 18 "mmap sanity check ================================="
+
 log "cleanup: ======================================================"
 rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true

diff --git a/lustre/utils/lconf b/lustre/utils/lconf
index 049491f7..3f00da0 100755
--- a/lustre/utils/lconf
+++ b/lustre/utils/lconf
@@ -88,6 +88,7 @@ ptldebug_names = {
     "rpctrace" :     (1 << 20),
     "vfstrace" :     (1 << 21),
     "reada" :        (1 << 22),
+    "mmap" :         (1 << 23),
     }
 
 subsystem_names = {
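
Editor's note: the extent arithmetic in policy_from_vma() above converts a faulting user address inside a vma into the byte extent of the backing file that the DLM lock must cover. The standalone sketch below is not part of the commit; the mapping base, page offset, and faulting address are made-up values, and 4096-byte pages are assumed, but the two expressions mirror the ones in llite_mmap.c.

        /* Sketch of the policy_from_vma() extent calculation, with
         * hypothetical values.  A fault at 'addr' in a vma whose mapping
         * starts at file page 'vm_pgoff' locks the page-aligned byte
         * range [start, end] covering 'count' bytes of the file. */
        #include <stdio.h>

        #define PAGE_SIZE  4096UL
        #define PAGE_MASK  (~(PAGE_SIZE - 1))

        int main(void)
        {
                unsigned long vm_start = 0x40000000UL; /* mapping base (assumed) */
                unsigned long vm_pgoff = 8;            /* maps from file page 8 */
                unsigned long addr     = 0x40001234UL; /* faulting address */
                unsigned long count    = PAGE_SIZE;    /* one page, as in ll_nopage() */

                unsigned long long start = ((addr - vm_start) & PAGE_MASK) +
                                           ((unsigned long long)vm_pgoff * PAGE_SIZE);
                unsigned long long end   = (start + count - 1) | (PAGE_SIZE - 1);

                /* the fault lands on page 1 of the mapping = file page 9,
                 * so this prints "lock extent [36864, 40959]" */
                printf("lock extent [%llu, %llu]\n", start, end);
                return 0;
        }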
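Similarly, the teardown loop added to ll_pgcache_remove_extent() in llite/file.c walks only the file pages that belong to this OSC's stripe, jumping over the chunks owned by the other stripes ('count' pages per chunk, 'skip' pages of other stripes, matching the function's debug output). A hypothetical userspace rendering of that walk, assuming a 4-stripe layout with 16 pages per stripe chunk:

        /* Sketch of the stripe-chunk walk: visits only this stripe's page
         * ranges intersecting [start, end].  Layout values are assumed. */
        #include <stdio.h>

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                unsigned long count = 16;               /* pages per stripe chunk */
                unsigned long skip = (4 - 1) * count;   /* other stripes' pages */
                unsigned long start = 0, end = 100, i, j;

                for (i = start; i <= end; i += (j + skip)) {
                        j = min_ul(count - (i % count), end - i + 1);
                        /* prints [0, 15] and [64, 79]: this stripe's chunks */
                        printf("teardown pages [%lu, %lu]\n", i, i + j - 1);
                }
                return 0;
        }

The new mmap path is exercised end to end by sanityN.sh test_18 above, which runs ./mmap_sanity -d $MOUNT1 -m $MOUNT2 against two mounts of the same filesystem.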