From: niu
Date: Wed, 25 Aug 2004 06:47:16 +0000 (+0000)
Subject: - land b1_2_mmap onto b1_2 (20040825_1413)
X-Git-Tag: v1_8_0_110~486^5~169
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=9c787cce5b72c5886e787d8228a29b8e73bcec9b;p=fs%2Flustre-release.git

- land b1_2_mmap onto b1_2 (20040825_1413)
---

diff --git a/lnet/archdep.m4 b/lnet/archdep.m4
index 27704bd..94fa984 100644
--- a/lnet/archdep.m4
+++ b/lnet/archdep.m4
@@ -436,6 +436,16 @@ if test x$enable_modules != xno ; then
 	AC_MSG_RESULT([no])
 ])
 
+	# --------- zap_page_range(vma) --------------------------------
+	AC_MSG_CHECKING([if zap_page_range with vma parameter])
+	ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`"
+	if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then
+		AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter])
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+	fi
+
 	# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
 	# This needs to run after we've defined the KCPPFLAGS

diff --git a/lnet/include/linux/libcfs.h b/lnet/include/linux/libcfs.h
index acf4045..cad7a69 100644
--- a/lnet/include/linux/libcfs.h
+++ b/lnet/include/linux/libcfs.h
@@ -90,6 +90,7 @@ struct ptldebug_header {
 #define D_RPCTRACE    0x00100000 /* for distributed debugging */
 #define D_VFSTRACE    0x00200000
 #define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP        0x00800000
 
 #ifdef __KERNEL__
 # include                       /* THREAD_SIZE */

diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c
index e546aaf..dce196f 100644
--- a/lnet/utils/debug.c
+++ b/lnet/utils/debug.c
@@ -74,7 +74,7 @@ static const char *portal_debug_masks[] =
         {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
          "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
          "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace",
-         "reada", NULL};
+         "reada", "mmap", NULL};
 
 struct debug_daemon_cmd {
         char *cmd;

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 22dd2fb..f344fea 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -19,6 +19,7 @@
        - replace some LBUG about llog ops with error handling (3841)
        - don't match INVALID dentries from d_lookup and spin (3784)
        - hold dcache_lock while marking dentries INVALID and hashing (4255)
+       - basic mmap support (3918)
        * miscellania
        - add libwrap support for the TCP acceptor (3996)
        - add /proc/sys/portals/routes for non-root route listing (3994)

diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h
index ede6646..56e36e9 100644
--- a/lustre/include/linux/lustre_compat25.h
+++ b/lustre/include/linux/lustre_compat25.h
@@ -212,6 +212,17 @@ static inline void cond_resched(void)
 #define PageWriteback(page) 0
 #define end_page_writeback(page)
 
+static inline int mapping_mapped(struct address_space *mapping)
+{
+        return mapping->i_mmap_shared ?
+                1 : 0;
+}
+
+#ifdef ZAP_PAGE_RANGE_VMA
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma, addr, len)
+#else
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma->vm_mm, addr, len)
+#endif
+
 #endif /* end of 2.4 compat macros */
 
 #ifdef HAVE_PAGE_LIST

diff --git a/lustre/kernel_patches/patches/export-zap-page-range.patch b/lustre/kernel_patches/patches/export-zap-page-range.patch
new file mode 100644
index 0000000..9b9d48f
--- /dev/null
+++ b/lustre/kernel_patches/patches/export-zap-page-range.patch
@@ -0,0 +1,12 @@
+Index: linux-2.4.24-l36mmap/mm/memory.c
+===================================================================
+--- linux-2.4.24-l36mmap.orig/mm/memory.c	2004-05-27 17:44:13.000000000 -0700
++++ linux-2.4.24-l36mmap/mm/memory.c	2004-05-27 17:45:07.000000000 -0700
+@@ -411,6 +411,7 @@
+ 	mm->rss = 0;
+ 	spin_unlock(&mm->page_table_lock);
+ }
++EXPORT_SYMBOL_GPL(zap_page_range);
+ 
+ /*
+  * Do a quick page-table lookup for a single page.

diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20
index d11bec0..fa7a583 100644
--- a/lustre/kernel_patches/series/vanilla-2.4.20
+++ b/lustre/kernel_patches/series/vanilla-2.4.20
@@ -52,3 +52,4 @@ gfp_memalloc-2.4.22.patch
 procfs-ndynamic-2.4.patch
 linux-2.4.20-filemap.patch
 ext3-truncate-buffer-head.patch
+export-zap-page-range.patch

diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in
index 9492120..4daad42 100644
--- a/lustre/llite/Makefile.in
+++ b/lustre/llite/Makefile.in
@@ -1,5 +1,5 @@
 MODULES := llite
-llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o
+llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o
 
 ifeq ($(PATCHLEVEL),4)
 llite-objs += rw24.o super.o
@@ -7,4 +7,4 @@ else
 llite-objs += rw26.o super25.o
 endif
 
-@INCLUDE_RULES@
\ No newline at end of file
+@INCLUDE_RULES@

diff --git a/lustre/llite/Makefile.mk b/lustre/llite/Makefile.mk
index 06dd10e..dabbd9e 100644
--- a/lustre/llite/Makefile.mk
+++ b/lustre/llite/Makefile.mk
@@ -8,4 +8,4 @@ include $(src)/../portals/Kernelenv
 obj-y += llite.o
 llite-objs := llite_lib.o dcache.o super.o rw.o \
 	super25.o file.o dir.o symlink.o namei.o lproc_llite.o \
-	rw26.o llite_nfs.o llite_close.o special.o
+	rw26.o llite_nfs.o llite_close.o special.o llite_mmap.o

diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index bdac6d1..8a497ca 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -361,7 +361,7 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
         if (end < tmpex.l_extent.end >> PAGE_CACHE_SHIFT)
                 end = ~0;
 
-        i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+        i = inode->i_size ? (inode->i_size - 1) >> PAGE_CACHE_SHIFT : 0;
         if (i < end)
                 end = i;
@@ -369,6 +369,19 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
                "count: %lu skip: %lu end: %lu%s\n", start, start % count,
                count, skip, end, discard ? " (DISCARDING)" : "");
 
+        /* walk through the vmas on the inode and tear down mmaped pages that
+         * intersect with the lock.  this stops immediately if there are no
+         * mmap()ed regions of the file.  This is not efficient at all and
+         * should be short lived.
+         * We'll associate mmap()ed pages with the lock and will be able to
+         * find them directly */
+        for (i = start; i <= end; i += (j + skip)) {
+                j = min(count - (i % count), end - i + 1);
+                LASSERT(inode->i_mapping);
+                if (ll_teardown_mmaps(inode->i_mapping, i << PAGE_CACHE_SHIFT,
+                                      ((i+j) << PAGE_CACHE_SHIFT) - 1))
+                        break;
+        }
+
         /* this is the simplistic implementation of page eviction at
          * cancelation. It is careful to get races with other page
          * lockers handled correctly. fixes from bug 20 will make it
@@ -722,12 +735,11 @@ int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
                             loff_t *ppos)
 {
-        struct ll_file_data *fd = filp->private_data;
         struct inode *inode = filp->f_dentry->d_inode;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
-        struct lustre_handle lockh = { 0 };
-        ldlm_policy_data_t policy;
+        struct ll_lock_tree tree;
+        struct ll_lock_tree_node *node;
         int rc;
         ssize_t retval;
         __u64 kms;
@@ -746,10 +758,13 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         if (!lsm)
                 RETURN(0);
 
-        policy.l_extent.start = *ppos;
-        policy.l_extent.end = *ppos + count - 1;
+        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+                                  LCK_PR);
+
+        tree.lt_fd = filp->private_data;
 
-        rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, 0);
+        rc = ll_tree_lock(&tree, node, inode, buf, count,
+                          filp->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT : 0);
         if (rc != 0)
                 RETURN(rc);
@@ -776,7 +791,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         retval = generic_file_read(filp, buf, count, ppos);
 
  out:
-        ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+        ll_tree_unlock(&tree, inode);
         RETURN(retval);
 }
@@ -786,11 +801,10 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
                              loff_t *ppos)
 {
-        struct ll_file_data *fd = file->private_data;
         struct inode *inode = file->f_dentry->d_inode;
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-        struct lustre_handle lockh = { 0 };
-        ldlm_policy_data_t policy;
+        struct ll_lock_tree tree;
+        struct ll_lock_tree_node *node;
         loff_t maxbytes = ll_file_maxbytes(inode);
         ssize_t retval;
         int rc;
@@ -811,15 +825,18 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
 
         LASSERT(lsm);
 
-        if (file->f_flags & O_APPEND) {
-                policy.l_extent.start = 0;
-                policy.l_extent.end = OBD_OBJECT_EOF;
-        } else {
-                policy.l_extent.start = *ppos;
-                policy.l_extent.end = *ppos + count - 1;
-        }
+        if (file->f_flags & O_APPEND)
+                node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW);
+        else
+                node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+                                          LCK_PW);
+
+        if (IS_ERR(node))
+                RETURN(PTR_ERR(node));
+
+        tree.lt_fd = file->private_data;
 
-        rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, 0);
+        rc = ll_tree_lock(&tree, node, inode, buf, count,
+                          file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT : 0);
         if (rc != 0)
                 RETURN(rc);
@@ -844,7 +861,8 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
         retval = generic_file_write(file, buf, count, ppos);
 
  out:
-        ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
+        ll_tree_unlock(&tree, inode);
+        /* serialize with mmap/munmap/mremap */
         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
                             retval > 0 ?
                             retval : 0);
         RETURN(retval);
@@ -1370,7 +1388,7 @@ struct file_operations ll_file_operations = {
         .ioctl          = ll_file_ioctl,
         .open           = ll_file_open,
         .release        = ll_file_release,
-        .mmap           = generic_file_mmap,
+        .mmap           = ll_file_mmap,
         .llseek         = ll_file_seek,
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
         .sendfile       = generic_file_sendfile,

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index c4b3f87..8fcce14 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -165,6 +165,7 @@ void ll_prepare_mdc_op_data(struct mdc_op_data *,
 /* llite/rw.c */
 int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
 int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page);
 void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa);
 void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc);
 void ll_removepage(struct page *page);
@@ -268,6 +269,28 @@ void ll_queue_done_writing(struct inode *inode);
 void ll_close_thread_shutdown(struct ll_close_queue *lcq);
 int ll_close_thread_start(struct ll_close_queue **lcq_ret);
 
+/* llite/llite_mmap.c */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+typedef struct rb_root rb_root_t;
+typedef struct rb_node rb_node_t;
+#endif
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+        rb_root_t                       lt_root;
+        struct list_head                lt_locked_list;
+        struct ll_file_data             *lt_fd;
+};
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first,
+                      __u64 last);
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma);
+struct ll_lock_tree_node *ll_node_from_inode(struct inode *inode, __u64 start,
+                                             __u64 end, ldlm_mode_t mode);
+int ll_tree_lock(struct ll_lock_tree *tree,
+                 struct ll_lock_tree_node *first_node, struct inode *inode,
+                 const char *buf, size_t count, int ast_flags);
+int ll_tree_unlock(struct ll_lock_tree *tree, struct inode *inode);
+
 #define LL_SBI_NOLCK            0x1
 #define LL_SBI_READAHEAD        0x2

diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
new file mode 100644
index 0000000..9e34556
--- /dev/null
+++ b/lustre/llite/llite_mmap.c
@@ -0,0 +1,482 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include 
+#endif
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include 
+#include 
+#include "llite_internal.h"
+#include 
+
+struct ll_lock_tree_node {
+        rb_node_t               lt_node;
+        struct list_head        lt_locked_item;
+        __u64                   lt_oid;
+        ldlm_policy_data_t      lt_policy;
+        struct lustre_handle    lt_lockh;
+        ldlm_mode_t             lt_mode;
+};
+
+__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms);
+int lt_get_mmap_locks(struct ll_lock_tree *tree, struct inode *inode,
+                      unsigned long addr, size_t count);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int *type);
+#else
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int unused);
+#endif
+
+struct ll_lock_tree_node *ll_node_from_inode(struct inode *inode, __u64 start,
+                                             __u64 end, ldlm_mode_t mode)
+{
+        struct ll_lock_tree_node *node;
+
+        OBD_ALLOC(node, sizeof(*node));
+        if (node == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id;
+        node->lt_policy.l_extent.start = start;
+        node->lt_policy.l_extent.end = end;
+        memset(&node->lt_lockh, 0, sizeof(node->lt_lockh));
+        INIT_LIST_HEAD(&node->lt_locked_item);
+        node->lt_mode = mode;
+
+        return node;
+}
+
+int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two)
+{
+        if (one->lt_oid < two->lt_oid)
+                return -1;
+        if (one->lt_oid > two->lt_oid)
+                return 1;
+
+        if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start)
+                return -1;
+        if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end)
+                return 1;
+
+        return 0; /* they are the same object and overlap */
+}
+
+static void lt_merge(struct ll_lock_tree_node *dst,
+                     struct ll_lock_tree_node *src)
+{
+        dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start,
+                                            src->lt_policy.l_extent.start);
+        dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end,
+                                          src->lt_policy.l_extent.end);
+
+        /* XXX could be a real call to the dlm to find superset modes */
+        if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW)
+                dst->lt_mode = LCK_PW;
+}
+
+static void lt_insert(struct ll_lock_tree *tree,
+                      struct ll_lock_tree_node *node)
+{
+        struct ll_lock_tree_node *walk;
+        rb_node_t **p, *parent;
+        ENTRY;
+
+restart:
+        p = &tree->lt_root.rb_node;
+        parent = NULL;
+        while (*p) {
+                parent = *p;
+                walk = rb_entry(parent, struct ll_lock_tree_node, lt_node);
+                switch (lt_compare(node, walk)) {
+                case -1:
+                        p = &(*p)->rb_left;
+                        break;
+                case 1:
+                        p = &(*p)->rb_right;
+                        break;
+                case 0:
+                        lt_merge(node, walk);
+                        rb_erase(&walk->lt_node, &tree->lt_root);
+                        OBD_FREE(walk, sizeof(*walk));
+                        goto restart;
+                        break;
+                default:
+                        LBUG();
+                        break;
+                }
+        }
+        rb_link_node(&node->lt_node, parent, p);
+        rb_insert_color(&node->lt_node, &tree->lt_root);
+        EXIT;
+}
+
+static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree)
+{
+        rb_node_t *rbnode;
+        struct ll_lock_tree_node *node = NULL;
+
+        for (rbnode = tree->lt_root.rb_node; rbnode != NULL;
+             rbnode = rbnode->rb_left) {
+                if (rbnode->rb_left == NULL) {
+                        node = rb_entry(rbnode, struct ll_lock_tree_node,
+                                        lt_node);
+                        break;
+                }
+        }
+        RETURN(node);
+}
+
+int ll_tree_unlock(struct ll_lock_tree *tree, struct inode *inode)
+{
+        struct ll_lock_tree_node *node;
+        struct list_head *pos, *n;
+        int rc = 0;
+        ENTRY;
+
+        list_for_each_safe(pos, n, &tree->lt_locked_list) {
+                node = list_entry(pos, struct ll_lock_tree_node,
+                                  lt_locked_item);
+
+                rc = ll_extent_unlock(tree->lt_fd, inode,
+                                      ll_i2info(inode)->lli_smd, node->lt_mode,
+                                      &node->lt_lockh);
+                if (rc != 0) {
+                        /* XXX better message */
+                        CERROR("couldn't unlock %d\n", rc);
+                }
+                list_del(&node->lt_locked_item);
+                OBD_FREE(node, sizeof(*node));
+        }
+
+        while ((node = lt_least_node(tree))) {
+                rb_erase(&node->lt_node, &tree->lt_root);
+                OBD_FREE(node, sizeof(*node));
+        }
+
+        RETURN(rc);
+}
+
+int ll_tree_lock(struct ll_lock_tree *tree,
+                 struct ll_lock_tree_node *first_node, struct inode *inode,
+                 const char *buf, size_t count, int ast_flags)
+{
+        struct ll_lock_tree_node *node;
+        int rc = 0;
+        ENTRY;
+
+        tree->lt_root.rb_node = NULL;
+        INIT_LIST_HEAD(&tree->lt_locked_list);
+        if (first_node != NULL)
+                lt_insert(tree, first_node);
+
+        if (mapping_mapped(inode->i_mapping)) {
+                rc = lt_get_mmap_locks(tree, inode, (unsigned long)buf, count);
+                if (rc)
+                        GOTO(out, rc);
+        }
+
+        while ((node = lt_least_node(tree))) {
+                rc = ll_extent_lock(tree->lt_fd, inode,
+                                    ll_i2info(inode)->lli_smd, node->lt_mode,
+                                    &node->lt_policy, &node->lt_lockh,
+                                    ast_flags);
+                if (rc != 0)
+                        GOTO(out, rc);
+
+                rb_erase(&node->lt_node, &tree->lt_root);
+                list_add_tail(&node->lt_locked_item, &tree->lt_locked_list);
+        }
+        RETURN(rc);
+out:
+        ll_tree_unlock(tree, inode);
+        RETURN(rc);
+}
+
+static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma)
+{
+        /* we only want to hold PW locks if the mmap() can generate
+         * writes back to the file and that only happens in shared
+         * writable vmas */
+        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+                return LCK_PW;
+        return LCK_PR;
+}
+
+static void policy_from_vma(ldlm_policy_data_t *policy,
+                            struct vm_area_struct *vma, unsigned long addr,
+                            size_t count)
+{
+        policy->l_extent.start = ((addr - vma->vm_start) & PAGE_CACHE_MASK) +
+                                 (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+        policy->l_extent.end = (policy->l_extent.start + count - 1) |
+                               (PAGE_CACHE_SIZE - 1);
+}
+
+static struct vm_area_struct *our_vma(unsigned long addr, size_t count)
+{
+        struct mm_struct *mm = current->mm;
+        struct vm_area_struct *vma, *ret = NULL;
+        ENTRY;
+
+        spin_lock(&mm->page_table_lock);
+        for (vma = find_vma(mm, addr);
+             vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+                if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage) {
+                        ret = vma;
+                        break;
+                }
+        }
+        spin_unlock(&mm->page_table_lock);
+        RETURN(ret);
+}
+
+int lt_get_mmap_locks(struct ll_lock_tree *tree, struct inode *inode,
+                      unsigned long addr, size_t count)
+{
+        struct vm_area_struct *vma;
+        struct ll_lock_tree_node *node;
+        ldlm_policy_data_t policy;
+        ENTRY;
+
+        if (count == 0)
+                RETURN(0);
+
+        /* we need to look up vmas on page aligned addresses */
+        count += addr & (PAGE_SIZE - 1);
+        addr -= addr & (PAGE_SIZE - 1);
+
+        while ((vma = our_vma(addr, count)) != NULL) {
+
+                policy_from_vma(&policy, vma, addr, count);
+                node = ll_node_from_inode(inode, policy.l_extent.start,
+                                          policy.l_extent.end,
+                                          mode_from_vma(vma));
+                if (IS_ERR(node)) {
+                        CERROR("not enough mem for lock_tree_node!\n");
+                        RETURN(-ENOMEM);
+                }
+                lt_insert(tree, node);
+
+                if (vma->vm_end - addr >= count)
+                        break;
+                count -= vma->vm_end - addr;
+                addr = vma->vm_end;
+        }
+        RETURN(0);
+}
+
+/* FIXME: there is a pagefault race that goes as follows:
+ * 1. A user process on node A accesses a portion of a mapped file,
+ *    resulting in a page fault.
+ *    The pagefault handler invokes the ll_nopage function, which reads
+ *    the page into memory.
+ * 2. A user process on node B writes to the same portion of the file
+ *    (either via mmap or write()), causing node A to cancel the
+ *    lock and truncate the page.
+ * 3. Node A then executes the rest of do_no_page(), entering the
+ *    now-invalid page into the PTEs.
+ *
+ * Making the whole of do_no_page() a hook that installs both the page
+ * cache entry and the page mapping under the DLM lock would eliminate
+ * this race.
+ */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int *type)
+#else
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                       int unused)
+#endif
+{
+        struct file *filp = vma->vm_file;
+        struct ll_file_data *fd = filp->private_data;
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct lustre_handle lockh = { 0 };
+        ldlm_policy_data_t policy;
+        ldlm_mode_t mode;
+        struct page *page;
+        __u64 kms;
+        unsigned long pgoff, size, rand_read, seq_read;
+        int rc = 0;
+        ENTRY;
+
+        if (ll_i2info(inode)->lli_smd == NULL) {
+                CERROR("No lsm on fault?\n");
+                RETURN(NULL);
+        }
+
+        /* start and end the lock on the first and last bytes in the page */
+        policy_from_vma(&policy, vma, address, PAGE_CACHE_SIZE);
+
+        CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
+               vma, inode->i_ino, policy.l_extent.start,
+               policy.l_extent.end);
+
+        mode = mode_from_vma(vma);
+
+        rc = ll_extent_lock(fd, inode, ll_i2info(inode)->lli_smd, mode, &policy,
+                            &lockh, LDLM_FL_CBPENDING);
+        if (rc != 0)
+                RETURN(NULL);
+
+        /* XXX change inode size without i_sem hold! there is a race condition
+         *     with truncate path. (see ll_extent_lock) */
+        kms = lov_merge_size(ll_i2info(inode)->lli_smd, 1);
+        pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+        size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+        if (pgoff >= size)
+                ll_glimpse_size(inode);
+        else
+                inode->i_size = kms;
+
+        /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+         * the kernel will not read other pages not covered by ldlm in
+         * filemap_nopage. we do our readahead in ll_readpage.
+         */
+        rand_read = vma->vm_flags & VM_RAND_READ;
+        seq_read = vma->vm_flags & VM_SEQ_READ;
+        vma->vm_flags &= ~VM_SEQ_READ;
+        vma->vm_flags |= VM_RAND_READ;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+        page = filemap_nopage(vma, address, type);
+#else
+        page = filemap_nopage(vma, address, unused);
+#endif
+        vma->vm_flags &= ~VM_RAND_READ;
+        vma->vm_flags |= (rand_read | seq_read);
+
+        ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+        RETURN(page);
+}
+
+/* return the user space pointer that maps to a file offset via a vma */
+static inline unsigned long file_to_user(struct vm_area_struct *vma,
+                                         __u64 byte)
+{
+        return vma->vm_start +
+               (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT));
+}
+
+#define VMA_DEBUG(vma, fmt, arg...)                                          \
+        CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld): " fmt,       \
+               vma, vma->vm_start, vma->vm_end, vma->vm_pgoff, ## arg);
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+/* [first, last] are the byte offsets affected.
+ * vm_{start, end} are user addresses of the first byte of the mapping and
+ * the next byte beyond it
+ * vm_pgoff is the page index of the first byte in the mapping */
+static void teardown_vmas(struct vm_area_struct *vma, __u64 first,
+                          __u64 last)
+{
+        unsigned long address, len;
+        for (; vma; vma = vma->vm_next_share) {
+                if (last >> PAGE_CACHE_SHIFT < vma->vm_pgoff)
+                        continue;
+                if (first >> PAGE_CACHE_SHIFT > (vma->vm_pgoff +
+                    ((vma->vm_end - vma->vm_start) >> PAGE_CACHE_SHIFT)))
+                        continue;
+
+                address = max((unsigned long)vma->vm_start,
+                              file_to_user(vma, first));
+                len = min((unsigned long)vma->vm_end,
+                          file_to_user(vma, last) + 1) - address;
+
+                VMA_DEBUG(vma, "zapping vma [address=%ld len=%ld]\n",
+                          address, len);
+                LASSERT(vma->vm_mm);
+                ll_zap_page_range(vma, address, len);
+        }
+}
+#endif
+
+/* XXX put nice comment here. talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first,
+                      __u64 last)
+{
+        int rc = -ENOENT;
+        ENTRY;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+        if (mapping_mapped(mapping)) {
+                rc = 0;
+                unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
+                                    last - first + 1, 1);
+        }
+#else
+        spin_lock(&mapping->i_shared_lock);
+        if (mapping->i_mmap != NULL) {
+                rc = 0;
+                teardown_vmas(mapping->i_mmap, first, last);
+        }
+        if (mapping->i_mmap_shared != NULL) {
+                rc = 0;
+                teardown_vmas(mapping->i_mmap_shared, first, last);
+        }
+        spin_unlock(&mapping->i_shared_lock);
+#endif
+        RETURN(rc);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+        .nopage = ll_nopage,
+};
+
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        int rc;
+        ENTRY;
+
+        rc = generic_file_mmap(file, vma);
+        if (rc == 0)
+                vma->vm_ops = &ll_file_vm_ops;
+
+        RETURN(rc);
+}

diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 4e09d2f..83252cc 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -390,6 +390,57 @@ struct ll_async_page *llap_from_page(struct page *page)
         RETURN(llap);
 }
 
+static int queue_or_sync_write(struct obd_export *exp,
+                               struct lov_stripe_md *lsm,
+                               struct ll_async_page *llap,
+                               unsigned to,
+                               obd_flag async_flags)
+{
+        struct obd_io_group *oig;
+        int rc;
+        ENTRY;
+
+        /* _make_ready only sees llap once we've unlocked the page */
+        llap->llap_write_queued = 1;
+        rc = obd_queue_async_io(exp, lsm, NULL, llap->llap_cookie,
+                                OBD_BRW_WRITE, 0, 0, 0, async_flags);
+        if (rc == 0) {
+                LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n");
+                //llap_write_pending(inode, llap);
+                GOTO(out, 0);
+        }
+
+        llap->llap_write_queued = 0;
+
+        rc = oig_init(&oig);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = obd_queue_group_io(exp, lsm, NULL, oig, llap->llap_cookie,
+                                OBD_BRW_WRITE, 0, to, 0, ASYNC_READY |
+                                ASYNC_URGENT | ASYNC_COUNT_STABLE |
+                                ASYNC_GROUP_SYNC);
+        if (rc)
+                GOTO(free_oig, rc);
+
+        rc = obd_trigger_group_io(exp, lsm, NULL, oig);
+        if (rc)
+                GOTO(free_oig, rc);
+
+        rc = oig_wait(oig);
+
+        if (!rc && async_flags & ASYNC_READY)
+                unlock_page(llap->llap_page);
+
+        LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n",
+                       rc);
+
+free_oig:
+        oig_release(oig);
+out:
+        RETURN(rc);
+}
+
 void lov_increase_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
                       obd_off size);
 /* update our write count to account for i_size increases that may have
@@ -429,39 +480,11 @@ int ll_commit_write(struct file *file, struct page *page, unsigned from,
                 exp = ll_i2obdexp(inode);
                 if (exp == NULL)
                         RETURN(-EINVAL);
-
-                /* _make_ready only sees llap once we've unlocked
-                 * the page */
-                llap->llap_write_queued = 1;
-                rc = obd_queue_async_io(exp, lsm, NULL, llap->llap_cookie,
-                                        OBD_BRW_WRITE, 0, 0, 0, 0);
-                if (rc != 0) { /* async failed, try sync.. */
-                        struct obd_io_group *oig;
-                        rc = oig_init(&oig);
-                        if (rc)
-                                GOTO(out, rc);
-
-                        llap->llap_write_queued = 0;
-                        rc = obd_queue_group_io(exp, lsm, NULL, oig,
-                                                llap->llap_cookie,
-                                                OBD_BRW_WRITE, 0, to, 0,
-                                                ASYNC_READY | ASYNC_URGENT |
-                                                ASYNC_COUNT_STABLE |
-                                                ASYNC_GROUP_SYNC);
-
-                        if (rc)
-                                GOTO(free_oig, rc);
-
-                        rc = obd_trigger_group_io(exp, lsm, NULL, oig);
-                        if (rc)
-                                GOTO(free_oig, rc);
-
-                        rc = oig_wait(oig);
-free_oig:
-                        oig_release(oig);
+
+                rc = queue_or_sync_write(exp, ll_i2info(inode)->lli_smd, llap,
+                                         to, 0);
+                if (rc)
                         GOTO(out, rc);
-                }
-                LL_CDEBUG_PAGE(D_PAGE, page, "write queued\n");
-                //llap_write_pending(inode, llap);
         } else {
                 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                                      LPROC_LL_DIRTY_HITS);
@@ -506,6 +529,44 @@ static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
         spin_unlock(&sbi->ll_lock);
 }
 
+int ll_writepage(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct obd_export *exp;
+        struct ll_async_page *llap;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(!PageDirty(page));
+        LASSERT(PageLocked(page));
+
+        exp = ll_i2obdexp(inode);
+        if (exp == NULL)
+                GOTO(out, rc = -EINVAL);
+
+        llap = llap_from_page(page);
+        if (IS_ERR(llap))
+                GOTO(out, rc = PTR_ERR(llap));
+
+        page_cache_get(page);
+        if (llap->llap_write_queued) {
+                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
+                rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
+                                         llap->llap_cookie,
+                                         ASYNC_READY | ASYNC_URGENT);
+        } else {
+                rc = queue_or_sync_write(exp, ll_i2info(inode)->lli_smd, llap,
+                                         PAGE_SIZE, ASYNC_READY |
+                                         ASYNC_URGENT);
+        }
+        if (rc)
+                page_cache_release(page);
+out:
+        if (rc)
+                unlock_page(page);
+        RETURN(rc);
+}
+
 /* called for each page in a completed rpc.*/
 void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
 {
@@ -957,17 +1018,10 @@ int ll_readpage(struct file *filp, struct page *page)
         }
 
         if (rc == 0) {
-                static unsigned long next_print;
-                CDEBUG(D_INODE, "ino %lu page %lu (%llu) didn't match a lock\n",
-                       inode->i_ino, page->index,
-                       (long long)page->index << PAGE_CACHE_SHIFT);
-                if (0 && time_after(jiffies, next_print)) {
-                        CWARN("ino %lu page %lu (%llu) not covered by "
-                              "a lock (mmap?). check debug logs.\n",
-                              inode->i_ino, page->index,
-                              (long long)page->index << PAGE_CACHE_SHIFT);
-                        next_print = jiffies + 30 * HZ;
-                }
+                CWARN("ino %lu page %lu (%llu) not covered by "
+                      "a lock (mmap?). "
+                      "check debug logs.\n",
+                      inode->i_ino, page->index,
+                      (long long)page->index << PAGE_CACHE_SHIFT);
         }
 
         rc = ll_issue_page_read(exp, llap, oig, 0);

diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c
index 8a3099f..fc3cab1 100644
--- a/lustre/llite/rw24.c
+++ b/lustre/llite/rw24.c
@@ -49,49 +49,6 @@
 #include "llite_internal.h"
 #include 
 
-static int ll_writepage_24(struct page *page)
-{
-        struct inode *inode = page->mapping->host;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        int rc = 0;
-        ENTRY;
-
-        LASSERT(!PageDirty(page));
-        LASSERT(PageLocked(page));
-
-        exp = ll_i2obdexp(inode);
-        if (exp == NULL)
-                GOTO(out, rc = -EINVAL);
-
-        llap = llap_from_page(page);
-        if (IS_ERR(llap))
-                GOTO(out, rc = PTR_ERR(llap));
-
-        page_cache_get(page);
-        if (llap->llap_write_queued) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
-                rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
-                                         llap->llap_cookie,
-                                         ASYNC_READY | ASYNC_URGENT);
-        } else {
-                llap->llap_write_queued = 1;
-                rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
-                                        llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
-                                        0, ASYNC_READY | ASYNC_URGENT);
-                if (rc == 0)
-                        LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
-                else
-                        llap->llap_write_queued = 0;
-        }
-        if (rc)
-                page_cache_release(page);
-out:
-        if (rc)
-                unlock_page(page);
-        RETURN(rc);
-}
-
 static int ll_direct_IO_24(int rw,
 #ifdef HAVE_DIO_FILE
                            struct file *file,
@@ -179,7 +136,7 @@ static int ll_direct_IO_24(int rw,
 struct address_space_operations ll_aops = {
         .readpage       = ll_readpage,
         .direct_IO      = ll_direct_IO_24,
-        .writepage      = ll_writepage_24,
+        .writepage      = ll_writepage,
         .prepare_write  = ll_prepare_write,
         .commit_write   = ll_commit_write,
         .removepage     = ll_removepage,

diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index 71964de..53bde80 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -51,51 +51,6 @@
 #include "llite_internal.h"
 #include 
 
-static int ll_writepage_26(struct page *page, struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        int rc;
-        ENTRY;
-
-        LASSERT(!PageDirty(page));
-        LASSERT(PageLocked(page));
-
-        exp = ll_i2obdexp(inode);
-        if (exp == NULL)
-                GOTO(out, rc = -EINVAL);
-
-        llap = llap_from_page(page);
-        if (IS_ERR(llap))
-                GOTO(out, rc = PTR_ERR(llap));
-
-        page_cache_get(page);
-        if (llap->llap_write_queued) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
-                rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
-                                         llap->llap_cookie,
-                                         ASYNC_READY | ASYNC_URGENT);
-        } else {
-                llap->llap_write_queued = 1;
-                rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
-                                        llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
-                                        0, ASYNC_READY | ASYNC_URGENT);
-                if (rc == 0)
-                        LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
-                else
-                        llap->llap_write_queued = 0;
-        }
-        if (rc)
-                page_cache_release(page);
-out:
-        if (rc)
-                unlock_page(page);
-        else
-                set_page_writeback(page);
-        RETURN(rc);
-}
-
 /* It is safe to not check anything in invalidatepage/releasepage below
    because they are run with page locked and all our io is happening with
    locked page too */
@@ -117,7 +72,7 @@ struct address_space_operations ll_aops = {
         .readpage       = ll_readpage,
 //        .readpages      = ll_readpages,
 //        .direct_IO      = ll_direct_IO_26,
-        .writepage      = ll_writepage_26,
+        .writepage      = ll_writepage,
         .writepages     = generic_writepages,
         .set_page_dirty = __set_page_dirty_nobuffers,
         .sync_page      = NULL,

diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4
index 27704bd..94fa984 100644
--- a/lustre/portals/archdep.m4
+++ b/lustre/portals/archdep.m4
@@ -436,6 +436,16 @@ if test x$enable_modules != xno ; then
 	AC_MSG_RESULT([no])
 ])
 
+	# --------- zap_page_range(vma) --------------------------------
+	AC_MSG_CHECKING([if zap_page_range with vma parameter])
+	ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`"
+	if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then
+		AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter])
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+	fi
+
 	# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
 	# This needs to run after we've defined the KCPPFLAGS

diff --git a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h
index acf4045..cad7a69 100644
--- a/lustre/portals/include/linux/libcfs.h
+++ b/lustre/portals/include/linux/libcfs.h
@@ -90,6 +90,7 @@ struct ptldebug_header {
 #define D_RPCTRACE    0x00100000 /* for distributed debugging */
 #define D_VFSTRACE    0x00200000
 #define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP        0x00800000
 
 #ifdef __KERNEL__
 # include                       /* THREAD_SIZE */

diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c
index e546aaf..dce196f 100644
--- a/lustre/portals/utils/debug.c
+++ b/lustre/portals/utils/debug.c
@@ -74,7 +74,7 @@ static const char *portal_debug_masks[] =
         {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
          "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
          "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace",
-         "reada", NULL};
+         "reada", "mmap", NULL};
 
 struct debug_daemon_cmd {
         char *cmd;

diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore
index bc148be..3eb90ab 100644
--- a/lustre/tests/.cvsignore
+++ b/lustre/tests/.cvsignore
@@ -65,3 +65,5 @@ ll_dirstripe_verify
 openfilleddirunlink
 copy_attr
 rename_many
+memhog
+mmap_sanity

diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am
index bb3368d..166755f 100644
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -24,10 +24,12 @@ noinst_PROGRAMS += wantedi statone runas openfile getdents mkdirdeep o_directory
 noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod
 noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany rmdirmany
 noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2
+noinst_PROGRAMS += mmap_sanity
 # noinst_PROGRAMS += ldaptest copy_attr
 bin_PROGRAMS = mcreate munlink
 endif # TESTS
 
+mmap_sanity_SOURCES= mmap_sanity.c
 stat_SOURCES = stat.c stat_fs.h
 mkdirdeep_LDADD=-L$(top_builddir)/portals/utils -lptlctl $(LIBREADLINE)
 #write_append_truncate_CC=mpicc

diff --git a/lustre/tests/mmap_sanity.c b/lustre/tests/mmap_sanity.c
new file mode 100644
index 0000000..3fd0b0e
--- /dev/null
+++ b/lustre/tests/mmap_sanity.c
@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+char *dir = NULL, *node = NULL, *dir2 = NULL;
+long page_size;
+char mmap_sanity[256];
+
+
+static void usage(void)
+{
+        printf("Usage: mmap_sanity -d dir [-n node | -m dir2]\n");
+        printf("       dir      lustre mount point\n");
+        printf("       node     another client\n");
+        printf("       dir2     another mount point\n");
+        exit(127);
+}
+
+#define MMAP_NOTIFY_PORT        7676
+static int mmap_notify(char *target, char *str, int delay)
+{
+        unsigned short port =
+                MMAP_NOTIFY_PORT;
+        int socket_type = SOCK_DGRAM;
+        struct sockaddr_in server;
+        struct hostent *hp;
+        int len, sockfd, rc = 0;
+
+        if (target == NULL)
+                return 0;
+
+        sockfd = socket(AF_INET, socket_type, 0);
+        if (sockfd < 0) {
+                perror("socket()");
+                return errno;
+        }
+
+        if ((hp = gethostbyname(target)) == NULL) {
+                perror(target);
+                rc = errno;
+                goto out_close;
+        }
+
+        memset(&server, 0, sizeof(server));
+        memcpy(&(server.sin_addr), hp->h_addr, hp->h_length);
+        server.sin_family = AF_INET;
+        server.sin_port = htons(port);
+
+        len = sizeof(server);
+        if (delay)
+                sleep(delay);
+
+        rc = sendto(sockfd, str, strlen(str), 0,
+                    (struct sockaddr *)&server, len);
+        if (rc < 0) {
+                perror("sendto()");
+                rc = errno;
+        } else
+                rc = 0;
+
+out_close:
+        close(sockfd);
+        return rc;
+}
+
+static int mmap_wait(char *str, int timeout)
+{
+        unsigned short port = MMAP_NOTIFY_PORT;
+        int socket_type = SOCK_DGRAM;
+        struct sockaddr_in local, from;
+        char host[256];
+        struct hostent *hp;
+        fd_set rfds;
+        struct timeval tv;
+        int sockfd, rc = 0;
+
+        if (dir2 != NULL)
+                return 0;
+
+        memset(host, 0, sizeof(host));
+        if (gethostname(host, sizeof(host))) {
+                perror("gethostname()");
+                return errno;
+        }
+
+        if ((hp = gethostbyname(host)) == NULL) {
+                perror(host);
+                return errno;
+        }
+
+        local.sin_family = AF_INET;
+        memcpy(&(local.sin_addr), hp->h_addr, hp->h_length);
+        local.sin_port = htons(port);
+
+        sockfd = socket(AF_INET, socket_type, 0);
+        if (sockfd < 0) {
+                perror("socket()");
+                return errno;
+        }
+
+        rc = bind(sockfd, (struct sockaddr *)&local, sizeof(local));
+        if (rc < 0) {
+                perror("bind()");
+                rc = errno;
+                goto out_close;
+        }
+
+        FD_ZERO(&rfds);
+        FD_SET(sockfd, &rfds);
+        tv.tv_sec = timeout ? timeout : 5;
+        tv.tv_usec = 0;
+
+        rc = select(sockfd + 1, &rfds, NULL, NULL, &tv);
+        if (rc) { /* got data */
+                char buffer[1024];
+                int fromlen = sizeof(from);
+
+                memset(buffer, 0, sizeof(buffer));
+                rc = recvfrom(sockfd, buffer, sizeof(buffer), 0,
+                              (struct sockaddr *)&from, &fromlen);
+                if (rc <= 0) {
+                        perror("recvfrom()");
+                        rc = errno;
+                        goto out_close;
+                }
+                rc = 0;
+
+                if (strncmp(str, buffer, strlen(str)) != 0) {
+                        fprintf(stderr, "expected string mismatch!\n");
+                        rc = EINVAL;
+                }
+        } else { /* timeout */
+                fprintf(stderr, "timeout!\n");
+                rc = ETIME;
+        }
+
+out_close:
+        close(sockfd);
+        return rc;
+}
+
+static int remote_tst(int tc, char *mnt);
+static int mmap_run(char *host, int tc)
+{
+        pid_t child;
+        char nodearg[256], command[256];
+        int rc = 0;
+
+        child = fork();
+        if (child < 0)
+                return errno;
+        else if (child)
+                return 0;
+
+        if (dir2 != NULL) {
+                rc = remote_tst(tc, dir2);
+        } else {
+                sprintf(nodearg, "-w %s", node);
+                sprintf(command, "%s -d %s -n %s -c %d",
+                        mmap_sanity, dir, host, tc);
+                rc = execlp("pdsh", "pdsh", "-S", nodearg, command, NULL);
+                if (rc)
+                        perror("execlp()");
+        }
+        _exit(rc);
+}
+
+static int mmap_initialize(char *myself, int tc)
+{
+        char buf[1024], *file;
+        int fdr, fdw, count, rc = 0;
+
+        page_size = sysconf(_SC_PAGESIZE);
+        if (page_size == -1) {
+                perror("sysconf(_SC_PAGESIZE)");
+                return errno;
+        }
+        if (tc)
+                return 0;
+
+        /* copy myself to lustre for another client */
+        fdr = open(myself, O_RDONLY);
+        if (fdr < 0) {
+                perror(myself);
+                return EINVAL;
+        }
+        file = strrchr(myself, '/');
+        if (file == NULL) {
+                fprintf(stderr, "can't get test filename\n");
+                close(fdr);
+                return EINVAL;
+        }
+        file++;
+        sprintf(mmap_sanity, "%s/%s", dir, file);
+
+        fdw = open(mmap_sanity, O_CREAT|O_WRONLY, 0777);
+        if (fdw < 0) {
+                perror(mmap_sanity);
+                close(fdr);
+                return
+                        EINVAL;
+        }
+        while ((count = read(fdr, buf, sizeof(buf))) != 0) {
+                int writes;
+
+                if (count < 0) {
+                        perror("read()");
+                        rc = errno;
+                        break;
+                }
+                writes = write(fdw, buf, count);
+                if (writes != count) {
+                        perror("write()");
+                        rc = errno;
+                        break;
+                }
+        }
+        close(fdr);
+        close(fdw);
+        return rc;
+}
+
+static void mmap_finalize(int tc)
+{
+        if (tc)
+                return;
+        unlink(mmap_sanity);
+}
+
+/* basic mmap operation on single node */
+static int mmap_tst1(char *mnt)
+{
+        char *ptr, mmap_file[256];
+        int region, fd, rc = 0;
+
+        region = page_size * 10;
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file1");
+
+        if (unlink(mmap_file) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        ftruncate(fd, region);
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+        memset(ptr, 'a', region);
+
+        munmap(ptr, region);
+out_close:
+        close(fd);
+        unlink(mmap_file);
+        return rc;
+}
+
+/* MAP_PRIVATE creates a copy-on-write mmap */
+static int mmap_tst2(char *mnt)
+{
+        char *ptr, mmap_file[256], buf[256];
+        int fd, rc = 0;
+
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file2");
+
+        if (unlink(mmap_file) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        ftruncate(fd, page_size);
+
+        ptr = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+        memcpy(ptr, "blah", strlen("blah"));
+
+        munmap(ptr, page_size);
+out_close:
+        close(fd);
+        if (rc)
+                return rc;
+
+        fd = open(mmap_file, O_RDONLY);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        rc = read(fd, buf, sizeof(buf));
+        if (rc < 0) {
+                perror("read()");
+                rc = errno;
+                goto out_close;
+        }
+        rc = 0;
+
+        if (strncmp("blah", buf, strlen("blah")) == 0) {
+                fprintf(stderr, "mmap write back with MAP_PRIVATE!\n");
+                rc = EFAULT;
+        }
+        close(fd);
+        unlink(mmap_file);
+        return rc;
+}
+
+/* concurrent mmap operations on two nodes */
+static int mmap_tst3(char *mnt)
+{
+        char *ptr, mmap_file[256], host[256];
+        int region, fd, rc = 0;
+
+        region = page_size * 100;
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file3");
+
+        if (unlink(mmap_file) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+        ftruncate(fd, region);
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+
+        if (gethostname(host, sizeof(host))) {
+                perror("gethostname()");
+                rc = errno;
+                goto out_unmap;
+        }
+
+        rc = mmap_run(host, 3);
+        if (rc)
+                goto out_unmap;
+
+        rc = mmap_wait("mmap done", 10);
+        memset(ptr, 'a', region);
+
+        sleep(2);       /* wait for remote test finish */
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        close(fd);
+        unlink(mmap_file);
+        return rc;
+}
+
+static int remote_tst3(char *mnt)
+{
+        char *ptr, mmap_file[256];
+        int region, fd, rc = 0;
+
+        region = page_size * 100;
+        sprintf(mmap_file, "%s/%s", mnt, "mmap_file3");
+
+        fd = open(mmap_file, O_RDWR, 0600);
+        if (fd < 0) {
+                perror(mmap_file);
+                return errno;
+        }
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+        memset(ptr, 'b', region);
+
+        rc = mmap_notify(node, "mmap done", 1);
+        if (rc)
+                goto out_unmap;
+
+        memset(ptr, 'c', region);
+
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        close(fd);
+        return rc;
+}
+
+/* client1 writes to file_4a from mmap()ed file_4b;
+ * client2 writes to file_4b from mmap()ed file_4a. */
+static int mmap_tst4(char *mnt)
+{
+        char *ptr, filea[256], fileb[256], host[256];
+        int region, fdr, fdw, rc = 0;
+
+        region = page_size * 100;
+        sprintf(filea, "%s/%s", mnt, "mmap_file_4a");
+        sprintf(fileb, "%s/%s", mnt, "mmap_file_4b");
+
+        if (unlink(filea) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+        if (unlink(fileb) && errno != ENOENT) {
+                perror("unlink()");
+                return errno;
+        }
+
+        fdr = fdw = -1;
+        fdr = open(fileb, O_CREAT|O_RDWR, 0600);
+        if (fdr < 0) {
+                perror(fileb);
+                return errno;
+        }
+        ftruncate(fdr, region);
+        fdw = open(filea, O_CREAT|O_RDWR, 0600);
+        if (fdw < 0) {
+                perror(filea);
+                rc = errno;
+                goto out_close;
+        }
+        ftruncate(fdw, region);
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fdr, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+
+        if (gethostname(host, sizeof(host))) {
+                perror("gethostname()");
+                rc = errno;
+                goto out_unmap;
+        }
+
+        rc = mmap_run(host, 4);
+        if (rc)
+                goto out_unmap;
+
+        rc = mmap_wait("mmap done", 10);
+        if (rc)
+                goto out_unmap;
+
+        memset(ptr, '1', region);
+
+        rc = write(fdw, ptr, region);
+        if (rc <= 0) {
+                perror("write()");
+                rc = errno;
+        } else
+                rc = 0;
+
+        sleep(2);       /* wait for remote test finish */
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        if (fdr >= 0)
+                close(fdr);
+        if (fdw >= 0)
+                close(fdw);
+        unlink(filea);
+        unlink(fileb);
+        return rc;
+}
+
+static int remote_tst4(char *mnt)
+{
+        char *ptr, filea[256], fileb[256];
+        int region, fdr, fdw, rc = 0;
+
+        region = page_size * 100;
+        sprintf(filea, "%s/%s", mnt, "mmap_file_4a");
+        sprintf(fileb, "%s/%s", mnt, "mmap_file_4b");
+
+        fdr = fdw = -1;
+        fdr = open(filea, O_RDWR, 0600);
+        if (fdr < 0) {
+                perror(filea);
+                return errno;
+        }
+        fdw = open(fileb, O_RDWR, 0600);
+        if (fdw < 0) {
+                perror(fileb);
+                rc = errno;
+                goto out_close;
+        }
+
+        ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fdr, 0);
+        if (ptr == MAP_FAILED) {
+                perror("mmap()");
+                rc = errno;
+                goto out_close;
+        }
+
+        rc = mmap_notify(node, "mmap done", 1);
+        if (rc)
+                goto out_unmap;
+
+        memset(ptr, '2', region);
+
+        rc = write(fdw, ptr, region);
+        if (rc <= 0) {
+                perror("write()");
+                rc = errno;
+        } else
+                rc = 0;
+
+out_unmap:
+        munmap(ptr, region);
+out_close:
+        if (fdr >= 0)
+                close(fdr);
+        if (fdw >= 0)
+                close(fdw);
+        return rc;
+}
+
+static int remote_tst(int tc, char *mnt)
+{
+        int rc = 0;
+
+        switch (tc) {
+        case 3:
+                rc = remote_tst3(mnt);
+                break;
+        case 4:
+                rc = remote_tst4(mnt);
+                break;
+        case 1:
+        case 2:
+        default:
+                fprintf(stderr, "wrong test case number %d\n", tc);
+                rc = EINVAL;
+                break;
+        }
+        return rc;
+}
+
+struct test_case {
+        int     tc;                     /* test case number */
+        char    *desc;                  /* test description */
+        int     (*test_fn)(char *mnt);  /* test function */
+        int     node_cnt;               /* node count */
+};
+
+struct test_case tests[] = {
+        { 1, "mmap test1: basic mmap operation", mmap_tst1, 1 },
+        { 2, "mmap test2: MAP_PRIVATE not write back", mmap_tst2, 1 },
+        { 3, "mmap test3: concurrent mmap ops on two nodes", mmap_tst3, 2 },
+        { 4, "mmap test4: c1 writes to f1 from mmapped f2, "
+             "c2 writes to f2 from mmapped f1", mmap_tst4, 2 },
+        { 0, NULL, 0, 0 }
+};
+
+int main(int argc, char **argv)
+{
+        extern char *optarg;
+        struct test_case *test;
+        int c, rc = 0, tc = 0;
+
+        for (;;) {
+                c = getopt(argc, argv, "d:n:c:m:");
+                if (c == -1)
+                        break;
+
+                switch (c) {
+                case 'd':
+                        dir = optarg;
+                        break;
+                case 'n':
+                        node = optarg;
+                        break;
+                case 'c':
+                        tc = atoi(optarg);
+                        break;
+                case 'm':
+                        dir2 = optarg;
+                        break;
+                default:
+                case '?':
+                        usage();
+                        break;
+                }
+        }
+
+        if (dir == NULL)
+                usage();
+        if (dir2 != NULL && node != NULL)
+                usage();
+
+        if (mmap_initialize(argv[0], tc) != 0) {
+                fprintf(stderr, "mmap_initialize failed!\n");
+                return EINVAL;
+        }
+
+        if (tc) {
+                rc = remote_tst(tc, dir);
+                goto out;
+        }
+
+        for (test = tests; test->tc; test++) {
+                char *rs = "skip";
+                rc = 0;
+                if (test->node_cnt == 1 || node != NULL || dir2 != NULL) {
+                        rc = test->test_fn(dir);
+                        rs = rc ? "fail" : "pass";
+                }
+                fprintf(stderr, "%s (%s)\n", test->desc, rs);
+                if (rc)
+                        break;
+        }
+out:
+        mmap_finalize(tc);
+        return rc;
+}

diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh
index c3e0a80..cc578b4 100644
--- a/lustre/tests/sanityN.sh
+++ b/lustre/tests/sanityN.sh
@@ -4,7 +4,7 @@ set -e
 
 ONLY=${ONLY:-"$*"}
 # bug number for skipped test:  1768 3192
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b 14c"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
@@ -334,7 +334,7 @@ test_15() {	# bug 974 - ENOSPC
 run_test 15 "test out-of-space with multiple writers ==========="
 
 test_16() {
-	fsx -R -W -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
+	fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
 }
 run_test 16 "2500 iterations of dual-mount fsx ================="
 
@@ -359,6 +359,11 @@ test_17() { # bug 3513, 3667
 }
 run_test 17 "resource creation/LVB creation race ==============="
 
+test_18() {
+	./mmap_sanity -d $MOUNT1 -m $MOUNT2
+}
+run_test 18 "mmap sanity check ================================="
+
 log "cleanup: ======================================================"
 rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true

diff --git a/lustre/utils/lconf b/lustre/utils/lconf
index 049491f7..3f00da0 100755
--- a/lustre/utils/lconf
+++ b/lustre/utils/lconf
@@ -88,6 +88,7 @@ ptldebug_names = {
     "rpctrace" :     (1 << 20),
     "vfstrace" :     (1 << 21),
     "reada" :        (1 << 22),
+    "mmap" :         (1 << 23),
     }
 
 subsystem_names = {
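
Editor's note: the extent arithmetic in policy_from_vma() above converts a faulting user address inside a vma into the byte extent of the backing file that the DLM lock must cover. The standalone sketch below is not part of the commit; the mapping base, page offset, and faulting address are made-up values, and 4096-byte pages are assumed, but the two expressions mirror the ones in llite_mmap.c.

        /* Sketch of the policy_from_vma() extent calculation, with
         * hypothetical values.  A fault at 'addr' in a vma whose mapping
         * starts at file page 'vm_pgoff' locks the page-aligned byte
         * range [start, end] covering 'count' bytes of the file. */
        #include <stdio.h>

        #define PAGE_SIZE  4096UL
        #define PAGE_MASK  (~(PAGE_SIZE - 1))

        int main(void)
        {
                unsigned long vm_start = 0x40000000UL; /* mapping base (assumed) */
                unsigned long vm_pgoff = 8;            /* maps from file page 8 */
                unsigned long addr     = 0x40001234UL; /* faulting address */
                unsigned long count    = PAGE_SIZE;    /* one page, as in ll_nopage() */

                unsigned long long start = ((addr - vm_start) & PAGE_MASK) +
                                           ((unsigned long long)vm_pgoff * PAGE_SIZE);
                unsigned long long end   = (start + count - 1) | (PAGE_SIZE - 1);

                /* the fault lands on page 1 of the mapping = file page 9,
                 * so this prints "lock extent [36864, 40959]" */
                printf("lock extent [%llu, %llu]\n", start, end);
                return 0;
        }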
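Similarly, the teardown loop added to ll_pgcache_remove_extent() in llite/file.c walks only the file pages that belong to this OSC's stripe, jumping over the chunks owned by the other stripes ('count' pages per chunk, 'skip' pages of other stripes, matching the function's debug output). A hypothetical userspace rendering of that walk, assuming a 4-stripe layout with 16 pages per stripe chunk:

        /* Sketch of the stripe-chunk walk: visits only this stripe's page
         * ranges intersecting [start, end].  Layout values are assumed. */
        #include <stdio.h>

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                unsigned long count = 16;               /* pages per stripe chunk */
                unsigned long skip = (4 - 1) * count;   /* other stripes' pages */
                unsigned long start = 0, end = 100, i, j;

                for (i = start; i <= end; i += (j + skip)) {
                        j = min_ul(count - (i % count), end - i + 1);
                        /* prints [0, 15] and [64, 79]: this stripe's chunks */
                        printf("teardown pages [%lu, %lu]\n", i, i + j - 1);
                }
                return 0;
        }

The new mmap path is exercised end to end by sanityN.sh test_18 above, which runs ./mmap_sanity -d $MOUNT1 -m $MOUNT2 against two mounts of the same filesystem.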