From e8ffe16619baf1ef7c5c6b117d338956372aa752 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Wed, 19 Oct 2011 16:34:26 -0700 Subject: [PATCH] LU-884 clio: client in memory checksum Use page_mkwrite() method from latest kernels to correctly implement RPC checksum functionality. Also OBD_FL_MMAP is removed because it won't be used any more. Change-Id: I6ec5aae14f56c95b1ac6936d21b5a273582fa4e8 Signed-off-by: Jinshan Xiong Reviewed-on: http://review.whamcloud.com/1609 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/cl_object.h | 8 +- lustre/include/lustre/lustre_idl.h | 4 +- lustre/lclient/lcommon_cl.c | 11 -- lustre/llite/llite_internal.h | 13 +++ lustre/llite/llite_mmap.c | 206 +++++++++++++++++++++++++++++++++---- lustre/llite/vvp_io.c | 131 +++++++++++++---------- lustre/obdclass/cl_io.c | 8 +- lustre/osc/osc_request.c | 21 +--- lustre/tests/mmap_sanity.c | 68 +++++++++++- 9 files changed, 361 insertions(+), 109 deletions(-) diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 86966dc..3d44b1c 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -2222,7 +2222,9 @@ enum cl_io_lock_dmd { /** Layers are free to decide between local and global locking. */ CILR_MAYBE, /** Never lock: there is no cache (e.g., liblustre). */ - CILR_NEVER + CILR_NEVER, + /** Peek lock: use existing locks, don't queue new ones */ + CILR_PEEK }; struct cl_io_rw_common { @@ -2283,10 +2285,12 @@ struct cl_io { pgoff_t ft_index; /** bytes valid byte on a faulted page. */ int ft_nob; - /** writable page? */ + /** writable page? for nopage() only */ int ft_writable; /** page of an executable? */ int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; /** resulting page */ struct cl_page *ft_page; } ci_fault; diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 45d8270..732846c 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1276,7 +1276,9 @@ enum obdo_flags { OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ - OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. 
+                                          * XXX: obsoleted - reserved for old
+                                          * clients prior to 2.2 */
        OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
        OBD_FL_NOSPC_BLK    = 0x00100000, /* no more block space on OST */

diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c
index 35b1455..c0798ad 100644
--- a/lustre/lclient/lcommon_cl.c
+++ b/lustre/lclient/lcommon_cl.c
@@ -1025,17 +1025,6 @@ void ccc_req_attr_set(const struct lu_env *env,
         }
         obdo_from_inode(oa, inode, &cl_i2info(inode)->lli_fid,
                         valid_flags & flags);
-#ifdef __KERNEL__
-        /* Bug11742 - set the OBD_FL_MMAP flag for memory mapped files */
-        if (cfs_atomic_read(&(cl_inode2ccc(inode)->cob_mmap_cnt)) != 0) {
-                if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
-                        oa->o_valid |= OBD_MD_FLFLAGS;
-                        oa->o_flags = OBD_FL_MMAP;
-                } else {
-                        oa->o_flags |= OBD_FL_MMAP;
-                }
-        }
-#endif
 }
 
 const struct cl_req_operations ccc_req_ops = {
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index e24aa81..5d95c4e 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -975,6 +975,19 @@ void policy_from_vma(ldlm_policy_data_t *policy, struct vm_area_struct *vma,
                      unsigned long addr, size_t count);
 struct vm_area_struct *our_vma(unsigned long addr, size_t count);
 
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+        struct address_space *mapping = vmpage->mapping;
+        loff_t offset = (loff_t)vmpage->index << PAGE_CACHE_SHIFT;
+
+        LASSERT(PageLocked(vmpage));
+        if (mapping == NULL)
+                return;
+
+        ll_teardown_mmaps(mapping, offset, offset + CFS_PAGE_SIZE);
+        truncate_complete_page(mapping, vmpage);
+}
+
 #define ll_s2sbi(sb)        (s2lsi(sb)->lsi_llsbi)
 
 /* don't need an addref as the sb_info should be holding one */
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
index 981c7d7..821f96f 100644
--- a/lustre/llite/llite_mmap.c
+++ b/lustre/llite/llite_mmap.c
@@ -64,13 +64,6 @@
 #include "llite_internal.h"
 #include <linux/lustre_compat25.h>
 
-#define VMA_DEBUG(vma, fmt, arg...)                                          \
-        CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) "   \
-               "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end,  \
-               vma->vm_pgoff, vma->vm_file->f_dentry->d_inode,               \
-               vma->vm_file->f_dentry->d_inode->i_ino,                       \
-               vma->vm_file->f_dentry->d_iname, ## arg);                     \
-
 struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                        int *type);
 
@@ -128,7 +121,6 @@ struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
 {
         struct file        *file  = vma->vm_file;
         struct inode       *inode = file->f_dentry->d_inode;
-        const unsigned long writable = VM_SHARED|VM_WRITE;
         struct cl_io       *io;
         struct cl_fault_io *fio;
         struct lu_env      *env;
@@ -157,7 +149,6 @@ struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
 
         fio = &io->u.ci_fault;
         fio->ft_index      = index;
-        fio->ft_writable   = (vma->vm_flags&writable) == writable;
         fio->ft_executable = vma->vm_flags&VM_EXEC;
 
         /*
          * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
          * the kernel will not read other pages not covered by ldlm in
          * filemap_nopage. we do our readahead in ll_readpage.
         */
-        *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+        if (ra_flags != NULL)
+                *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
         vma->vm_flags &= ~VM_SEQ_READ;
         vma->vm_flags |= VM_RAND_READ;
 
-        CDEBUG(D_INFO, "vm_flags: %lx (%lu %d %d)\n", vma->vm_flags,
-               fio->ft_index, fio->ft_writable, fio->ft_executable);
+        CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+               fio->ft_index, fio->ft_executable);
 
         if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
                 struct ccc_io *cio = ccc_env_io(env);
@@ -188,6 +180,93 @@ struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
         return io;
 }
 
+/* Shared page_mkwrite() code for the RHEL5 and RHEL6 kernels */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+                            bool *retry)
+{
+        struct lu_env *env;
+        struct cl_io *io;
+        struct vvp_io *vio;
+        struct cl_env_nest nest;
+        int result;
+        ENTRY;
+
+        LASSERT(vmpage != NULL);
+
+        io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
+        if (IS_ERR(io))
+                GOTO(out, result = PTR_ERR(io));
+
+        result = io->ci_result;
+        if (result < 0)
+                GOTO(out, result);
+
+        /* Don't enqueue new locks for page_mkwrite().
+         * If the lock has been cancelled then the page must have been
+         * truncated; in that case the kernel will handle it.
+         */
+        io->ci_lockreq = CILR_PEEK;
+        io->u.ci_fault.ft_mkwrite = 1;
+        io->u.ci_fault.ft_writable = 1;
+
+        vio = vvp_env_io(env);
+        vio->u.fault.ft_vma = vma;
+        vio->u.fault.ft_vmpage = vmpage;
+
+        result = cl_io_loop(env, io);
+
+        if (result == -ENODATA) /* peek failed, no lock caching. */
+                CDEBUG(D_MMAP, "race on page_mkwrite: %lx (%lu %p)\n",
+                       vma->vm_flags, io->u.ci_fault.ft_index, vmpage);
+
+        if (result == 0 || result == -ENODATA) {
+                lock_page(vmpage);
+                if (vmpage->mapping == NULL) {
+                        unlock_page(vmpage);
+
+                        /* page was truncated and lock was cancelled, return
+                         * ENODATA so that VM_FAULT_NOPAGE will be returned
+                         * to handle_mm_fault(). */
+                        if (result == 0)
+                                result = -ENODATA;
+                } else if (result == -ENODATA) {
+                        /* Invalidate it if the cl_lock is being revoked.
+                         * This piece of code is definitely needed for RHEL5;
+                         * otherwise SIGBUS would be wrongly returned to
+                         * applications. */
+                        ll_invalidate_page(vmpage);
+                        LASSERT(vmpage->mapping == NULL);
+                        unlock_page(vmpage);
+                } else if (!PageDirty(vmpage)) {
+                        /* race: the page has been cleaned by ptlrpcd after
+                         * it was unlocked, so it has to be added into the
+                         * dirty cache again, otherwise this soon-to-be-dirty
+                         * page won't consume any grants; even worse, if the
+                         * page is being transferred it will break the RPC
+                         * checksum.
+ */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + } + EXIT; + +out: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + CDEBUG(D_MMAP, "%s mkwrite with %d\n", cfs_current()->comm, result); + + LASSERT(ergo(result == 0, PageLocked(vmpage))); + return(result); +} + + #ifndef HAVE_VM_OP_FAULT /** * Lustre implementation of a vm_operations_struct::nopage() method, called by @@ -214,6 +293,7 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, unsigned long ra_flags; pgoff_t pg_offset; int result; + const unsigned long writable = VM_SHARED|VM_WRITE; ENTRY; pg_offset = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -225,17 +305,21 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, if (result < 0) goto out_err; + io->u.ci_fault.ft_writable = (vma->vm_flags&writable) == writable; + vio = vvp_env_io(env); vio->u.fault.ft_vma = vma; vio->u.fault.nopage.ft_address = address; vio->u.fault.nopage.ft_type = type; + vio->u.fault.ft_vmpage = NULL; result = cl_io_loop(env, io); + page = vio->u.fault.ft_vmpage; + if (result != 0 && page != NULL) + page_cache_release(page); out_err: - if (result == 0) - page = vio->u.fault.ft_vmpage; - else if (result == -ENOMEM) + if (result == -ENOMEM) page = NOPAGE_OOM; vma->vm_flags &= ~VM_RAND_READ; @@ -246,6 +330,36 @@ out_err: RETURN(page); } + +static int ll_page_mkwrite(struct vm_area_struct *vma, struct page *vmpage) +{ + int count = 0; + bool printed = false; + bool retry; + int result; + + do { + retry = false; + result = ll_page_mkwrite0(vma, vmpage, &retry); + + if (!printed && ++count > 16) { + CWARN("app(%s): the page %lu of file %lu is under heavy" + " contention.\n", + current->comm, page_index(vmpage), + vma->vm_file->f_dentry->d_inode->i_ino); + printed = true; + } + } while (retry); + + if (result == 0) + unlock_page(vmpage); + else if (result == -ENODATA) + result = 0; /* kernel will know truncate has happened and + * retry */ + + return result; +} + #else /** * Lustre implementation of a vm_operations_struct::fault() method, called by @@ -258,11 +372,12 @@ out_err: * \retval VM_FAULT_ERROR on general error * \retval NOPAGE_OOM not have memory for allocate new page */ -int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) { struct lu_env *env; struct cl_io *io; struct vvp_io *vio = NULL; + struct page *vmpage; unsigned long ra_flags; struct cl_env_nest nest; int result; @@ -283,21 +398,30 @@ int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) vio->u.fault.fault.ft_vmf = vmf; result = cl_io_loop(env, io); + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + page_cache_release(vmpage); + vmf->page = NULL; + } + fault_ret = vio->u.fault.fault.ft_flags; out_err: - if ((result != 0) && !(fault_ret & VM_FAULT_RETRY)) - fault_ret |= VM_FAULT_ERROR; + if (result != 0 && fault_ret == 0) + fault_ret = VM_FAULT_ERROR; vma->vm_flags |= ra_flags; cl_io_fini(env, io); cl_env_nested_put(&nest, env); + CDEBUG(D_MMAP, "%s fault %d/%d\n", + cfs_current()->comm, fault_ret, result); RETURN(fault_ret); } -int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int count = 0; bool printed = false; @@ -330,6 +454,49 @@ restart: } return result; } + +static int 
ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        int count = 0;
+        bool printed = false;
+        bool retry;
+        int result;
+
+        do {
+                retry = false;
+                result = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+                if (!printed && ++count > 16) {
+                        CWARN("app(%s): the page %lu of file %lu is under heavy"
+                              " contention.\n",
+                              current->comm, vmf->pgoff,
+                              vma->vm_file->f_dentry->d_inode->i_ino);
+                        printed = true;
+                }
+        } while (retry);
+
+        switch (result) {
+        case 0:
+                LASSERT(PageLocked(vmf->page));
+                result = VM_FAULT_LOCKED;
+                break;
+        case -ENODATA:
+        case -EFAULT:
+                result = VM_FAULT_NOPAGE;
+                break;
+        case -ENOMEM:
+                result = VM_FAULT_OOM;
+                break;
+        case -EAGAIN:
+                result = VM_FAULT_RETRY;
+                break;
+        default:
+                result = VM_FAULT_SIGBUS;
+                break;
+        }
+
+        return result;
+}
 #endif
 
 /**
@@ -412,6 +579,7 @@ static struct vm_operations_struct ll_file_vm_ops = {
 #else
         .fault = ll_fault,
 #endif
+        .page_mkwrite = ll_page_mkwrite,
         .open = ll_vm_open,
         .close = ll_vm_close,
 };
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c
index 9f92f9e..c7e6313 100644
--- a/lustre/llite/vvp_io.c
+++ b/lustre/llite/vvp_io.c
@@ -638,45 +638,43 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
                      cfio->nopage.ft_address, (long)cfio->nopage.ft_type);
 
         cfio->ft_vmpage = vmpage;
+        lock_page(vmpage);
 
         return 0;
 }
 #else
 static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 {
-        cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, cfio->fault.ft_vmf);
-
-        if (cfio->fault.ft_vmf->page) {
-                LL_CDEBUG_PAGE(D_PAGE, cfio->fault.ft_vmf->page,
-                               "got addr %p type NOPAGE\n",
-                               cfio->fault.ft_vmf->virtual_address);
-                /*XXX workaround to bug in CLIO - he deadlocked with
-                 lock cancel if page locked  */
-                if (likely(cfio->fault.ft_flags & VM_FAULT_LOCKED)) {
-                        unlock_page(cfio->fault.ft_vmf->page);
-                        cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+        struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+        cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+
+        if (vmf->page) {
+                LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
+                               vmf->virtual_address);
+                if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
+                        lock_page(vmf->page);
+                        cfio->fault.ft_flags |= VM_FAULT_LOCKED;
                 }
 
-                cfio->ft_vmpage = cfio->fault.ft_vmf->page;
+                cfio->ft_vmpage = vmf->page;
                 return 0;
         }
 
-        if (unlikely (cfio->fault.ft_flags & VM_FAULT_ERROR)) {
-                CDEBUG(D_PAGE, "got addr %p - SIGBUS\n",
-                       cfio->fault.ft_vmf->virtual_address);
+        if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) {
+                CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
                 return -EFAULT;
         }
 
-        if (unlikely (cfio->fault.ft_flags & VM_FAULT_NOPAGE)) {
-                CDEBUG(D_PAGE, "got addr %p - OOM\n",
-                       cfio->fault.ft_vmf->virtual_address);
+        if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+                CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
                 return -ENOMEM;
         }
 
-        if (unlikely(cfio->fault.ft_flags & VM_FAULT_RETRY))
+        if (cfio->fault.ft_flags & VM_FAULT_RETRY)
                 return -EAGAIN;
 
-        CERROR("unknow error in page fault!\n");
+        CERROR("unknown error in page fault %d!\n", cfio->fault.ft_flags);
         return -EINVAL;
 }
 
@@ -692,8 +690,8 @@ static int vvp_io_fault_start(const struct lu_env *env,
         struct cl_fault_io *fio = &io->u.ci_fault;
         struct vvp_fault_io *cfio = &vio->u.fault;
         loff_t offset;
-        int kernel_result = 0;
         int result = 0;
+        cfs_page_t *vmpage = NULL;
         struct cl_page *page;
         loff_t size;
         pgoff_t last; /* last page in a file data region */
@@ -711,63 +709,86 @@
         if (result != 0)
                 return result;
 
-        /* must return unlocked page */
-
kernel_result = vvp_io_kernel_fault(cfio); - if (kernel_result != 0) - return kernel_result; - - if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) { - truncate_inode_pages_range(inode->i_mapping, - cl_offset(obj, fio->ft_index), offset); + /* must return locked page */ + if (fio->ft_mkwrite) { + LASSERT(cfio->ft_vmpage != NULL); + lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + return result; } - /* Temporarily lock vmpage to keep cl_page_find() happy. */ - lock_page(cfio->ft_vmpage); + vmpage = cfio->ft_vmpage; + LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); /* Though we have already held a cl_lock upon this page, but * it still can be truncated locally. */ - if (unlikely(cfio->ft_vmpage->mapping == NULL)) { - unlock_page(cfio->ft_vmpage); - + if (unlikely(vmpage->mapping == NULL)) { CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); /* return +1 to stop cl_io_loop() and ll_fault() will catch * and retry. */ - return +1; + GOTO(out, result = +1); } - page = cl_page_find(env, obj, fio->ft_index, cfio->ft_vmpage, - CPT_CACHEABLE); - unlock_page(cfio->ft_vmpage); - if (IS_ERR(page)) { - page_cache_release(cfio->ft_vmpage); - cfio->ft_vmpage = NULL; - return PTR_ERR(page); + page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + /* if page is going to be written, we should add this page into cache + * earlier. */ + if (fio->ft_mkwrite) { + wait_on_page_writeback(vmpage); + if (set_page_dirty(vmpage)) { + struct ccc_page *cp; + + /* vvp_page_assume() calls wait_on_page_writeback(). */ + cl_page_assume(env, io, page); + + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vvp_write_pending(cl2ccc(obj), cp); + + /* Do not set Dirty bit here so that in case IO is + * started before the page is really made dirty, we + * still have chance to detect it. */ + result = cl_page_cache_add(env, io, page, CRT_WRITE); + if (result < 0) { + cl_page_unassume(env, io, page); + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + GOTO(out, result); + } + } } size = i_size_read(inode); last = cl_index(obj, size - 1); + LASSERT(fio->ft_index <= last); if (fio->ft_index == last) /* * Last page is mapped partially. */ fio->ft_nob = size - cl_offset(obj, fio->ft_index); - else + else fio->ft_nob = cl_page_size(obj); - lu_ref_add(&page->cp_reference, "fault", io); - fio->ft_page = page; - /* - * Certain 2.6 kernels return not-NULL from - * filemap_nopage() when page is beyond the file size, - * on the grounds that "An external ptracer can access - * pages that normally aren't accessible.." Don't - * propagate such page fault to the lower layers to - * avoid side-effects like KMS updates. 
- */ - if (fio->ft_index > last) - result = +1; + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + EXIT; +out: + /* return unlocked vmpage to avoid deadlocking */ + unlock_page(vmpage); +#ifdef HAVE_VM_OP_FAULT + cfio->fault.ft_flags &= ~VM_FAULT_LOCKED; +#endif return result; } diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index cf18605..9ebc40d 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -377,7 +377,13 @@ static int cl_lockset_lock_one(const struct lu_env *env, ENTRY; - lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + if (io->ci_lockreq == CILR_PEEK) { + lock = cl_lock_peek(env, io, &link->cill_descr, "io", io); + if (lock == NULL) + lock = ERR_PTR(-ENODATA); + } else + lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + if (!IS_ERR(lock)) { link->cill_lock = lock; cfs_list_move(&link->cill_linkage, &set->cls_curr); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 4c82107..f213c09 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1486,10 +1486,6 @@ static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, return 0; } - /* If this is mmaped file - it can be changed at any time */ - if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags & OBD_FL_MMAP) - return 1; - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? oa->o_flags : 0); new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE, @@ -2225,20 +2221,9 @@ static int brw_interpret(const struct lu_env *env, rc = osc_brw_fini_request(req, rc); CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); if (osc_recoverable_error(rc)) { - /* Only retry once for mmaped files since the mmaped page - * might be modified at anytime. We have to retry at least - * once in case there WAS really a corruption of the page - * on the network, that was not caused by mmap() modifying - * the page. Bug11742 */ - if ((rc == -EAGAIN) && (aa->aa_resends > 0) && - aa->aa_oa->o_valid & OBD_MD_FLFLAGS && - aa->aa_oa->o_flags & OBD_FL_MMAP) { - rc = 0; - } else { - rc = osc_brw_redo_request(req, aa); - if (rc == 0) - RETURN(0); - } + rc = osc_brw_redo_request(req, aa); + if (rc == 0) + RETURN(0); } if (aa->aa_ocapa) { diff --git a/lustre/tests/mmap_sanity.c b/lustre/tests/mmap_sanity.c index 478896e..60f72a9 100644 --- a/lustre/tests/mmap_sanity.c +++ b/lustre/tests/mmap_sanity.c @@ -652,7 +652,7 @@ static int mmap_tst7_func(char *mnt, int rw) rc = errno; goto out; } - buf = mmap(NULL, page_size, + buf = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buf == MAP_FAILED) { perror("mmap"); @@ -688,6 +688,69 @@ static int mmap_tst7(char *mnt) return rc; } +static int mmap_tst8(char *mnt) +{ + char fname[256]; + char *buf = MAP_FAILED; + int fd = -1; + int rc = 0; + pid_t pid; + char xyz[page_size * 2]; + + if (snprintf(fname, 256, "%s/mmap_tst8", mnt) >= 256) { + fprintf(stderr, "dir name too long\n"); + rc = ENAMETOOLONG; + goto out; + } + fd = open(fname, O_RDWR | O_CREAT, 0644); + if (fd == -1) { + perror("open"); + rc = errno; + goto out; + } + if (ftruncate(fd, page_size) == -1) { + perror("truncate"); + rc = errno; + goto out; + } + buf = mmap(NULL, page_size * 2, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (buf == MAP_FAILED) { + perror("mmap"); + rc = errno; + goto out; + } + + pid = fork(); + if (pid == 0) { /* child */ + memcpy(xyz, buf, page_size * 2); + /* shouldn't reach here. 
 */
+                exit(0);
+        } else if (pid > 0) {        /* parent */
+                int status = 0;
+                pid = waitpid(pid, &status, 0);
+                if (pid < 0) {
+                        perror("wait");
+                        rc = errno;
+                        goto out;
+                }
+
+                rc = EFAULT;
+                if (WIFSIGNALED(status) && SIGBUS == WTERMSIG(status))
+                        rc = 0;
+        } else {
+                perror("fork");
+                rc = errno;
+        }
+
+out:
+        if (buf != MAP_FAILED)
+                munmap(buf, page_size * 2);
+        if (fd != -1)
+                close(fd);
+        return rc;
+}
+
 static int remote_tst(int tc, char *mnt)
 {
         int rc = 0;
@@ -705,7 +768,7 @@ static int remote_tst(int tc, char *mnt)
         }
         return rc;
 }
- 
+
 struct test_case {
         int     tc;                     /* test case number */
         char    *desc;                  /* test description */
@@ -724,6 +787,7 @@ struct test_case tests[] = {
         { 6, "mmap test6: check mmap write/read content on two nodes",
                 mmap_tst6, 2 },
         { 7, "mmap test7: file i/o with an unmapped buffer", mmap_tst7, 1},
+        { 8, "mmap test8: SIGBUS for access beyond file size", mmap_tst8, 1},
 
         { 0, NULL, 0, 0 }
 };
-- 
1.8.3.1
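
The ->page_mkwrite() contract that this patch implements for Lustre is easy to
lose inside the CLIO plumbing above, so here is a minimal sketch of it in
generic 2.6.32-era kernel shape (similar in spirit to filemap_page_mkwrite()).
This is not Lustre code: the function name is hypothetical, and the
Lustre-specific steps (CILR_PEEK lock lookup, grant accounting,
cl_page_cache_add()) are reduced to comments pointing back at the patch.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical ->page_mkwrite() handler, 2.6.32-era signature.  The VM
 * calls it before it lets a read-only PTE on a shared mapping become
 * writable, which is exactly the hook the RPC checksum logic needs. */
static int example_page_mkwrite(struct vm_area_struct *vma,
                                struct vm_fault *vmf)
{
        struct page *page = vmf->page;

        lock_page(page);
        if (page->mapping != vma->vm_file->f_mapping) {
                /* Raced with truncate: bail out and let the VM retry the
                 * whole fault.  ll_page_mkwrite() reaches the same state
                 * by mapping -ENODATA to VM_FAULT_NOPAGE. */
                unlock_page(page);
                return VM_FAULT_NOPAGE;
        }

        /* Re-dirty the page before the PTE becomes writable so that dirty
         * accounting sees it.  This mirrors the ft_mkwrite branch in
         * vvp_io_fault_start(): set_page_dirty() plus cl_page_cache_add()
         * is what keeps grants and the RPC checksum consistent for pages
         * modified through mmap. */
        set_page_dirty(page);
        wait_on_page_writeback(page);

        /* Hand the page back still locked; the caller unlocks it. */
        return VM_FAULT_LOCKED;
}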
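
To see the behaviour that mmap_tst8 asserts without a Lustre mount, the
standalone distillation below should behave identically on any local
filesystem, since SIGBUS on touching a mapped page beyond the last
file-backed page (read or write) is generic VM behaviour rather than
anything Lustre-specific.  The file path is illustrative.

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        long page_size = sysconf(_SC_PAGESIZE);
        int fd = open("/tmp/mkwrite_demo", O_RDWR | O_CREAT, 0644);
        char *buf;
        pid_t pid;

        if (fd == -1 || ftruncate(fd, page_size) == -1) {
                perror("open/ftruncate");
                return 1;
        }

        /* Map two pages of a file that only has one. */
        buf = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        pid = fork();
        if (pid == 0) {
                buf[0] = 'x';           /* within EOF: triggers page_mkwrite */
                buf[page_size] = 'x';   /* beyond EOF: SIGBUS expected */
                _exit(0);               /* shouldn't reach here */
        } else if (pid > 0) {
                int status;

                waitpid(pid, &status, 0);
                printf("child %s\n",
                       WIFSIGNALED(status) && WTERMSIG(status) == SIGBUS ?
                       "killed by SIGBUS, as expected" :
                       "was NOT killed by SIGBUS");
        } else {
                perror("fork");
        }

        munmap(buf, page_size * 2);
        close(fd);
        unlink("/tmp/mkwrite_demo");
        return 0;
}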