From 472ca19feae4c8f556cc7e65ddce84b54538646d Mon Sep 17 00:00:00 2001 From: phil Date: Sun, 24 Oct 2004 00:24:40 +0000 Subject: [PATCH] back the mmap code out of b1_2 a complete backout patch will be attached to bug 3918 --- lnet/archdep.m4 | 10 - lnet/include/linux/libcfs.h | 1 - lnet/utils/debug.c | 2 +- lustre/include/linux/lustre_compat25.h | 11 - .../patches/export-zap-page-range.patch | 12 - lustre/llite/Makefile.in | 4 +- lustre/llite/Makefile.mk | 2 +- lustre/llite/file.c | 60 +-- lustre/llite/llite_internal.h | 35 +- lustre/llite/llite_mmap.c | 482 --------------------- lustre/llite/lproc_llite.c | 19 +- lustre/llite/rw.c | 59 +-- lustre/llite/rw24.c | 45 +- lustre/llite/rw26.c | 42 +- lustre/portals/archdep.m4 | 10 - lustre/portals/include/linux/libcfs.h | 1 - lustre/portals/utils/debug.c | 2 +- lustre/tests/sanityN.sh | 4 +- lustre/utils/lconf | 1 - 19 files changed, 127 insertions(+), 675 deletions(-) delete mode 100644 lustre/kernel_patches/patches/export-zap-page-range.patch delete mode 100644 lustre/llite/llite_mmap.c diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index 94fa984..27704bd 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -436,16 +436,6 @@ if test x$enable_modules != xno ; then AC_MSG_RESULT([no]) ]) - # --------- zap_page_range(vma) -------------------------------- - AC_MSG_CHECKING([if zap_pag_range with vma parameter]) - ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`" - if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then - AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter]) - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - fi - # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- # This needs to run after we've defined the KCPPFLAGS diff --git a/lnet/include/linux/libcfs.h b/lnet/include/linux/libcfs.h index 301dca1..66ee471 100644 --- a/lnet/include/linux/libcfs.h +++ b/lnet/include/linux/libcfs.h @@ -89,7 +89,6 @@ struct ptldebug_header { #define D_RPCTRACE 0x00100000 /* for distributed debugging */ #define D_VFSTRACE 0x00200000 #define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 #ifdef __KERNEL__ # include /* THREAD_SIZE */ diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 300437f..538af44 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -74,7 +74,7 @@ static const char *portal_debug_masks[] = {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", "blocks", "net", "warning", "buffs", "other", "dentry", "portals", "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", - "reada", "mmap", NULL}; + "reada", NULL}; struct debug_daemon_cmd { char *cmd; diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 6167f2f..95c462f 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -216,17 +216,6 @@ static inline void cond_resched(void) #define PageWriteback(page) 0 #define end_page_writeback(page) -static inline int mapping_mapped(struct address_space *mapping) -{ - return mapping->i_mmap_shared ? 
1 : 0; -} - -#ifdef ZAP_PAGE_RANGE_VMA -#define ll_zap_page_range(vma, addr, len) zap_page_range(vma, addr, len) -#else -#define ll_zap_page_range(vma, addr, len) zap_page_range(vma->vm_mm, addr, len) -#endif - #endif /* end of 2.4 compat macros */ #ifdef HAVE_PAGE_LIST diff --git a/lustre/kernel_patches/patches/export-zap-page-range.patch b/lustre/kernel_patches/patches/export-zap-page-range.patch deleted file mode 100644 index 9b9d48f..0000000 --- a/lustre/kernel_patches/patches/export-zap-page-range.patch +++ /dev/null @@ -1,12 +0,0 @@ -Index: linux-2.4.24-l36mmap/mm/memory.c -=================================================================== ---- linux-2.4.24-l36mmap.orig/mm/memory.c 2004-05-27 17:44:13.000000000 -0700 -+++ linux-2.4.24-l36mmap/mm/memory.c 2004-05-27 17:45:07.000000000 -0700 -@@ -411,6 +411,7 @@ - mm->rss = 0; - spin_unlock(&mm->page_table_lock); - } -+EXPORT_SYMBOL_GPL(zap_page_range); - - /* - * Do a quick page-table lookup for a single page. diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 4daad42..9492120 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -1,5 +1,5 @@ MODULES := llite -llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o +llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o ifeq ($(PATCHLEVEL),4) llite-objs += rw24.o super.o @@ -7,4 +7,4 @@ else llite-objs += rw26.o super25.o endif -@INCLUDE_RULES@ +@INCLUDE_RULES@ \ No newline at end of file diff --git a/lustre/llite/Makefile.mk b/lustre/llite/Makefile.mk index dabbd9e..06dd10e 100644 --- a/lustre/llite/Makefile.mk +++ b/lustre/llite/Makefile.mk @@ -8,4 +8,4 @@ include $(src)/../portals/Kernelenv obj-y += llite.o llite-objs := llite_lib.o dcache.o super.o rw.o \ super25.o file.o dir.o symlink.o namei.o lproc_llite.o \ - rw26.o llite_nfs.o llite_close.o special.o llite_mmap.o + rw26.o llite_nfs.o llite_close.o special.o diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 338353e..ef1f0a5 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -361,7 +361,7 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, if (end < tmpex.l_extent.end >> PAGE_CACHE_SHIFT) end = ~0; - i = inode->i_size ? (inode->i_size - 1) >> PAGE_CACHE_SHIFT : 0; + i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; if (i < end) end = i; @@ -369,19 +369,6 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, "count: %lu skip: %lu end: %lu%s\n", start, start % count, count, skip, end, discard ? " (DISCARDING)" : ""); - /* walk through the vmas on the inode and tear down mmaped pages that - * intersect with the lock. this stops immediately if there are no - * mmap()ed regions of the file. This is not efficient at all and - * should be short lived. We'll associate mmap()ed pages with the lock - * and will be able to find them directly */ - for (i = start; i <= end; i += (j + skip)) { - j = min(count - (i % count), end - i + 1); - LASSERT(inode->i_mapping); - if (ll_teardown_mmaps(inode->i_mapping, i << PAGE_CACHE_SHIFT, - ((i+j) << PAGE_CACHE_SHIFT) - 1) ) - break; - } - /* this is the simplistic implementation of page eviction at * cancelation. It is careful to get races with other page * lockers handled correctly. 
fixes from bug 20 will make it @@ -740,11 +727,12 @@ int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { + struct ll_file_data *fd = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - struct ll_lock_tree tree; - struct ll_lock_tree_node *node; + struct lustre_handle lockh = { 0 }; + ldlm_policy_data_t policy; int rc; ssize_t retval; __u64 kms; @@ -763,13 +751,10 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, if (!lsm) RETURN(0); - node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, - LCK_PR); - - tree.lt_fd = filp->private_data; + policy.l_extent.start = *ppos; + policy.l_extent.end = *ppos + count - 1; - rc = ll_tree_lock(&tree, node, inode, buf, count, - filp->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0); + rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, 0); if (rc != 0) RETURN(rc); @@ -796,7 +781,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, retval = generic_file_read(filp, buf, count, ppos); out: - ll_tree_unlock(&tree, inode); + ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); RETURN(retval); } @@ -806,10 +791,11 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { + struct ll_file_data *fd = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - struct ll_lock_tree tree; - struct ll_lock_tree_node *node; + struct lustre_handle lockh = { 0 }; + ldlm_policy_data_t policy; loff_t maxbytes = ll_file_maxbytes(inode); ssize_t retval; int rc; @@ -830,18 +816,15 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, LASSERT(lsm); - if (file->f_flags & O_APPEND) - node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW); - else - node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, - LCK_PW); - if (IS_ERR(node)) - RETURN(PTR_ERR(node)); - - tree.lt_fd = file->private_data; + if (file->f_flags & O_APPEND) { + policy.l_extent.start = 0; + policy.l_extent.end = OBD_OBJECT_EOF; + } else { + policy.l_extent.start = *ppos; + policy.l_extent.end = *ppos + count - 1; + } - rc = ll_tree_lock(&tree, node, inode, buf, count, - file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0); + rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, 0); if (rc != 0) RETURN(rc); @@ -866,8 +849,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, retval = generic_file_write(file, buf, count, ppos); out: - ll_tree_unlock(&tree, inode); - /* serialize with mmap/munmap/mremap */ + ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh); lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES, retval > 0 ? 
retval : 0); RETURN(retval); @@ -1394,7 +1376,7 @@ struct file_operations ll_file_operations = { .ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, - .mmap = ll_file_mmap, + .mmap = generic_file_mmap, .llseek = ll_file_seek, #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) .sendfile = generic_file_sendfile, diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index a3ad73b..e5352ed 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -134,20 +134,10 @@ struct ll_async_page { /* only trust these if the page lock is providing exclusion */ unsigned llap_write_queued:1, llap_defer_uptodate:1, - llap_origin:3, llap_ra_used:1; struct list_head llap_proc_item; }; -enum { - LLAP_ORIGIN_UNKNOWN = 0, - LLAP_ORIGIN_READPAGE, - LLAP_ORIGIN_READAHEAD, - LLAP_ORIGIN_COMMIT_WRITE, - LLAP_ORIGIN_WRITEPAGE, - LLAP__ORIGIN_MAX, -}; - /* llite/lproc_llite.c */ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, struct super_block *sb, char *osc, char *mdc); @@ -173,13 +163,12 @@ void ll_prepare_mdc_op_data(struct mdc_op_data *, /* llite/rw.c */ int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to); int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); -int ll_writepage(struct page *page); void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa); void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc); void ll_removepage(struct page *page); int ll_readpage(struct file *file, struct page *page); struct ll_async_page *llap_from_cookie(void *cookie); -struct ll_async_page *llap_from_page(struct page *page, unsigned origin); +struct ll_async_page *llap_from_page(struct page *page); struct ll_async_page *llap_cast_private(struct page *page); void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); void ll_ra_accounting(struct page *page, struct address_space *mapping); @@ -277,28 +266,6 @@ void ll_queue_done_writing(struct inode *inode); void ll_close_thread_shutdown(struct ll_close_queue *lcq); int ll_close_thread_start(struct ll_close_queue **lcq_ret); -/* llite/llite_mmap.c */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -typedef struct rb_root rb_root_t; -typedef struct rb_node rb_node_t; -#endif - -struct ll_lock_tree_node; -struct ll_lock_tree { - rb_root_t lt_root; - struct list_head lt_locked_list; - struct ll_file_data *lt_fd; -}; -int ll_teardown_mmaps(struct address_space *mapping, __u64 first, - __u64 last); -int ll_file_mmap(struct file * file, struct vm_area_struct * vma); -struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, - __u64 end, ldlm_mode_t mode); -int ll_tree_lock(struct ll_lock_tree *tree, - struct ll_lock_tree_node *first_node, struct inode *inode, - const char *buf, size_t count, int ast_flags); -int ll_tree_unlock(struct ll_lock_tree *tree, struct inode *inode); - #define LL_SBI_NOLCK 0x1 #define LL_SBI_READAHEAD 0x2 diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c deleted file mode 100644 index 9e34556..0000000 --- a/lustre/llite/llite_mmap.c +++ /dev/null @@ -1,482 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#include -#endif - - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include "llite_internal.h" -#include - -struct ll_lock_tree_node { - rb_node_t lt_node; - struct list_head lt_locked_item; - __u64 lt_oid; - ldlm_policy_data_t lt_policy; - struct lustre_handle lt_lockh; - ldlm_mode_t lt_mode; -}; - -__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms); -int lt_get_mmap_locks(struct ll_lock_tree *tree, struct inode *inode, - unsigned long addr, size_t count); - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, - int *type); -#else - -struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, - int unused); -#endif - -struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, - __u64 end, ldlm_mode_t mode) -{ - struct ll_lock_tree_node *node; - - OBD_ALLOC(node, sizeof(*node)); - if (node == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id; - node->lt_policy.l_extent.start = start; - node->lt_policy.l_extent.end = end; - memset(&node->lt_lockh, 0, sizeof(node->lt_lockh)); - INIT_LIST_HEAD(&node->lt_locked_item); - node->lt_mode = mode; - - return node; -} - -int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two) -{ - if ( one->lt_oid < two->lt_oid) - return -1; - if ( one->lt_oid > two->lt_oid) - return 1; - - if ( one->lt_policy.l_extent.end < two->lt_policy.l_extent.start ) - return -1; - if ( one->lt_policy.l_extent.start > two->lt_policy.l_extent.end ) - return 1; - - return 0; /* they are the same object and overlap */ -} - -static void lt_merge(struct ll_lock_tree_node *dst, - struct ll_lock_tree_node *src) -{ - dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start, - src->lt_policy.l_extent.start); - dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end, - src->lt_policy.l_extent.end); - - /* XXX could be a real call to the dlm to find superset modes */ - if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW) - dst->lt_mode = LCK_PW; -} - -static void lt_insert(struct ll_lock_tree *tree, - struct ll_lock_tree_node *node) -{ - struct ll_lock_tree_node *walk; - rb_node_t **p, *parent; - ENTRY; - -restart: - p = &tree->lt_root.rb_node; - parent = NULL; - while (*p) { - parent = *p; - walk = rb_entry(parent, struct ll_lock_tree_node, lt_node); - switch (lt_compare(node, walk)) { - case -1: - p = &(*p)->rb_left; - break; - case 1: - p = &(*p)->rb_right; - break; - case 0: - lt_merge(node, walk); - rb_erase(&walk->lt_node, &tree->lt_root); - OBD_FREE(walk, sizeof(*walk)); - goto restart; - break; - default: - LBUG(); - break; 
- } - } - rb_link_node(&node->lt_node, parent, p); - rb_insert_color(&node->lt_node, &tree->lt_root); - EXIT; -} - -static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree) -{ - rb_node_t *rbnode; - struct ll_lock_tree_node *node = NULL; - - for ( rbnode = tree->lt_root.rb_node; rbnode != NULL; - rbnode = rbnode->rb_left) { - if (rbnode->rb_left == NULL) { - node = rb_entry(rbnode, struct ll_lock_tree_node, - lt_node); - break; - } - } - RETURN(node); -} - -int ll_tree_unlock(struct ll_lock_tree *tree, struct inode *inode) -{ - struct ll_lock_tree_node *node; - struct list_head *pos, *n; - int rc = 0; - ENTRY; - - list_for_each_safe(pos, n, &tree->lt_locked_list) { - node = list_entry(pos, struct ll_lock_tree_node, - lt_locked_item); - - rc = ll_extent_unlock(tree->lt_fd, inode, - ll_i2info(inode)->lli_smd, node->lt_mode, - &node->lt_lockh); - if (rc != 0) { - /* XXX better message */ - CERROR("couldn't unlock %d\n", rc); - } - list_del(&node->lt_locked_item); - OBD_FREE(node, sizeof(*node)); - } - - while ((node = lt_least_node(tree))) { - rb_erase(&node->lt_node, &tree->lt_root); - OBD_FREE(node, sizeof(*node)); - } - - RETURN(rc); -} - -int ll_tree_lock(struct ll_lock_tree *tree, - struct ll_lock_tree_node *first_node, struct inode *inode, - const char *buf, size_t count, int ast_flags) -{ - struct ll_lock_tree_node *node; - int rc = 0; - ENTRY; - - tree->lt_root.rb_node = NULL; - INIT_LIST_HEAD(&tree->lt_locked_list); - if (first_node != NULL) - lt_insert(tree, first_node); - - if (mapping_mapped(inode->i_mapping)) { - rc = lt_get_mmap_locks(tree, inode, (unsigned long)buf, count); - if (rc) - GOTO(out, rc); - } - - while ((node = lt_least_node(tree))) { - rc = ll_extent_lock(tree->lt_fd, inode, - ll_i2info(inode)->lli_smd, node->lt_mode, - &node->lt_policy, &node->lt_lockh, - ast_flags); - if (rc != 0) - GOTO(out, rc); - - rb_erase(&node->lt_node, &tree->lt_root); - list_add_tail(&node->lt_locked_item, &tree->lt_locked_list); - } - RETURN(rc); -out: - ll_tree_unlock(tree, inode); - RETURN(rc); -} - -static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma) -{ - /* we only want to hold PW locks if the mmap() can generate - * writes back to the file and that only happens in shared - * writable vmas */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return LCK_PW; - return LCK_PR; -} - -static void policy_from_vma(ldlm_policy_data_t *policy, - struct vm_area_struct *vma, unsigned long addr, - size_t count) -{ - policy->l_extent.start = ((addr - vma->vm_start) & PAGE_CACHE_MASK) + - (vma->vm_pgoff << PAGE_CACHE_SHIFT); - policy->l_extent.end = (policy->l_extent.start + count - 1) | - (PAGE_CACHE_SIZE - 1); -} - -static struct vm_area_struct * our_vma(unsigned long addr, size_t count) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *ret = NULL; - ENTRY; - - spin_lock(&mm->page_table_lock); - for(vma = find_vma(mm, addr); - vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { - if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage) { - ret = vma; - break; - } - } - spin_unlock(&mm->page_table_lock); - RETURN(ret); -} - -int lt_get_mmap_locks(struct ll_lock_tree *tree, struct inode *inode, - unsigned long addr, size_t count) -{ - struct vm_area_struct *vma; - struct ll_lock_tree_node *node; - ldlm_policy_data_t policy; - ENTRY; - - if (count == 0) - RETURN(0); - - /* we need to look up vmas on page aligned addresses */ - count += addr & (PAGE_SIZE - 1); - addr -= addr & (PAGE_SIZE - 1); - - while ((vma = 
our_vma(addr, count)) != NULL) { - - policy_from_vma(&policy, vma, addr, count); - node = ll_node_from_inode(inode, policy.l_extent.start, - policy.l_extent.end, - mode_from_vma(vma)); - if (IS_ERR(node)) { - CERROR("not enough mem for lock_tree_node!\n"); - RETURN(-ENOMEM); - } - lt_insert(tree, node); - - if (vma->vm_end - addr >= count) - break; - count -= vma->vm_end - addr; - addr = vma->vm_end; - } - RETURN(0); -} - -/* FIXME: there is a pagefault race goes as follow: - * 1. A user process on node A accesses a portion of a mapped file, - * resulting in a page fault. The pagefault handler invokes the - * ll_nopage function, which reads the page into memory. - * 2. A user process on node B writes to the same portion of the file - * (either via mmap or write()), that cause node A to cancel the - * lock and truncate the page. - * 3. Node A then executes the rest of do_no_page(), entering the - * now-invalid page into the PTEs. - * - * Make the whole do_no_page as a hook to cover both the page cache - * and page mapping installing with dlm lock would eliminate this race. - */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, - int *type) -#else -struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, - int unused) -#endif -{ - struct file *filp = vma->vm_file; - struct ll_file_data *fd = filp->private_data; - struct inode *inode = filp->f_dentry->d_inode; - struct lustre_handle lockh = { 0 }; - ldlm_policy_data_t policy; - ldlm_mode_t mode; - struct page *page; - __u64 kms; - unsigned long pgoff, size, rand_read, seq_read; - int rc = 0; - ENTRY; - - if (ll_i2info(inode)->lli_smd == NULL) { - CERROR("No lsm on fault?\n"); - RETURN(NULL); - } - - /* start and end the lock on the first and last bytes in the page */ - policy_from_vma(&policy, vma, address, PAGE_CACHE_SIZE); - - CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n", - vma, inode->i_ino, policy.l_extent.start, - policy.l_extent.end); - - mode = mode_from_vma(vma); - - rc = ll_extent_lock(fd, inode, ll_i2info(inode)->lli_smd, mode, &policy, - &lockh, LDLM_FL_CBPENDING); - if (rc != 0) - RETURN(NULL); - - /* XXX change inode size without i_sem hold! there is a race condition - * with truncate path. (see ll_extent_lock) */ - kms = lov_merge_size(ll_i2info(inode)->lli_smd, 1); - pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; - size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - if (pgoff >= size) - ll_glimpse_size(inode); - else - inode->i_size = kms; - - /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that - * the kernel will not read other pages not covered by ldlm in - * filemap_nopage. we do our readahead in ll_readpage. 
- */ - rand_read = vma->vm_flags & VM_RAND_READ; - seq_read = vma->vm_flags & VM_SEQ_READ; - vma->vm_flags &= ~ VM_SEQ_READ; - vma->vm_flags |= VM_RAND_READ; - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - page = filemap_nopage(vma, address, type); -#else - page = filemap_nopage(vma, address, unused); -#endif - vma->vm_flags &= ~VM_RAND_READ; - vma->vm_flags |= (rand_read | seq_read); - - ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh); - RETURN(page); -} - -/* return the user space pointer that maps to a file offset via a vma */ -static inline unsigned long file_to_user(struct vm_area_struct *vma, - __u64 byte) -{ - return vma->vm_start + - (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT)); - -} - -#define VMA_DEBUG(vma, fmt, arg...) \ - CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld): " fmt, \ - vma, vma->vm_start, vma->vm_end, vma->vm_pgoff, ## arg); - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -/* [first, last] are the byte offsets affected. - * vm_{start, end} are user addresses of the first byte of the mapping and - * the next byte beyond it - * vm_pgoff is the page index of the first byte in the mapping */ -static void teardown_vmas(struct vm_area_struct *vma, __u64 first, - __u64 last) -{ - unsigned long address, len; - for (; vma ; vma = vma->vm_next_share) { - if (last >> PAGE_CACHE_SHIFT < vma->vm_pgoff) - continue; - if (first >> PAGE_CACHE_SHIFT > (vma->vm_pgoff + - ((vma->vm_end - vma->vm_start) >> PAGE_CACHE_SHIFT))) - continue; - - address = max((unsigned long)vma->vm_start, - file_to_user(vma, first)); - len = min((unsigned long)vma->vm_end, - file_to_user(vma, last) + 1) - address; - - VMA_DEBUG(vma, "zapping vma [address=%ld len=%ld]\n", - address, len); - LASSERT(vma->vm_mm); - ll_zap_page_range(vma, address, len); - } -} -#endif - -/* XXX put nice comment here. 
talk about __free_pte -> dirty pages and - * nopage's reference passing to the pte */ -int ll_teardown_mmaps(struct address_space *mapping, __u64 first, - __u64 last) -{ - int rc = -ENOENT; - ENTRY; - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - if (mapping_mapped(mapping)) { - rc = 0; - unmap_mapping_range(mapping, first + PAGE_SIZE - 1, - last - first + 1, 1); - } -#else - spin_lock(&mapping->i_shared_lock); - if (mapping->i_mmap != NULL) { - rc = 0; - teardown_vmas(mapping->i_mmap, first, last); - } - if (mapping->i_mmap_shared != NULL) { - rc = 0; - teardown_vmas(mapping->i_mmap_shared, first, last); - } - spin_unlock(&mapping->i_shared_lock); -#endif - RETURN(rc); -} - -static struct vm_operations_struct ll_file_vm_ops = { - .nopage = ll_nopage, -}; - -int ll_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - int rc; - ENTRY; - - rc = generic_file_mmap(file, vma); - if (rc == 0) - vma->vm_ops = &ll_file_vm_ops; - - RETURN(rc); -} diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 59113dd..d390eab 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -494,7 +494,7 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement * it in our own state */ if (dummy_llap->llap_magic == 0) { - seq_printf(seq, "generation | llap .cookie origin | page "); + seq_printf(seq, "generation | llap .cookie | page "); seq_printf(seq, "inode .index [ page flags ]\n"); return 0; } @@ -505,21 +505,10 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) if (llap != NULL) { int has_flags = 0; struct page *page = llap->llap_page; - static char *origins[] = { - [LLAP_ORIGIN_UNKNOWN] = "--", - [LLAP_ORIGIN_READPAGE] = "rp", - [LLAP_ORIGIN_READAHEAD] = "ra", - [LLAP_ORIGIN_COMMIT_WRITE] = "cw", - [LLAP_ORIGIN_WRITEPAGE] = "wp", - }; - - LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n", - llap->llap_origin); - - seq_printf(seq, "%lu | %p %p %s | %p %p %lu [", + + seq_printf(seq, "%lu | %p %p | %p %p %lu [", sbi->ll_pglist_gen, - llap, llap->llap_cookie, - origins[llap->llap_origin], + llap, llap->llap_cookie, page, page->mapping->host, page->index); seq_page_flag(seq, page, locked, has_flags); seq_page_flag(seq, page, error, has_flags); diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index c9f7637..bdc9b10 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -360,7 +360,7 @@ struct ll_async_page *llap_cast_private(struct page *page) } /* XXX have the exp be an argument? 
*/ -struct ll_async_page *llap_from_page(struct page *page, unsigned origin) +struct ll_async_page *llap_from_page(struct page *page) { struct ll_async_page *llap; struct obd_export *exp; @@ -369,11 +369,9 @@ struct ll_async_page *llap_from_page(struct page *page, unsigned origin) int rc; ENTRY; - LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin); - llap = llap_cast_private(page); if (llap != NULL) - GOTO(out, llap); + RETURN(llap); exp = ll_i2obdexp(page->mapping->host); if (exp == NULL) @@ -391,8 +389,6 @@ struct ll_async_page *llap_from_page(struct page *page, unsigned origin) RETURN(ERR_PTR(rc)); } - LL_CDEBUG_PAGE(D_PAGE, page, "obj off "LPU64"\n", - (obd_off)page->index << PAGE_SHIFT); CDEBUG(D_CACHE, "llap %p page %p cookie %p obj off "LPU64"\n", llap, page, llap->llap_cookie, (obd_off)page->index << PAGE_SHIFT); /* also zeroing the PRIVBITS low order bitflags */ @@ -404,8 +400,6 @@ struct ll_async_page *llap_from_page(struct page *page, unsigned origin) list_add_tail(&llap->llap_proc_item, &sbi->ll_pglist); spin_unlock(&sbi->ll_lock); -out: - llap->llap_origin = origin; RETURN(llap); } @@ -498,7 +492,7 @@ int ll_commit_write(struct file *file, struct page *page, unsigned from, CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n", inode, page, from, to, page->index); - llap = llap_from_page(page, LLAP_ORIGIN_COMMIT_WRITE); + llap = llap_from_page(page); if (IS_ERR(llap)) RETURN(PTR_ERR(llap)); @@ -566,43 +560,6 @@ static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) spin_unlock(&sbi->ll_lock); } -int ll_writepage(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct obd_export *exp; - struct ll_async_page *llap; - int rc = 0; - ENTRY; - - LASSERT(!PageDirty(page)); - LASSERT(PageLocked(page)); - - exp = ll_i2obdexp(inode); - if (exp == NULL) - GOTO(out, rc = -EINVAL); - - llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE); - if (IS_ERR(llap)) - GOTO(out, rc = PTR_ERR(llap)); - - page_cache_get(page); - if (llap->llap_write_queued) { - LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); - rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL, - llap->llap_cookie, - ASYNC_READY | ASYNC_URGENT); - } else { - rc = queue_or_sync_write(exp, inode, llap, PAGE_SIZE, - ASYNC_READY | ASYNC_URGENT); - } - if (rc) - page_cache_release(page); -out: - if (rc) - unlock_page(page); - RETURN(rc); -} - /* called for each page in a completed rpc.*/ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) { @@ -683,7 +640,7 @@ void ll_removepage(struct page *page) return; } - llap = llap_from_page(page, 0); + llap = llap_from_page(page); if (IS_ERR(llap)) { CERROR("page %p ind %lu couldn't find llap: %ld\n", page, page->index, PTR_ERR(llap)); @@ -770,7 +727,7 @@ void ll_ra_accounting(struct page *page, struct address_space *mapping) { struct ll_async_page *llap; - llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE); + llap = llap_from_page(page); if (IS_ERR(llap)) return; @@ -847,7 +804,7 @@ static int ll_readahead(struct ll_readahead_state *ras, /* we do this first so that we can see the page in the /proc * accounting */ - llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD); + llap = llap_from_page(page); if (IS_ERR(llap) || llap->llap_defer_uptodate) goto next_page; @@ -1026,7 +983,7 @@ int ll_readpage(struct file *filp, struct page *page) if (exp == NULL) GOTO(out, rc = -EINVAL); - llap = llap_from_page(page, LLAP_ORIGIN_READPAGE); + llap = llap_from_page(page); if (IS_ERR(llap)) GOTO(out, rc = PTR_ERR(llap)); @@ -1054,10 
+1011,12 @@ int ll_readpage(struct file *filp, struct page *page) } if (rc == 0) { +#if 0 CWARN("ino %lu page %lu (%llu) not covered by " "a lock (mmap?). check debug logs.\n", inode->i_ino, page->index, (long long)page->index << PAGE_CACHE_SHIFT); +#endif } rc = ll_issue_page_read(exp, llap, oig, 0); diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index d8c98f0..3993af4 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -49,6 +49,49 @@ #include "llite_internal.h" #include +static int ll_writepage_24(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct obd_export *exp; + struct ll_async_page *llap; + int rc = 0; + ENTRY; + + LASSERT(!PageDirty(page)); + LASSERT(PageLocked(page)); + + exp = ll_i2obdexp(inode); + if (exp == NULL) + GOTO(out, rc = -EINVAL); + + llap = llap_from_page(page); + if (IS_ERR(llap)) + GOTO(out, rc = PTR_ERR(llap)); + + page_cache_get(page); + if (llap->llap_write_queued) { + LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); + rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL, + llap->llap_cookie, + ASYNC_READY | ASYNC_URGENT); + } else { + llap->llap_write_queued = 1; + rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL, + llap->llap_cookie, OBD_BRW_WRITE, 0, 0, + 0, ASYNC_READY | ASYNC_URGENT); + if (rc == 0) + LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n"); + else + llap->llap_write_queued = 0; + } + if (rc) + page_cache_release(page); +out: + if (rc) + unlock_page(page); + RETURN(rc); +} + static int ll_direct_IO_24(int rw, #ifdef HAVE_DIO_FILE struct file *file, @@ -137,7 +180,7 @@ static int ll_direct_IO_24(int rw, struct address_space_operations ll_aops = { .readpage = ll_readpage, .direct_IO = ll_direct_IO_24, - .writepage = ll_writepage, + .writepage = ll_writepage_24, .prepare_write = ll_prepare_write, .commit_write = ll_commit_write, .removepage = ll_removepage, diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 07b0d45..71964de 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -53,7 +53,47 @@ static int ll_writepage_26(struct page *page, struct writeback_control *wbc) { - return ll_writepage(page); + struct inode *inode = page->mapping->host; + struct obd_export *exp; + struct ll_async_page *llap; + int rc; + ENTRY; + + LASSERT(!PageDirty(page)); + LASSERT(PageLocked(page)); + + exp = ll_i2obdexp(inode); + if (exp == NULL) + GOTO(out, rc = -EINVAL); + + llap = llap_from_page(page); + if (IS_ERR(llap)) + GOTO(out, rc = PTR_ERR(llap)); + + page_cache_get(page); + if (llap->llap_write_queued) { + LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); + rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL, + llap->llap_cookie, + ASYNC_READY | ASYNC_URGENT); + } else { + llap->llap_write_queued = 1; + rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL, + llap->llap_cookie, OBD_BRW_WRITE, 0, 0, + 0, ASYNC_READY | ASYNC_URGENT); + if (rc == 0) + LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n"); + else + llap->llap_write_queued = 0; + } + if (rc) + page_cache_release(page); +out: + if (rc) + unlock_page(page); + else + set_page_writeback(page); + RETURN(rc); } /* It is safe to not check anything in invalidatepage/releasepage below diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index 94fa984..27704bd 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -436,16 +436,6 @@ if test x$enable_modules != xno ; then AC_MSG_RESULT([no]) ]) - # --------- zap_page_range(vma) -------------------------------- - AC_MSG_CHECKING([if 
zap_pag_range with vma parameter]) - ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`" - if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then - AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter]) - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - fi - # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- # This needs to run after we've defined the KCPPFLAGS diff --git a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h index 301dca1..66ee471 100644 --- a/lustre/portals/include/linux/libcfs.h +++ b/lustre/portals/include/linux/libcfs.h @@ -89,7 +89,6 @@ struct ptldebug_header { #define D_RPCTRACE 0x00100000 /* for distributed debugging */ #define D_VFSTRACE 0x00200000 #define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 #ifdef __KERNEL__ # include /* THREAD_SIZE */ diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c index 300437f..538af44 100644 --- a/lustre/portals/utils/debug.c +++ b/lustre/portals/utils/debug.c @@ -74,7 +74,7 @@ static const char *portal_debug_masks[] = {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", "blocks", "net", "warning", "buffs", "other", "dentry", "portals", "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", - "reada", "mmap", NULL}; + "reada", NULL}; struct debug_daemon_cmd { char *cmd; diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index e6c797d..7df1454 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -346,7 +346,7 @@ test_15() { # bug 974 - ENOSPC run_test 15 "test out-of-space with multiple writers ===========" test_16() { - fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile + fsx -R -W -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile } run_test 16 "2500 iterations of dual-mount fsx =================" @@ -375,7 +375,7 @@ test_18() { ./mmap_sanity -d $MOUNT1 -m $MOUNT2 sync; sleep 1; sync } -run_test 18 "mmap sanity check =================================" +#run_test 18 "mmap sanity check =================================" test_19() { # bug3811 [ -d /proc/fs/lustre/obdfilter ] || return 0 diff --git a/lustre/utils/lconf b/lustre/utils/lconf index cb8d14a..0a1d5bc 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -88,7 +88,6 @@ ptldebug_names = { "rpctrace" : (1 << 20), "vfstrace" : (1 << 21), "reada" : (1 << 22), - "mmap" : (1 << 23), } subsystem_names = { -- 1.8.3.1
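
For readers tracing the locking semantics this backout restores: the deleted
policy_from_vma() in llite_mmap.c locked page-aligned file extents derived
from the faulting VMA, while the reinstated ll_file_read()/ll_file_write()
paths in file.c lock the exact byte range of the I/O via ll_extent_lock().
Below is a minimal userspace sketch of both extent computations, assuming
4096-byte pages; the struct and helper names (fault_extent, io_extent) are
illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SIZE  4096ULL
#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

struct extent { uint64_t start, end; };

/* What the removed policy_from_vma() computed for a fault at 'addr' in a
 * mapping starting at 'vm_start' whose first byte sits 'vm_pgoff' pages
 * into the file: the page-aligned offset of the fault within the mapping,
 * shifted to a file offset, with the end rounded out to a page boundary. */
static struct extent fault_extent(uint64_t addr, uint64_t vm_start,
                                  uint64_t vm_pgoff, uint64_t count)
{
        struct extent e;
        e.start = ((addr - vm_start) & PAGE_CACHE_MASK) +
                  (vm_pgoff << PAGE_CACHE_SHIFT);
        e.end = (e.start + count - 1) | (PAGE_CACHE_SIZE - 1);
        return e;
}

/* What the restored read/write paths lock: the exact byte range
 * [*ppos, *ppos + count - 1] of the I/O, with no page rounding. */
static struct extent io_extent(uint64_t ppos, uint64_t count)
{
        struct extent e = { ppos, ppos + count - 1 };
        return e;
}

int main(void)
{
        struct extent f = fault_extent(0x40001234, 0x40000000, 16,
                                       PAGE_CACHE_SIZE);
        struct extent r = io_extent(100, 65536);

        printf("fault lock [%llu, %llu]\n",
               (unsigned long long)f.start, (unsigned long long)f.end);
        printf("read lock  [%llu, %llu]\n",
               (unsigned long long)r.start, (unsigned long long)r.end);

        /* D_MMAP (0x00800000) removed from libcfs.h and the "mmap" entry
         * (1 << 23) removed from lconf's ptldebug_names are the same bit,
         * which is why both tables change together in this patch. */
        printf("D_MMAP == 1<<23: %d\n", 0x00800000UL == (1UL << 23));
        return 0;
}

The sketch compiles with any C99 compiler (e.g. gcc -o extents extents.c);
it sits below the git version trailer, so it does not affect patch
application.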