#define __set_page_ll_data(page, llap) page->private = (unsigned long)llap
#define __clear_page_ll_data(page) page->private = 0
#define PageWriteback(page) 0
+#define set_page_writeback(page)
#define end_page_writeback(page)
+static inline int mapping_mapped(struct address_space *mapping)
+{
+ if (mapping->i_mmap_shared)
+ return 1;
+ if (mapping->i_mmap)
+ return 1;
+ return 0;
+}
+
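+/* zap_page_range() takes a vma on some kernels and an mm on others;
+ * ll_zap_page_range() hides the difference */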
+#ifdef ZAP_PAGE_RANGE_VMA
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma, addr, len)
+#else
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma->vm_mm, addr, len)
+#endif
+
#endif /* end of 2.4 compat macros */
#ifdef HAVE_PAGE_LIST
* list. */
#define LDLM_FL_KMS_IGNORE 0x200000
+/* Don't put a lock covering an mmapped file onto the LRU */
+#define LDLM_FL_NO_LRU 0x400000
+
/* The blocking callback is overloaded to perform two functions. These flags
* indicate which operation should be performed. */
#define LDLM_CB_BLOCKING 1
int ldlm_cli_cancel(struct lustre_handle *lockh);
int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *,
int flags, void *opaque);
+int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *,
+ int join);
/* mds/handler.c */
/* This has to be here because recursive inclusion sucks. */
__u64 lli_io_epoch;
unsigned long lli_flags;
- /* this lock protects s_d_w and p_w_ll */
+ /* this lock protects s_d_w and p_w_ll and mmap_cnt */
spinlock_t lli_lock;
int lli_send_done_writing;
struct list_head lli_pending_write_llaps;
+ atomic_t lli_mmap_cnt;
struct list_head lli_close_item;
__u32 mode, struct lustre_handle *);
int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
int flags, void *opaque);
+ int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *,
+ int join);
int (*o_san_preprw)(int cmd, struct obd_export *exp,
struct obdo *oa, int objcount,
struct obd_ioobj *obj, int niocount,
RETURN(rc);
}
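+/* Ask the OSC/LOV to put this object's unused locks onto (join=1) or take
+ * them off (join=0) the namespace LRU; returns the number of locks moved
+ * or a negative errno. */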
+static inline int obd_join_lru(struct obd_export *exp,
+ struct lov_stripe_md *ea, int join)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_OP(exp, join_lru);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, join_lru);
+
+ rc = OBP(exp->exp_obd, join_lru)(exp, ea, join);
+ RETURN(rc);
+}
static inline int obd_san_preprw(int cmd, struct obd_export *exp,
struct obdo *oa,
--- /dev/null
+Index: linux-2.6.7/mm/filemap.c
+===================================================================
+--- linux-2.6.7.orig/mm/filemap.c 2004-11-15 12:02:35.000000000 +0800
++++ linux-2.6.7/mm/filemap.c 2004-11-15 12:04:38.000000000 +0800
+@@ -1409,6 +1409,7 @@
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(filemap_populate);
+
+ static struct vm_operations_struct generic_file_vm_ops = {
+ .nopage = filemap_nopage,
+Index: linux-2.6.7/include/linux/mm.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/mm.h 2004-11-15 12:02:43.000000000 +0800
++++ linux-2.6.7/include/linux/mm.h 2004-11-15 12:04:23.000000000 +0800
+@@ -661,6 +661,8 @@
+
+ /* generic vm_area_ops exported for stackable file systems */
+ struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
++int filemap_populate(struct vm_area_struct *, unsigned long, unsigned long,
++ pgprot_t, unsigned long, int);
+
+ /* mm/page-writeback.c */
+ int write_one_page(struct page *page, int wait);
mtd-2.6-suse-lnxi.patch
perfctr-2.6-suse-lnxi.patch
kexec-2.6-suse-lnxi.patch
+export-filemap_populate.patch
grab_cache_page_nowait_gfp-2.6-suse.patch
md_path_lookup-2.6-suse.patch
ext3-super-ntohl.patch
export-show_task-2.6-vanilla.patch
+export-filemap_populate.patch
export_num_siblings.patch
ext3-nlinks-2.4.24.patch
export-show_task-2.4-vanilla.patch
+export-zap-page-range.patch
if (ldlm_bl_to_thread(ns, NULL, lock) != 0)
ldlm_handle_bl_callback(ns, NULL, lock);
} else if (ns->ns_client == LDLM_NAMESPACE_CLIENT &&
- !lock->l_readers && !lock->l_writers) {
+ !lock->l_readers && !lock->l_writers &&
+ !(lock->l_flags & LDLM_FL_NO_LRU)) {
/* If this is a client-side namespace and this was the last
* reference, put it on the LRU. */
LASSERT(list_empty(&lock->l_lru));
EXPORT_SYMBOL(ldlm_cli_enqueue);
EXPORT_SYMBOL(ldlm_cli_cancel);
EXPORT_SYMBOL(ldlm_cli_cancel_unused);
+EXPORT_SYMBOL(ldlm_cli_join_lru);
EXPORT_SYMBOL(ldlm_replay_locks);
EXPORT_SYMBOL(ldlm_resource_foreach);
EXPORT_SYMBOL(ldlm_namespace_foreach);
RETURN(ELDLM_OK);
}
+/* join/split resource locks to/from lru list */
+int ldlm_cli_join_lru(struct ldlm_namespace *ns,
+ struct ldlm_res_id *res_id, int join)
+{
+ struct ldlm_resource *res;
+ struct ldlm_lock *lock, *n;
+ int count = 0;
+ ENTRY;
+
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT);
+
+ res = ldlm_resource_get(ns, NULL, *res_id, LDLM_EXTENT, 0);
+ if (res == NULL)
+ RETURN(count);
+ LASSERT(res->lr_type == LDLM_EXTENT);
+
+ l_lock(&ns->ns_lock);
+ if (!join)
+ goto split;
+
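+ /* join: put every granted lock that has no readers or writers, is not
+ * LOCAL or CBPENDING, and is not already on the LRU back onto the
+ * namespace's unused (LRU) list */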
+ list_for_each_entry_safe (lock, n, &res->lr_granted, l_res_link) {
+ if (list_empty(&lock->l_lru) &&
+ !lock->l_readers && !lock->l_writers &&
+ !(lock->l_flags & LDLM_FL_LOCAL) &&
+ !(lock->l_flags & LDLM_FL_CBPENDING)) {
+ LASSERT(ns->ns_nr_unused >= 0);
+ list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+ ns->ns_nr_unused++;
+ lock->l_flags &= ~LDLM_FL_NO_LRU;
+ LDLM_DEBUG(lock, "join lock to lru");
+ count++;
+ }
+ }
+ goto unlock;
+split:
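+ /* split: take this resource's locks off the LRU and mark them NO_LRU
+ * so that LRU pressure cannot cancel them while the file is mmapped */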
+ list_for_each_entry_safe (lock, n, &ns->ns_unused_list, l_lru) {
+ if (lock->l_resource == res) {
+ ldlm_lock_remove_from_lru(lock);
+ lock->l_flags |= LDLM_FL_NO_LRU;
+ LDLM_DEBUG(lock, "split lock from lru");
+ count++;
+ }
+ }
+unlock:
+ l_unlock(&ns->ns_lock);
+ ldlm_resource_putref(res);
+ RETURN(count);
+}
+
/* Lock iterators. */
int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
MODULES := llite
-llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o
+llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o
ifeq ($(PATCHLEVEL),4)
llite-objs += rw24.o super.o
llite-objs += rw26.o super25.o
endif
-@INCLUDE_RULES@
\ No newline at end of file
+@INCLUDE_RULES@
obj-y += llite.o
llite-objs := llite_lib.o dcache.o super.o rw.o \
super25.o file.o dir.o symlink.o namei.o lproc_llite.o \
- rw26.o llite_nfs.o llite_close.o special.o
+ rw26.o llite_nfs.o llite_close.o special.o llite_mmap.o
CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
"count: %lu skip: %lu end: %lu%s\n", start, start % count,
count, skip, end, discard ? " (DISCARDING)" : "");
+
+ /* Walk the vmas on the inode and tear down mmapped pages that
+ * intersect with the lock being cancelled. This stops immediately if
+ * there are no mmap()ed regions of the file. It is not efficient at
+ * all and should be short-lived; eventually we will associate
+ * mmap()ed pages with the lock and be able to find them directly. */
+ for (i = start; i <= end; i += (j + skip)) {
+ j = min(count - (i % count), end - i + 1);
+ LASSERT(j > 0);
+ LASSERT(inode->i_mapping);
+ if (ll_teardown_mmaps(inode->i_mapping,
+ (__u64)i << PAGE_CACHE_SHIFT,
+ ((__u64)(i+j) << PAGE_CACHE_SHIFT) - 1) )
+ break;
+ }
/* this is the simplistic implementation of page eviction at
* cancelation. It is careful to get races with other page
LASSERT(lockh->cookie == 0);
+ /* don't put locks covering an mmapped file onto the LRU */
+ if (mapping_mapped(inode->i_mapping))
+ ast_flags |= LDLM_FL_NO_LRU;
+
/* XXX phil: can we do this? won't it screw the file size up? */
if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
(sbi->ll_flags & LL_SBI_NOLCK))
static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
loff_t *ppos)
{
- struct ll_file_data *fd = filp->private_data;
struct inode *inode = filp->f_dentry->d_inode;
struct ll_inode_info *lli = ll_i2info(inode);
struct lov_stripe_md *lsm = lli->lli_smd;
- struct lustre_handle lockh = { 0 };
- ldlm_policy_data_t policy;
+ struct ll_lock_tree tree;
+ struct ll_lock_tree_node *node;
int rc;
ssize_t retval;
__u64 kms;
if (!lsm)
RETURN(0);
-
- policy.l_extent.start = *ppos;
- policy.l_extent.end = *ppos + count - 1;
-
- rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, 0);
+
+ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+ LCK_PR);
+ tree.lt_fd = filp->private_data;
+ rc = ll_tree_lock(&tree, node, buf, count,
+ filp->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
if (rc != 0)
RETURN(rc);
retval = generic_file_read(filp, buf, count, ppos);
out:
- ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+ ll_tree_unlock(&tree);
RETURN(retval);
}
static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
loff_t *ppos)
{
- struct ll_file_data *fd = file->private_data;
struct inode *inode = file->f_dentry->d_inode;
struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
- struct lustre_handle lockh = { 0 };
- ldlm_policy_data_t policy;
+ struct ll_lock_tree tree;
+ struct ll_lock_tree_node *node;
loff_t maxbytes = ll_file_maxbytes(inode);
ssize_t retval;
int rc;
RETURN(-EBADF);
LASSERT(lsm);
-
- if (file->f_flags & O_APPEND) {
- policy.l_extent.start = 0;
- policy.l_extent.end = OBD_OBJECT_EOF;
- } else {
- policy.l_extent.start = *ppos;
- policy.l_extent.end = *ppos + count - 1;
- }
-
- rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, 0);
+
+ if (file->f_flags & O_APPEND)
+ node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW);
+ else
+ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+ LCK_PW);
+ if (IS_ERR(node))
+ RETURN(PTR_ERR(node));
+
+ tree.lt_fd = file->private_data;
+ rc = ll_tree_lock(&tree, node, buf, count,
+ file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
if (rc != 0)
RETURN(rc);
retval = generic_file_write(file, buf, count, ppos);
out:
- ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
+ ll_tree_unlock(&tree);
lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
retval > 0 ? retval : 0);
RETURN(retval);
.ioctl = ll_file_ioctl,
.open = ll_file_open,
.release = ll_file_release,
- .mmap = generic_file_mmap,
+ .mmap = ll_file_mmap,
.llseek = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
.sendfile = generic_file_sendfile,
void ll_close_thread_shutdown(struct ll_close_queue *lcq);
int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+/* llite/llite_mmap.c */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+typedef struct rb_root rb_root_t;
+typedef struct rb_node rb_node_t;
+#endif
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+ rb_root_t lt_root;
+ struct list_head lt_locked_list;
+ struct ll_file_data *lt_fd;
+};
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+ __u64 end, ldlm_mode_t mode);
+int ll_tree_lock(struct ll_lock_tree *tree,
+ struct ll_lock_tree_node *first_node,
+ const char *buf, size_t count, int ast_flags);
+int ll_tree_unlock(struct ll_lock_tree *tree);
+
+
#define LL_SBI_NOLCK 0x1
#define LL_MAX_BLKSIZE (4UL * 1024 * 1024)
devno = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid,
strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid));
+ /* s_dev is also used in lt_compare() to compare two fs */
sb->s_dev = devno;
obd = class_name2obd(osc);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/iobuf.h>
+#endif
+
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/lustre_mds.h>
+#include <linux/lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+#define VMA_DEBUG(vma, fmt, arg...) \
+ CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) " \
+ "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end, \
+ vma->vm_pgoff, vma->vm_file->f_dentry->d_inode, \
+ vma->vm_file->f_dentry->d_inode->i_ino, \
+ vma->vm_file->f_dentry->d_iname, ## arg);
+
+
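+/* One extent lock request on one object. Nodes live in an rbtree ordered
+ * by (s_dev, object id, extent) -- see lt_compare() -- so that all locks
+ * are acquired in a single global order. */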
+struct ll_lock_tree_node {
+ rb_node_t lt_node;
+ struct list_head lt_locked_item;
+ __u64 lt_oid;
+ ldlm_policy_data_t lt_policy;
+ struct lustre_handle lt_lockh;
+ ldlm_mode_t lt_mode;
+ struct inode *lt_inode;
+};
+
+__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms);
+int lt_get_mmap_locks(struct ll_lock_tree *tree,
+ unsigned long addr, size_t count);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int *type);
+#else
+
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int unused);
+#endif
+
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+ __u64 end, ldlm_mode_t mode)
+{
+ struct ll_lock_tree_node *node;
+
+ OBD_ALLOC(node, sizeof(*node));
+ if (node == NULL)
+ RETURN(ERR_PTR(-ENOMEM));
+
+ node->lt_inode = inode;
+ node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id;
+ node->lt_policy.l_extent.start = start;
+ node->lt_policy.l_extent.end = end;
+ memset(&node->lt_lockh, 0, sizeof(node->lt_lockh));
+ INIT_LIST_HEAD(&node->lt_locked_item);
+ node->lt_mode = mode;
+
+ return node;
+}
+
+int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two)
+{
+ /* order across filesystems by s_dev to avoid multiple-fs deadlocks */
+ if (one->lt_inode->i_sb->s_dev < two->lt_inode->i_sb->s_dev)
+ return -1;
+ if (one->lt_inode->i_sb->s_dev > two->lt_inode->i_sb->s_dev)
+ return 1;
+
+ if (one->lt_oid < two->lt_oid)
+ return -1;
+ if (one->lt_oid > two->lt_oid)
+ return 1;
+
+ if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start)
+ return -1;
+ if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end)
+ return 1;
+
+ return 0; /* they are the same object and overlap */
+}
+
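+/* Merge two overlapping requests: widen the extent to cover both and keep
+ * the stronger (PW) lock mode. */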
+static void lt_merge(struct ll_lock_tree_node *dst,
+ struct ll_lock_tree_node *src)
+{
+ dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start,
+ src->lt_policy.l_extent.start);
+ dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end,
+ src->lt_policy.l_extent.end);
+
+ /* XXX could be a real call to the dlm to find superset modes */
+ if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW)
+ dst->lt_mode = LCK_PW;
+}
+
+static void lt_insert(struct ll_lock_tree *tree,
+ struct ll_lock_tree_node *node)
+{
+ struct ll_lock_tree_node *walk;
+ rb_node_t **p, *parent;
+ ENTRY;
+
+restart:
+ p = &tree->lt_root.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ walk = rb_entry(parent, struct ll_lock_tree_node, lt_node);
+ switch (lt_compare(node, walk)) {
+ case -1:
+ p = &(*p)->rb_left;
+ break;
+ case 1:
+ p = &(*p)->rb_right;
+ break;
+ case 0:
+ lt_merge(node, walk);
+ rb_erase(&walk->lt_node, &tree->lt_root);
+ OBD_FREE(walk, sizeof(*walk));
+ goto restart;
+ break;
+ default:
+ LBUG();
+ break;
+ }
+ }
+ rb_link_node(&node->lt_node, parent, p);
+ rb_insert_color(&node->lt_node, &tree->lt_root);
+ EXIT;
+}
+
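+/* Return the left-most (lowest ordered) node in the tree, or NULL if the
+ * tree is empty. */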
+static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree)
+{
+ rb_node_t *rbnode;
+ struct ll_lock_tree_node *node = NULL;
+
+ for ( rbnode = tree->lt_root.rb_node; rbnode != NULL;
+ rbnode = rbnode->rb_left) {
+ if (rbnode->rb_left == NULL) {
+ node = rb_entry(rbnode, struct ll_lock_tree_node,
+ lt_node);
+ break;
+ }
+ }
+ RETURN(node);
+}
+
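+/* Release every lock on the locked list and free any nodes still left in
+ * the tree (e.g. after a failed ll_tree_lock()). */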
+int ll_tree_unlock(struct ll_lock_tree *tree)
+{
+ struct ll_lock_tree_node *node;
+ struct list_head *pos, *n;
+ struct inode *inode;
+ int rc = 0;
+ ENTRY;
+
+ list_for_each_safe(pos, n, &tree->lt_locked_list) {
+ node = list_entry(pos, struct ll_lock_tree_node,
+ lt_locked_item);
+
+ inode = node->lt_inode;
+ rc = ll_extent_unlock(tree->lt_fd, inode,
+ ll_i2info(inode)->lli_smd, node->lt_mode,
+ &node->lt_lockh);
+ if (rc != 0) {
+ /* XXX better message */
+ CERROR("couldn't unlock %d\n", rc);
+ }
+ list_del(&node->lt_locked_item);
+ OBD_FREE(node, sizeof(*node));
+ }
+
+ while ((node = lt_least_node(tree))) {
+ rb_erase(&node->lt_node, &tree->lt_root);
+ OBD_FREE(node, sizeof(*node));
+ }
+
+ RETURN(rc);
+}
+
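+/* Acquire all locks in the tree in sorted order. The vmas backing
+ * [buf, buf + count) are added to the tree first, so the locks needed by
+ * the fault path are taken together with the explicit one; on failure any
+ * locks already taken are dropped. */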
+int ll_tree_lock(struct ll_lock_tree *tree,
+ struct ll_lock_tree_node *first_node,
+ const char *buf, size_t count, int ast_flags)
+{
+ struct ll_lock_tree_node *node;
+ int rc = 0;
+ ENTRY;
+
+ tree->lt_root.rb_node = NULL;
+ INIT_LIST_HEAD(&tree->lt_locked_list);
+ if (first_node != NULL)
+ lt_insert(tree, first_node);
+
+ /* This avoids a subtle deadlock: client1 reads file1 into a buffer
+ * mmapped from file2 while, at the same time, client2 reads file2
+ * into a buffer mmapped from file1. */
+ rc = lt_get_mmap_locks(tree, (unsigned long)buf, count);
+ if (rc)
+ GOTO(out, rc);
+
+ while ((node = lt_least_node(tree))) {
+ struct inode *inode = node->lt_inode;
+ rc = ll_extent_lock(tree->lt_fd, inode,
+ ll_i2info(inode)->lli_smd, node->lt_mode,
+ &node->lt_policy, &node->lt_lockh,
+ ast_flags);
+ if (rc != 0)
+ GOTO(out, rc);
+
+ rb_erase(&node->lt_node, &tree->lt_root);
+ list_add_tail(&node->lt_locked_item, &tree->lt_locked_list);
+ }
+ RETURN(rc);
+out:
+ ll_tree_unlock(tree);
+ RETURN(rc);
+}
+
+static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma)
+{
+ /* we only want to hold PW locks if the mmap() can generate
+ * writes back to the file and that only happens in shared
+ * writable vmas */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return LCK_PW;
+ return LCK_PR;
+}
+
+static void policy_from_vma(ldlm_policy_data_t *policy,
+ struct vm_area_struct *vma, unsigned long addr,
+ size_t count)
+{
+ policy->l_extent.start = ((addr - vma->vm_start) & PAGE_CACHE_MASK) +
+ (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+ policy->l_extent.end = (policy->l_extent.start + count - 1) |
+ (PAGE_CACHE_SIZE - 1);
+}
+
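+/* Find the first shared vma backed by Lustre (vm_ops->nopage == ll_nopage)
+ * that overlaps [addr, addr + count), or NULL if there is none. */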
+static struct vm_area_struct * our_vma(unsigned long addr, size_t count)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *ret = NULL;
+ ENTRY;
+
+ spin_lock(&mm->page_table_lock);
+ for(vma = find_vma(mm, addr);
+ vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+ if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage &&
+ vma->vm_flags & VM_SHARED) {
+ ret = vma;
+ break;
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+ RETURN(ret);
+}
+
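+/* For each Lustre vma overlapped by the user buffer, add a node covering
+ * the file extent that the buffer maps to, so ll_tree_lock() takes those
+ * locks up front. */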
+int lt_get_mmap_locks(struct ll_lock_tree *tree,
+ unsigned long addr, size_t count)
+{
+ struct vm_area_struct *vma;
+ struct ll_lock_tree_node *node;
+ ldlm_policy_data_t policy;
+ struct inode *inode;
+ ENTRY;
+
+ if (count == 0)
+ RETURN(0);
+
+ /* we need to look up vmas on page aligned addresses */
+ count += addr & (PAGE_SIZE - 1);
+ addr &= PAGE_MASK;
+
+ while ((vma = our_vma(addr, count)) != NULL) {
+ LASSERT(vma->vm_file);
+
+ inode = vma->vm_file->f_dentry->d_inode;
+ policy_from_vma(&policy, vma, addr, count);
+ node = ll_node_from_inode(inode, policy.l_extent.start,
+ policy.l_extent.end,
+ mode_from_vma(vma));
+ if (IS_ERR(node)) {
+ CERROR("not enough mem for lock_tree_node!\n");
+ RETURN(-ENOMEM);
+ }
+ lt_insert(tree, node);
+
+ if (vma->vm_end - addr >= count)
+ break;
+ count -= vma->vm_end - addr;
+ addr = vma->vm_end;
+ }
+ RETURN(0);
+}
+
+/* FIXME: there is a pagefault race that goes as follows (2.4 only):
+ * 1. A user process on node A accesses a portion of a mapped file,
+ * resulting in a page fault. The pagefault handler invokes the
+ * ll_nopage function, which reads the page into memory.
+ * 2. A user process on node B writes to the same portion of the file
+ * (either via mmap or write()), which causes node A to cancel the
+ * lock and truncate the page.
+ * 3. Node A then executes the rest of do_no_page(), entering the
+ * now-invalid page into the PTEs.
+ *
+ * Making the whole of do_no_page() a hook, so that both the page cache
+ * and the page table installation are covered by the DLM lock, would
+ * eliminate this race.
+ *
+ * In 2.6, the truncate_count in struct address_space covers this race.
+ */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int *type)
+#else
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int unused)
+#endif
+{
+ struct file *filp = vma->vm_file;
+ struct ll_file_data *fd = filp->private_data;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct lustre_handle lockh = { 0 };
+ ldlm_policy_data_t policy;
+ ldlm_mode_t mode;
+ struct page *page = NULL;
+ __u64 kms, old_mtime;
+ unsigned long pgoff, size, rand_read, seq_read;
+ int rc = 0;
+ ENTRY;
+
+ if (ll_i2info(inode)->lli_smd == NULL) {
+ CERROR("No lsm on fault?\n");
+ RETURN(NULL);
+ }
+
+ /* start and end the lock on the first and last bytes in the page */
+ policy_from_vma(&policy, vma, address, PAGE_CACHE_SIZE);
+
+ CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
+ vma, inode->i_ino, policy.l_extent.start,
+ policy.l_extent.end);
+
+ mode = mode_from_vma(vma);
+ old_mtime = LTIME_S(inode->i_mtime);
+
+ rc = ll_extent_lock(fd, inode, ll_i2info(inode)->lli_smd, mode, &policy,
+ &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU);
+ if (rc != 0)
+ RETURN(NULL);
+
+ if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
+ CWARN("binary changed. inode %lu\n", inode->i_ino);
+
+ /* XXX changing the inode size without holding i_sem! there is a
+ * race with the truncate path (see ll_extent_lock) */
+ kms = lov_merge_size(ll_i2info(inode)->lli_smd, 1);
+ pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+ size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ if (pgoff >= size)
+ ll_glimpse_size(inode);
+ else
+ inode->i_size = kms;
+
+ /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+ * the kernel does not read ahead into pages not covered by the
+ * DLM lock in filemap_nopage. we do our own readahead in
+ * ll_readpage.
+ */
+ rand_read = vma->vm_flags & VM_RAND_READ;
+ seq_read = vma->vm_flags & VM_SEQ_READ;
+ vma->vm_flags &= ~ VM_SEQ_READ;
+ vma->vm_flags |= VM_RAND_READ;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ page = filemap_nopage(vma, address, type);
+#else
+ page = filemap_nopage(vma, address, unused);
+#endif
+ vma->vm_flags &= ~VM_RAND_READ;
+ vma->vm_flags |= (rand_read | seq_read);
+
+ ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+ RETURN(page);
+}
+
+/* To avoid cancelling the locks that cover an mmapped region under lock
+ * cache (LRU) pressure, we track the number of mapped vmas in lli_mmap_cnt.
+ * ll_vm_open(): when the first vma is linked, split the locks from the lru.
+ * ll_vm_close(): when the last vma is unlinked, join all of this file's
+ * locks back onto the lru.
+ *
+ * XXX for performance we don't check whether the vma and lock extents
+ * actually overlap.
+ */
+static void ll_vm_open(struct vm_area_struct * vma)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ENTRY;
+
+ LASSERT(vma->vm_file);
+
+ spin_lock(&lli->lli_lock);
+ LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0);
+
+ atomic_inc(&lli->lli_mmap_cnt);
+ if (atomic_read(&lli->lli_mmap_cnt) == 1) {
+ struct lov_stripe_md *lsm = lli->lli_smd;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int count;
+
+ spin_unlock(&lli->lli_lock);
+ count = obd_join_lru(sbi->ll_osc_exp, lsm, 0);
+ VMA_DEBUG(vma, "split %d unused locks from lru", count);
+ } else {
+ spin_unlock(&lli->lli_lock);
+ }
+}
+
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ENTRY;
+
+ LASSERT(vma->vm_file);
+
+ spin_lock(&lli->lli_lock);
+ LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0);
+
+ atomic_dec(&lli->lli_mmap_cnt);
+ if (atomic_read(&lli->lli_mmap_cnt) == 0) {
+ struct lov_stripe_md *lsm = lli->lli_smd;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int count;
+
+ spin_unlock(&lli->lli_lock);
+ count = obd_join_lru(sbi->ll_osc_exp, lsm, 1);
+ VMA_DEBUG(vma, "join %d unused locks to lru", count);
+ } else {
+ spin_unlock(&lli->lli_lock);
+ }
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+static int ll_populate(struct vm_area_struct *area, unsigned long address,
+ unsigned long len, pgprot_t prot, unsigned long pgoff,
+ int nonblock)
+{
+ int rc = 0;
+ ENTRY;
+
+ /* always pass nonblock as true to avoid page read-ahead */
+ rc = filemap_populate(area, address, len, prot, pgoff, 1);
+ RETURN(rc);
+}
+#endif
+
+/* return the user space pointer that maps to a file offset via a vma */
+static inline unsigned long file_to_user(struct vm_area_struct *vma,
+ __u64 byte)
+{
+ return vma->vm_start +
+ (byte - ((__u64)vma->vm_pgoff << PAGE_SHIFT));
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+/* [first, last] are the byte offsets affected.
+ * vm_{start, end} are user addresses of the first byte of the mapping and
+ * the next byte beyond it
+ * vm_pgoff is the page index of the first byte in the mapping */
+static void teardown_vmas(struct vm_area_struct *vma, __u64 first,
+ __u64 last)
+{
+ unsigned long address, len;
+ for (; vma ; vma = vma->vm_next_share) {
+ if (last >> PAGE_SHIFT < vma->vm_pgoff)
+ continue;
+ if (first >> PAGE_SHIFT >= (vma->vm_pgoff +
+ ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
+ continue;
+
+ /* XXX to avoid unmapping the COW pages of a running binary,
+ * don't unmap private writable mappings here, even though
+ * that leaves private mappings slightly stale.
+ *
+ * the clean way is to check the mapping of every page and
+ * only unmap the non-COW pages, just as unmap_mapping_range()
+ * does with even_cows == 0 in kernel 2.6.
+ */
+ if (!(vma->vm_flags & VM_SHARED) &&
+ (vma->vm_flags & VM_WRITE))
+ continue;
+
+ address = max((unsigned long)vma->vm_start,
+ file_to_user(vma, first));
+ len = min((unsigned long)vma->vm_end,
+ file_to_user(vma, last) + 1) - address;
+
+ VMA_DEBUG(vma, "zapping vma [first="LPU64" last="LPU64" "
+ "address=%ld len=%ld]\n", first, last, address, len);
+ LASSERT(len > 0);
+ ll_zap_page_range(vma, address, len);
+ }
+}
+#endif
+
+/* XXX put nice comment here. talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first,
+ __u64 last)
+{
+ int rc = -ENOENT;
+ ENTRY;
+
+ LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ if (mapping_mapped(mapping)) {
+ rc = 0;
+ unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
+ last - first + 1, 0);
+ }
+#else
+ spin_lock(&mapping->i_shared_lock);
+ if (mapping->i_mmap != NULL) {
+ rc = 0;
+ teardown_vmas(mapping->i_mmap, first, last);
+ }
+ if (mapping->i_mmap_shared != NULL) {
+ rc = 0;
+ teardown_vmas(mapping->i_mmap_shared, first, last);
+ }
+ spin_unlock(&mapping->i_shared_lock);
+#endif
+ RETURN(rc);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+ .nopage = ll_nopage,
+ .open = ll_vm_open,
+ .close = ll_vm_close,
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ .populate = ll_populate,
+#endif
+};
+
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ int rc;
+ ENTRY;
+
+ rc = generic_file_mmap(file, vma);
+ if (rc == 0) {
+ vma->vm_ops = &ll_file_vm_ops;
+ vma->vm_ops->open(vma);
+ /* update the inode's size and mtime */
+ rc = ll_glimpse_size(file->f_dentry->d_inode);
+ }
+
+ RETURN(rc);
+}
spin_unlock(&sbi->ll_lock);
return;
}
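+/* Common 2.4/2.6 writepage: look up (or create) the async page for this
+ * page, queue it for write -- or write it synchronously if it cannot be
+ * queued -- and mark it urgent; on error re-dirty the page so the write
+ * is retried later. */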
+int ll_writepage(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_export *exp;
+ struct ll_async_page *llap;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(!PageDirty(page));
+ LASSERT(PageLocked(page));
+
+ exp = ll_i2obdexp(inode);
+ if (exp == NULL)
+ GOTO(out, rc = -EINVAL);
+
+ llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
+ if (IS_ERR(llap))
+ GOTO(out, rc = PTR_ERR(llap));
+
+ page_cache_get(page);
+ if (llap->llap_write_queued) {
+ LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
+ rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
+ llap->llap_cookie,
+ ASYNC_READY | ASYNC_URGENT);
+ } else {
+ rc = queue_or_sync_write(exp, inode, llap,
+ PAGE_SIZE, ASYNC_READY | ASYNC_URGENT);
+ }
+ if (rc)
+ page_cache_release(page);
+out:
+ if (rc) {
+ if (!lli->lli_async_rc)
+ lli->lli_async_rc = rc;
+ /* re-dirty page on error so it retries write */
+ SetPageDirty(page);
+ ClearPageLaunder(page);
+ unlock_page(page);
+ }
+ RETURN(rc);
+}
/*
* for now we do our readpage the same on both 2.4 and 2.5. The kernel's
}
if (rc == 0) {
-#if 0
CWARN("ino %lu page %lu (%llu) not covered by "
"a lock (mmap?). check debug logs.\n",
inode->i_ino, page->index,
(long long)page->index << PAGE_CACHE_SHIFT);
-#endif
}
rc = ll_issue_page_read(exp, llap, oig, 0);
#include "llite_internal.h"
#include <linux/lustre_compat25.h>
-static int ll_writepage_24(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- struct ll_inode_info *lli = ll_i2info(inode);
- struct obd_export *exp;
- struct ll_async_page *llap;
- int rc = 0;
- ENTRY;
-
- LASSERT(!PageDirty(page));
- LASSERT(PageLocked(page));
-
- exp = ll_i2obdexp(inode);
- if (exp == NULL)
- GOTO(out, rc = -EINVAL);
-
- llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
- if (IS_ERR(llap))
- GOTO(out, rc = PTR_ERR(llap));
-
- page_cache_get(page);
- if (llap->llap_write_queued) {
- LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
- rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
- llap->llap_cookie,
- ASYNC_READY | ASYNC_URGENT);
- } else {
- llap->llap_write_queued = 1;
- rc = obd_queue_async_io(exp, lli->lli_smd, NULL,
- llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
- 0, ASYNC_READY | ASYNC_URGENT);
- if (rc == 0)
- LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
- else
- llap->llap_write_queued = 0;
- }
- if (rc)
- page_cache_release(page);
-out:
- if (rc) {
- if (!lli->lli_async_rc)
- lli->lli_async_rc = rc;
- /* re-dirty page on error so it retries write */
- SetPageDirty(page);
- ClearPageLaunder(page);
- unlock_page(page);
- }
- RETURN(rc);
-}
-
static int ll_direct_IO_24(int rw,
#ifdef HAVE_DIO_FILE
struct file *file,
struct address_space_operations ll_aops = {
.readpage = ll_readpage,
.direct_IO = ll_direct_IO_24,
- .writepage = ll_writepage_24,
+ .writepage = ll_writepage,
.prepare_write = ll_prepare_write,
.commit_write = ll_commit_write,
.removepage = ll_removepage,
static int ll_writepage_26(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- struct ll_inode_info *lli = ll_i2info(inode);
- struct obd_export *exp;
- struct ll_async_page *llap;
- int rc;
- ENTRY;
-
- LASSERT(!PageDirty(page));
- LASSERT(PageLocked(page));
-
- exp = ll_i2obdexp(inode);
- if (exp == NULL)
- GOTO(out, rc = -EINVAL);
-
- llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
- if (IS_ERR(llap))
- GOTO(out, rc = PTR_ERR(llap));
-
- page_cache_get(page);
- if (llap->llap_write_queued) {
- LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
- rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
- llap->llap_cookie,
- ASYNC_READY | ASYNC_URGENT);
- } else {
- llap->llap_write_queued = 1;
- rc = obd_queue_async_io(exp, lli->lli_smd, NULL,
- llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
- 0, ASYNC_READY | ASYNC_URGENT);
- if (rc == 0)
- LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
- else
- llap->llap_write_queued = 0;
- }
- if (rc)
- page_cache_release(page);
-out:
- if (rc) {
- if (!lli->lli_async_rc)
- lli->lli_async_rc = rc;
- /* re-dirty page on error so it retries write */
- SetPageDirty(page);
- unlock_page(page);
- } else {
- set_page_writeback(page);
- }
- RETURN(rc);
+ return ll_writepage(page);
}
/* It is safe to not check anything in invalidatepage/releasepage below
RETURN(rc);
}
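+/* Fan the join/split request out to each stripe's OST object and return
+ * the total number of locks affected. */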
+static int lov_join_lru(struct obd_export *exp,
+ struct lov_stripe_md *lsm, int join)
+{
+ struct lov_obd *lov;
+ struct lov_oinfo *loi;
+ int i, count = 0;
+ ENTRY;
+
+ ASSERT_LSM_MAGIC(lsm);
+ if (!exp || !exp->exp_obd)
+ RETURN(-ENODEV);
+
+ lov = &exp->exp_obd->u.lov;
+ for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
+ struct lov_stripe_md submd;
+ int rc = 0;
+
+ if (lov->tgts[loi->loi_ost_idx].active == 0)
+ CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+
+ submd.lsm_object_id = loi->loi_id;
+ submd.lsm_stripe_count = 0;
+ rc = obd_join_lru(lov->tgts[loi->loi_ost_idx].ltd_exp,
+ &submd, join);
+ if (rc < 0) {
+ CERROR("join lru failed. objid: "LPX64" subobj: "LPX64
+ " ostidx: %d rc: %d\n", lsm->lsm_object_id,
+ loi->loi_id, loi->loi_ost_idx, rc);
+ RETURN(rc);
+ } else {
+ count += rc;
+ }
+ }
+ RETURN(count);
+}
+
#define LOV_U64_MAX ((__u64)~0ULL)
#define LOV_SUM_MAX(tot, add) \
do { \
.o_change_cbdata = lov_change_cbdata,
.o_cancel = lov_cancel,
.o_cancel_unused = lov_cancel_unused,
+ .o_join_lru = lov_join_lru,
.o_iocontrol = lov_iocontrol,
.o_get_info = lov_get_info,
.o_set_info = lov_set_info,
LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, join_lru);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, san_preprw);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
#endif
#endif
-static void osc_set_data_with_check(struct lustre_handle *lockh, void *data)
+static void osc_set_data_with_check(struct lustre_handle *lockh, void *data,
+ int flags)
{
struct ldlm_lock *lock = ldlm_handle2lock(lockh);
}
#endif
lock->l_ast_data = data;
+ lock->l_flags |= (flags & LDLM_FL_NO_LRU);
l_unlock(&lock->l_resource->lr_namespace->ns_lock);
LDLM_LOCK_PUT(lock);
}
rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode,
lockh);
if (rc == 1) {
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
if (*flags & LDLM_FL_HAS_INTENT) {
/* I would like to be able to ASSERT here that rss <=
* kms, but I can't, for reasons which are explained in
* lock_match. I want a second opinion. */
ldlm_lock_addref(lockh, LCK_PR);
ldlm_lock_decref(lockh, LCK_PW);
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
RETURN(ELDLM_OK);
}
}
policy, mode, lockh);
if (rc) {
//if (!(*flags & LDLM_FL_TEST_LOCK))
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
RETURN(rc);
}
/* If we're trying to read, we also search for an existing PW lock. The
/* FIXME: This is not incredibly elegant, but it might
* be more elegant than adding another parameter to
* lock_match. I want a second opinion. */
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
ldlm_lock_addref(lockh, LCK_PR);
ldlm_lock_decref(lockh, LCK_PW);
}
opaque);
}
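+/* Join or split the LRU for all locks on this object's resource in the
+ * client's DLM namespace. */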
+static int osc_join_lru(struct obd_export *exp,
+ struct lov_stripe_md *lsm, int join)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+
+ return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join);
+}
+
static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
unsigned long max_age)
{
.o_change_cbdata = osc_change_cbdata,
.o_cancel = osc_cancel,
.o_cancel_unused = osc_cancel_unused,
+ .o_join_lru = osc_join_lru,
.o_iocontrol = osc_iocontrol,
.o_get_info = osc_get_info,
.o_set_info = osc_set_info,
.o_change_cbdata = osc_change_cbdata,
.o_cancel = osc_cancel,
.o_cancel_unused = osc_cancel_unused,
+ .o_join_lru = osc_join_lru,
.o_iocontrol = osc_iocontrol,
.o_import_event = osc_import_event,
.o_llog_init = osc_llog_init,
bin_PROGRAMS = mcreate munlink
endif # TESTS
+mmap_sanity_SOURCES= mmap_sanity.c
stat_SOURCES = stat.c stat_fs.h
# mkdirdeep_LDADD=-L$(top_builddir)/portals/utils -lptlctl $(LIBREADLINE)
#include <sys/socket.h>
#include <netdb.h>
#include <string.h>
+#include <sys/wait.h>
-char *dir = NULL, *node = NULL, *dir2 = NULL;
+char *dir = NULL, *dir2 = NULL;
long page_size;
char mmap_sanity[256];
static void usage(void)
{
- printf("Usage: mmap_sanity -d dir [-n node | -m dir2]\n");
+ printf("Usage: mmap_sanity -d dir [-m dir2]\n");
printf(" dir lustre mount point\n");
- printf(" node another client\n");
printf(" dir2 another mount point\n");
exit(127);
}
-#define MMAP_NOTIFY_PORT 7676
-static int mmap_notify(char *target, char *str, int delay)
-{
- unsigned short port = MMAP_NOTIFY_PORT;
- int socket_type = SOCK_DGRAM;
- struct sockaddr_in server;
- struct hostent *hp;
- int len, sockfd, rc = 0;
-
- if (target == NULL)
- return 0;
-
- sockfd = socket(AF_INET, socket_type, 0);
- if (sockfd < 0) {
- perror("socket()");
- return errno;
- }
-
- if ((hp = gethostbyname(target)) == NULL) {
- perror(target);
- rc = errno;
- goto out_close;
- }
-
- memset(&server,0,sizeof(server));
- memcpy(&(server.sin_addr), hp->h_addr, hp->h_length);
- server.sin_family = AF_INET;
- server.sin_port = htons(port);
-
- len = sizeof(server);
- if (delay)
- sleep(delay);
-
- rc = sendto(sockfd, str, strlen(str), 0,
- (struct sockaddr *)&server, len);
- if (rc < 0) {
- perror("sendto()");
- rc = errno;
- } else
- rc = 0;
-
-out_close:
- close(sockfd);
- return rc;
-}
-
-static int mmap_wait(char *str, int timeout)
-{
- unsigned short port = MMAP_NOTIFY_PORT;
- int socket_type = SOCK_DGRAM;
- struct sockaddr_in local, from;
- char host[256];
- struct hostent *hp;
- fd_set rfds;
- struct timeval tv;
- int sockfd, rc = 0;
-
- if (dir2 != NULL)
- return 0;
-
- memset(host, 0, sizeof(host));
- if (gethostname(host, sizeof(host))) {
- perror("gethostname()");
- return errno;
- }
-
- if ((hp = gethostbyname(host)) == NULL) {
- perror(host);
- return errno;
- }
-
- local.sin_family = AF_INET;
- memcpy(&(local.sin_addr), hp->h_addr, hp->h_length);
- local.sin_port = htons(port);
-
- sockfd = socket(AF_INET, socket_type, 0);
- if (sockfd < 0) {
- perror("socket()");
- return errno;
- }
-
- rc = bind(sockfd, (struct sockaddr *)&local, sizeof(local));
- if (rc < 0) {
- perror("bind()");
- rc = errno;
- goto out_close;
- }
-
- FD_ZERO(&rfds);
- FD_SET(sockfd, &rfds);
- tv.tv_sec = timeout ? timeout : 5;
- tv.tv_usec = 0;
-
- rc = select(sockfd + 1, &rfds, NULL, NULL, &tv);
- if (rc) { /* got data */
- char buffer[1024];
- int fromlen =sizeof(from);
-
- memset(buffer, 0, sizeof(buffer));
- rc = recvfrom(sockfd, buffer, sizeof(buffer), 0,
- (struct sockaddr *)&from, &fromlen);
- if (rc <= 0) {
- perror("recvfrom()");
- rc = errno;
- goto out_close;
- }
- rc = 0;
-
- if (strncmp(str, buffer, strlen(str)) != 0) {
- fprintf(stderr, "expected string mismatch!\n");
- rc = EINVAL;
- }
- } else { /* timeout */
- fprintf(stderr, "timeout!\n");
- rc = ETIME;
- }
-
-out_close:
- close(sockfd);
- return rc;
-}
-
static int remote_tst(int tc, char *mnt);
-static int mmap_run(char *host, int tc)
+static int mmap_run(int tc)
{
pid_t child;
- char nodearg[256], command[256];
int rc = 0;
child = fork();
if (dir2 != NULL) {
rc = remote_tst(tc, dir2);
} else {
- sprintf(nodearg, "-w %s", node);
- sprintf(command, "%s -d %s -n %s -c %d",
- mmap_sanity, dir, host, tc);
- rc = execlp("pdsh", "pdsh", "-S", nodearg, command, NULL);
- if (rc)
- perror("execlp()");
+ rc = EINVAL;
+ fprintf(stderr, "invalid argument!\n");
}
_exit(rc);
}
-static int mmap_initialize(char *myself, int tc)
+static int mmap_initialize(char *myself)
{
char buf[1024], *file;
int fdr, fdw, count, rc = 0;
perror("sysconf(_SC_PAGESIZE)");
return errno;
}
- if (tc)
- return 0;
/* copy myself to lustre for another client */
fdr = open(myself, O_RDONLY);
return rc;
}
-static void mmap_finalize(int tc)
+static void mmap_finalize()
{
- if (tc)
- return;
unlink(mmap_sanity);
}
/* cocurrent mmap operations on two nodes */
static int mmap_tst3(char *mnt)
{
- char *ptr, mmap_file[256], host[256];
+ char *ptr, mmap_file[256];
int region, fd, rc = 0;
region = page_size * 100;
goto out_close;
}
- if (gethostname(host, sizeof(host))) {
- perror("gethostname()");
- rc = errno;
- goto out_unmap;
- }
-
- rc = mmap_run(host, 3);
+ rc = mmap_run(3);
if (rc)
goto out_unmap;
- rc = mmap_wait("mmap done", 10);
memset(ptr, 'a', region);
-
sleep(2); /* wait for remote test finish */
out_unmap:
munmap(ptr, region);
goto out_close;
}
memset(ptr, 'b', region);
-
- rc = mmap_notify(node, "mmap done", 1);
- if (rc)
- goto out_unmap;
-
memset(ptr, 'c', region);
-out_unmap:
munmap(ptr, region);
out_close:
close(fd);
* client2 write to file_4b from mmap()ed file_4a. */
static int mmap_tst4(char *mnt)
{
- char *ptr, filea[256], fileb[256], host[256];
+ char *ptr, filea[256], fileb[256];
int region, fdr, fdw, rc = 0;
region = page_size * 100;
goto out_close;
}
- if (gethostname(host, sizeof(host))) {
- perror("gethostname()");
- rc = errno;
- goto out_unmap;
- }
-
- rc = mmap_run(host, 4);
- if (rc)
- goto out_unmap;
-
- rc = mmap_wait("mmap done", 10);
+ rc = mmap_run(4);
if (rc)
goto out_unmap;
goto out_close;
}
- rc = mmap_notify(node, "mmap done", 1);
- if (rc)
- goto out_unmap;
-
memset(ptr, '2', region);
rc = write(fdw, ptr, region);
} else
rc = 0;
-out_unmap:
munmap(ptr, region);
out_close:
if (fdr >= 0)
return rc;
}
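+/* Drop unused DLM locks by writing "clear" to every matching
+ * /proc/fs/lustre/ldlm/namespaces/.../lru_size file; the writes are done
+ * in a forked child. */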
+static int cancel_lru_locks(char *prefix)
+{
+ char cmd[256], line[1024];
+ FILE *file;
+ pid_t child;
+ int len = 1024, rc = 0;
+
+ child = fork();
+ if (child < 0)
+ return errno;
+ else if (child) {
+ int status;
+ rc = waitpid(child, &status, 0);
+ if (rc == child)
+ rc = 0;
+ return rc;
+ }
+
+ if (prefix)
+ sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/%s_*/lru_size", prefix);
+ else
+ sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/*/lru_size");
+
+ file = popen(cmd, "r");
+ if (file == NULL) {
+ perror("popen()");
+ return errno;
+ }
+
+ while (fgets(line, len, file)) {
+ FILE *f;
+
+ if (!strlen(line))
+ continue;
+ /* trim newline character */
+ *(line + strlen(line) - 1) = '\0';
+ f = fopen(line, "w");
+ if (f == NULL) {
+ perror("fopen()");
+ rc = errno;
+ break;
+ }
+ rc = fwrite("clear", strlen("clear") + 1, 1, f);
+ if (rc < 1) {
+ perror("fwrite()");
+ rc = errno;
+ fclose(f);
+ break;
+ }
+ fclose(f);
+ }
+
+ pclose(file);
+ _exit(rc);
+}
+
+/* don't deadlock when reading/writing a file to/from a buffer that is
+ * mmapped from that same file */
+static int mmap_tst5(char *mnt)
+{
+ char *ptr, mmap_file[256];
+ int region, fd, off, rc = 0;
+
+ region = page_size * 40;
+ off = page_size * 10;
+ sprintf(mmap_file, "%s/%s", mnt, "mmap_file5");
+
+ if (unlink(mmap_file) && errno != ENOENT) {
+ perror("unlink()");
+ return errno;
+ }
+
+ fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+ if (fd < 0) {
+ perror(mmap_file);
+ return errno;
+ }
+ ftruncate(fd, region);
+
+ ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if (ptr == MAP_FAILED) {
+ perror("mmap()");
+ rc = errno;
+ goto out_close;
+ }
+ memset(ptr, 'a', region);
+
+ /* cancel unused locks */
+ rc = cancel_lru_locks("OSC");
+ if (rc)
+ goto out_unmap;
+
+ /* the read/write region of the file and the mmapped buffer should overlap */
+ rc = read(fd, ptr + off, off * 2);
+ if (rc != off * 2) {
+ perror("read()");
+ rc = errno;
+ goto out_unmap;
+ }
+ rc = write(fd, ptr + off, off * 2);
+ if (rc != off * 2) {
+ perror("write()");
+ rc = errno;
+ }
+ rc = 0;
+out_unmap:
+ munmap(ptr, region);
+out_close:
+ close(fd);
+ unlink(mmap_file);
+ return rc;
+}
+
+/* mmap write to a file from client1, then mmap read from client2 */
+static int mmap_tst6(char *mnt)
+{
+ char mmap_file[256], mmap_file2[256];
+ char *ptr = NULL, *ptr2 = NULL;
+ int fd = 0, fd2 = 0, rc = 0;
+
+ sprintf(mmap_file, "%s/%s", mnt, "mmap_file6");
+ sprintf(mmap_file2, "%s/%s", dir2, "mmap_file6");
+ if (unlink(mmap_file) && errno != ENOENT) {
+ perror("unlink()");
+ return errno;
+ }
+
+ fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+ if (fd < 0) {
+ perror(mmap_file);
+ return errno;
+ }
+ ftruncate(fd, page_size);
+
+ fd2 = open(mmap_file2, O_RDWR, 0600);
+ if (fd2 < 0) {
+ perror(mmap_file2);
+ rc = errno;
+ goto out;
+ }
+
+ ptr = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if (ptr == MAP_FAILED) {
+ perror("mmap()");
+ rc = errno;
+ goto out;
+ }
+
+ ptr2 = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, 0);
+ if (ptr2 == MAP_FAILED) {
+ perror("mmap()");
+ rc = errno;
+ goto out;
+ }
+
+ rc = cancel_lru_locks("OSC");
+ if (rc)
+ goto out;
+
+ memcpy(ptr, "blah", strlen("blah"));
+ if (strncmp(ptr, ptr2, strlen("blah"))) {
+ fprintf(stderr, "client2 mmap mismatch!\n");
+ rc = EFAULT;
+ goto out;
+ }
+ memcpy(ptr2, "foo", strlen("foo"));
+ if (strncmp(ptr, ptr2, strlen("foo"))) {
+ fprintf(stderr, "client1 mmap mismatch!\n");
+ rc = EFAULT;
+ }
+out:
+ if (ptr2)
+ munmap(ptr2, page_size);
+ if (ptr)
+ munmap(ptr, page_size);
+ if (fd2 > 0)
+ close(fd2);
+ if (fd > 0)
+ close(fd);
+ unlink(mmap_file);
+ return rc;
+}
+
static int remote_tst(int tc, char *mnt)
{
int rc = 0;
case 4:
rc = remote_tst4(mnt);
break;
- case 1:
- case 2:
default:
fprintf(stderr, "wrong test case number %d\n", tc);
rc = EINVAL;
{ 3, "mmap test3: cocurrent mmap ops on two nodes", mmap_tst3, 2 },
{ 4, "mmap test4: c1 write to f1 from mmaped f2, "
"c2 write to f1 from mmaped f1", mmap_tst4, 2 },
+ { 5, "mmap test5: read/write a file to/from a buffer "
+ "mmapped from that same file", mmap_tst5, 1 },
+ { 6, "mmap test6: check mmap write/read content on two nodes",
+ mmap_tst6, 2 },
{ 0, NULL, 0, 0 }
};
{
extern char *optarg;
struct test_case *test;
- int c, rc = 0, tc = 0;
+ int c, rc = 0;
for(;;) {
- c = getopt(argc, argv, "d:n:c:m:");
+ c = getopt(argc, argv, "d:m:");
if ( c == -1 )
break;
case 'd':
dir = optarg;
break;
- case 'n':
- node = optarg;
- break;
- case 'c':
- tc = atoi(optarg);
- break;
case 'm':
dir2 = optarg;
break;
if (dir == NULL)
usage();
- if (dir2 != NULL && node != NULL)
- usage();
- if (mmap_initialize(argv[0], tc) != 0) {
+ if (mmap_initialize(argv[0]) != 0) {
fprintf(stderr, "mmap_initialize failed!\n");
return EINVAL;
}
- if (tc) {
- rc = remote_tst(tc, dir);
- goto out;
- }
-
for (test = tests; test->tc; test++) {
char *rs = "skip";
rc = 0;
- if (test->node_cnt == 1 || node != NULL || dir2 != NULL) {
+ if (test->node_cnt == 1 || dir2 != NULL) {
rc = test->test_fn(dir);
rs = rc ? "fail" : "pass";
}
if (rc)
break;
}
-out:
- mmap_finalize(tc);
+
+ mmap_finalize();
return rc;
}
}
run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======"
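+# Copy dbench and its libraries onto Lustre and run it from a chroot there,
+# so the binary and shared libraries are mmapped and executed from Lustre.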
+test_71() {
+ cp `which dbench` $DIR
+
+ [ ! -f $DIR/dbench ] && echo "dbench not installed, skip this test" && return 0
+
+ TGT=$DIR/client.txt
+ SRC=${SRC:-/usr/lib/dbench/client.txt}
+ [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+ SRC=/usr/lib/dbench/client_plain.txt
+ [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+
+ echo "copying necessary lib to $DIR"
+ if [ -d /lib64 ]; then
+ mkdir $DIR/lib64
+ cp /lib64/libc* $DIR/lib64
+ cp /lib64/ld-* $DIR/lib64
+ else
+ mkdir $DIR/lib
+ cp /lib/libc* $DIR/lib
+ cp /lib/ld-* $DIR/lib
+ fi
+
+ echo "chroot $DIR /dbench -c client.txt 2"
+ chroot $DIR /dbench -c client.txt 2
+ RC=$?
+
+ rm -f $DIR/dbench
+ rm -f $TGT
+ rm -fr $DIR/lib
+ rm -fr $DIR/lib64
+
+ return $RC
+}
+run_test 71 "Running dbench on lustre (should not segfault) ===="
+
# on the LLNL clusters, runas will still pick up root's $TMP settings,
# which will not be writable for the runas user, and then you get a CVS
# error message with a corrupt path string (CVS bug) and panic.
run_test 15 "test out-of-space with multiple writers ==========="
test_16() {
- fsx -R -W -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
+ fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
}
run_test 16 "2500 iterations of dual-mount fsx ================="
test_18() {
./mmap_sanity -d $MOUNT1 -m $MOUNT2
+}
+run_test 18 "mmap sanity check ================================="
+
+test_18() {
+ ./mmap_sanity -d $MOUNT1 -m $MOUNT2
sync; sleep 1; sync
}
#run_test 18 "mmap sanity check ================================="