#define __set_page_ll_data(page, llap) page->private = (unsigned long)llap
#define __clear_page_ll_data(page) page->private = 0
#define PageWriteback(page) 0
+#define set_page_writeback(page)
#define end_page_writeback(page)
+static inline int mapping_mapped(struct address_space *mapping)
+{
+ if (mapping->i_mmap_shared)
+ return 1;
+ if (mapping->i_mmap)
+ return 1;
+ return 0;
+}
+
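+/* zap_page_range() takes a vma on some kernels and an mm on others;
+ * ll_zap_page_range() hides the difference */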
+#ifdef ZAP_PAGE_RANGE_VMA
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma, addr, len)
+#else
+#define ll_zap_page_range(vma, addr, len) zap_page_range(vma->vm_mm, addr, len)
+#endif
+
#endif /* end of 2.4 compat macros */
#ifdef HAVE_PAGE_LIST
* list. */
#define LDLM_FL_KMS_IGNORE 0x200000
+/* Don't put a lock covering an mmapped file onto the LRU */
+#define LDLM_FL_NO_LRU 0x400000
+
/* The blocking callback is overloaded to perform two functions. These flags
* indicate which operation should be performed. */
#define LDLM_CB_BLOCKING 1
int ldlm_cli_cancel(struct lustre_handle *lockh);
int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *,
int flags, void *opaque);
+int ldlm_cli_join_lru(struct ldlm_namespace *, struct ldlm_res_id *,
+ int join);
/* mds/handler.c */
/* This has to be here because recursive inclusion sucks. */
__u64 lli_io_epoch;
unsigned long lli_flags;
- /* this lock protects s_d_w and p_w_ll */
+ /* this lock protects s_d_w and p_w_ll and mmap_cnt */
spinlock_t lli_lock;
int lli_send_done_writing;
struct list_head lli_pending_write_llaps;
+ atomic_t lli_mmap_cnt;
struct list_head lli_close_item;
__u32 mode, struct lustre_handle *);
int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
int flags, void *opaque);
+ int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *,
+ int join);
int (*o_san_preprw)(int cmd, struct obd_export *exp,
struct obdo *oa, int objcount,
struct obd_ioobj *obj, int niocount,
RETURN(rc);
}
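+/* Ask the OSC/LOV to put this object's unused locks onto (join=1) or take
+ * them off (join=0) the namespace LRU; returns the number of locks moved
+ * or a negative errno. */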
+static inline int obd_join_lru(struct obd_export *exp,
+ struct lov_stripe_md *ea, int join)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_OP(exp, join_lru);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, join_lru);
+
+ rc = OBP(exp->exp_obd, join_lru)(exp, ea, join);
+ RETURN(rc);
+}
static inline int obd_san_preprw(int cmd, struct obd_export *exp,
struct obdo *oa,
--- /dev/null
+Index: linux-2.6.7/mm/filemap.c
+===================================================================
+--- linux-2.6.7.orig/mm/filemap.c 2004-11-15 12:02:35.000000000 +0800
++++ linux-2.6.7/mm/filemap.c 2004-11-15 12:04:38.000000000 +0800
+@@ -1409,6 +1409,7 @@
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(filemap_populate);
+
+ static struct vm_operations_struct generic_file_vm_ops = {
+ .nopage = filemap_nopage,
+Index: linux-2.6.7/include/linux/mm.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/mm.h 2004-11-15 12:02:43.000000000 +0800
++++ linux-2.6.7/include/linux/mm.h 2004-11-15 12:04:23.000000000 +0800
+@@ -661,6 +661,8 @@
+
+ /* generic vm_area_ops exported for stackable file systems */
+ struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
++int filemap_populate(struct vm_area_struct *, unsigned long, unsigned long,
++ pgprot_t, unsigned long, int);
+
+ /* mm/page-writeback.c */
+ int write_one_page(struct page *page, int wait);
mtd-2.6-suse-lnxi.patch
perfctr-2.6-suse-lnxi.patch
kexec-2.6-suse-lnxi.patch
+export-filemap_populate.patch
grab_cache_page_nowait_gfp-2.6-suse.patch
md_path_lookup-2.6-suse.patch
ext3-super-ntohl.patch
export-show_task-2.6-vanilla.patch
+export-filemap_populate.patch
export_num_siblings.patch
ext3-nlinks-2.4.24.patch
export-show_task-2.4-vanilla.patch
+export-zap-page-range.patch
if (ldlm_bl_to_thread(ns, NULL, lock) != 0)
ldlm_handle_bl_callback(ns, NULL, lock);
} else if (ns->ns_client == LDLM_NAMESPACE_CLIENT &&
- !lock->l_readers && !lock->l_writers) {
+ !lock->l_readers && !lock->l_writers &&
+ !(lock->l_flags & LDLM_FL_NO_LRU)) {
/* If this is a client-side namespace and this was the last
* reference, put it on the LRU. */
LASSERT(list_empty(&lock->l_lru));
EXPORT_SYMBOL(ldlm_cli_enqueue);
EXPORT_SYMBOL(ldlm_cli_cancel);
EXPORT_SYMBOL(ldlm_cli_cancel_unused);
+EXPORT_SYMBOL(ldlm_cli_join_lru);
EXPORT_SYMBOL(ldlm_replay_locks);
EXPORT_SYMBOL(ldlm_resource_foreach);
EXPORT_SYMBOL(ldlm_namespace_foreach);
RETURN(ELDLM_OK);
}
+/* join/split resource locks to/from lru list */
+int ldlm_cli_join_lru(struct ldlm_namespace *ns,
+ struct ldlm_res_id *res_id, int join)
+{
+ struct ldlm_resource *res;
+ struct ldlm_lock *lock, *n;
+ int count = 0;
+ ENTRY;
+
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT);
+
+ res = ldlm_resource_get(ns, NULL, *res_id, LDLM_EXTENT, 0);
+ if (res == NULL)
+ RETURN(count);
+ LASSERT(res->lr_type == LDLM_EXTENT);
+
+ l_lock(&ns->ns_lock);
+ if (!join)
+ goto split;
+
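+ /* join: put every granted lock that has no readers or writers, is not
+ * LOCAL or CBPENDING, and is not already on the LRU back onto the
+ * namespace's unused (LRU) list */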
+ list_for_each_entry_safe (lock, n, &res->lr_granted, l_res_link) {
+ if (list_empty(&lock->l_lru) &&
+ !lock->l_readers && !lock->l_writers &&
+ !(lock->l_flags & LDLM_FL_LOCAL) &&
+ !(lock->l_flags & LDLM_FL_CBPENDING)) {
+ LASSERT(ns->ns_nr_unused >= 0);
+ list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+ ns->ns_nr_unused++;
+ lock->l_flags &= ~LDLM_FL_NO_LRU;
+ LDLM_DEBUG(lock, "join lock to lru");
+ count++;
+ }
+ }
+ goto unlock;
+split:
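+ /* split: take this resource's locks off the LRU and mark them NO_LRU
+ * so that LRU pressure cannot cancel them while the file is mmapped */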
+ list_for_each_entry_safe (lock, n, &ns->ns_unused_list, l_lru) {
+ if (lock->l_resource == res) {
+ ldlm_lock_remove_from_lru(lock);
+ lock->l_flags |= LDLM_FL_NO_LRU;
+ LDLM_DEBUG(lock, "split lock from lru");
+ count++;
+ }
+ }
+unlock:
+ l_unlock(&ns->ns_lock);
+ ldlm_resource_putref(res);
+ RETURN(count);
+}
+
/* Lock iterators. */
int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
MODULES := llite
-llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o
+llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o
ifeq ($(PATCHLEVEL),4)
llite-objs += rw24.o super.o
llite-objs += rw26.o super25.o
endif
-@INCLUDE_RULES@
\ No newline at end of file
+@INCLUDE_RULES@
obj-y += llite.o
llite-objs := llite_lib.o dcache.o super.o rw.o \
super25.o file.o dir.o symlink.o namei.o lproc_llite.o \
- rw26.o llite_nfs.o llite_close.o special.o
+ rw26.o llite_nfs.o llite_close.o special.o llite_mmap.o
CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
"count: %lu skip: %lu end: %lu%s\n", start, start % count,
count, skip, end, discard ? " (DISCARDING)" : "");
+
+ /* Walk the vmas on the inode and tear down mmapped pages that
+ * intersect with the lock being cancelled. This stops immediately if
+ * there are no mmap()ed regions of the file. It is not efficient at
+ * all and should be short-lived; eventually we will associate
+ * mmap()ed pages with the lock and be able to find them directly. */
+ for (i = start; i <= end; i += (j + skip)) {
+ j = min(count - (i % count), end - i + 1);
+ LASSERT(j > 0);
+ LASSERT(inode->i_mapping);
+ if (ll_teardown_mmaps(inode->i_mapping,
+ (__u64)i << PAGE_CACHE_SHIFT,
+ ((__u64)(i+j) << PAGE_CACHE_SHIFT) - 1) )
+ break;
+ }
/* this is the simplistic implementation of page eviction at
* cancelation. It is careful to get races with other page
LASSERT(lockh->cookie == 0);
+ /* don't put locks covering an mmapped file onto the LRU */
+ if (mapping_mapped(inode->i_mapping))
+ ast_flags |= LDLM_FL_NO_LRU;
+
/* XXX phil: can we do this? won't it screw the file size up? */
if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
(sbi->ll_flags & LL_SBI_NOLCK))
static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
loff_t *ppos)
{
- struct ll_file_data *fd = filp->private_data;
struct inode *inode = filp->f_dentry->d_inode;
struct ll_inode_info *lli = ll_i2info(inode);
struct lov_stripe_md *lsm = lli->lli_smd;
- struct lustre_handle lockh = { 0 };
- ldlm_policy_data_t policy;
+ struct ll_lock_tree tree;
+ struct ll_lock_tree_node *node;
int rc;
ssize_t retval;
__u64 kms;
if (!lsm)
RETURN(0);
-
- policy.l_extent.start = *ppos;
- policy.l_extent.end = *ppos + count - 1;
-
- rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &policy, &lockh, 0);
+
+ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+ LCK_PR);
+ tree.lt_fd = filp->private_data;
+ rc = ll_tree_lock(&tree, node, buf, count,
+ filp->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
if (rc != 0)
RETURN(rc);
retval = generic_file_read(filp, buf, count, ppos);
out:
- ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+ ll_tree_unlock(&tree);
RETURN(retval);
}
static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
loff_t *ppos)
{
- struct ll_file_data *fd = file->private_data;
struct inode *inode = file->f_dentry->d_inode;
struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
- struct lustre_handle lockh = { 0 };
- ldlm_policy_data_t policy;
+ struct ll_lock_tree tree;
+ struct ll_lock_tree_node *node;
loff_t maxbytes = ll_file_maxbytes(inode);
ssize_t retval;
int rc;
RETURN(-EBADF);
LASSERT(lsm);
-
- if (file->f_flags & O_APPEND) {
- policy.l_extent.start = 0;
- policy.l_extent.end = OBD_OBJECT_EOF;
- } else {
- policy.l_extent.start = *ppos;
- policy.l_extent.end = *ppos + count - 1;
- }
-
- rc = ll_extent_lock(fd, inode, lsm, LCK_PW, &policy, &lockh, 0);
+
+ if (file->f_flags & O_APPEND)
+ node = ll_node_from_inode(inode, 0, OBD_OBJECT_EOF, LCK_PW);
+ else
+ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1,
+ LCK_PW);
+ if (IS_ERR(node))
+ RETURN(PTR_ERR(node));
+
+ tree.lt_fd = file->private_data;
+ rc = ll_tree_lock(&tree, node, buf, count,
+ file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
if (rc != 0)
RETURN(rc);
retval = generic_file_write(file, buf, count, ppos);
out:
- ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
+ ll_tree_unlock(&tree);
lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
retval > 0 ? retval : 0);
RETURN(retval);
.ioctl = ll_file_ioctl,
.open = ll_file_open,
.release = ll_file_release,
- .mmap = generic_file_mmap,
+ .mmap = ll_file_mmap,
.llseek = ll_file_seek,
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
.sendfile = generic_file_sendfile,
void ll_close_thread_shutdown(struct ll_close_queue *lcq);
int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+/* llite/llite_mmap.c */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+typedef struct rb_root rb_root_t;
+typedef struct rb_node rb_node_t;
+#endif
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+ rb_root_t lt_root;
+ struct list_head lt_locked_list;
+ struct ll_file_data *lt_fd;
+};
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+ __u64 end, ldlm_mode_t mode);
+int ll_tree_lock(struct ll_lock_tree *tree,
+ struct ll_lock_tree_node *first_node,
+ const char *buf, size_t count, int ast_flags);
+int ll_tree_unlock(struct ll_lock_tree *tree);
+
+
#define LL_SBI_NOLCK 0x1
#define LL_MAX_BLKSIZE (4UL * 1024 * 1024)
devno = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid,
strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid));
+ /* s_dev is also used in lt_compare() to compare two fs */
sb->s_dev = devno;
obd = class_name2obd(osc);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/iobuf.h>
+#endif
+
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/lustre_mds.h>
+#include <linux/lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+#define VMA_DEBUG(vma, fmt, arg...) \
+ CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) " \
+ "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end, \
+ vma->vm_pgoff, vma->vm_file->f_dentry->d_inode, \
+ vma->vm_file->f_dentry->d_inode->i_ino, \
+ vma->vm_file->f_dentry->d_iname, ## arg);
+
+
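+/* One extent lock request on one object. Nodes live in an rbtree ordered
+ * by (s_dev, object id, extent) -- see lt_compare() -- so that all locks
+ * are acquired in a single global order. */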
+struct ll_lock_tree_node {
+ rb_node_t lt_node;
+ struct list_head lt_locked_item;
+ __u64 lt_oid;
+ ldlm_policy_data_t lt_policy;
+ struct lustre_handle lt_lockh;
+ ldlm_mode_t lt_mode;
+ struct inode *lt_inode;
+};
+
+__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms);
+int lt_get_mmap_locks(struct ll_lock_tree *tree,
+ unsigned long addr, size_t count);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int *type);
+#else
+
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int unused);
+#endif
+
+struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+ __u64 end, ldlm_mode_t mode)
+{
+ struct ll_lock_tree_node *node;
+
+ OBD_ALLOC(node, sizeof(*node));
+ if (node == NULL)
+ RETURN(ERR_PTR(-ENOMEM));
+
+ node->lt_inode = inode;
+ node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id;
+ node->lt_policy.l_extent.start = start;
+ node->lt_policy.l_extent.end = end;
+ memset(&node->lt_lockh, 0, sizeof(node->lt_lockh));
+ INIT_LIST_HEAD(&node->lt_locked_item);
+ node->lt_mode = mode;
+
+ return node;
+}
+
+int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two)
+{
+ /* order across filesystems by s_dev to avoid multiple-fs deadlocks */
+ if (one->lt_inode->i_sb->s_dev < two->lt_inode->i_sb->s_dev)
+ return -1;
+ if (one->lt_inode->i_sb->s_dev > two->lt_inode->i_sb->s_dev)
+ return 1;
+
+ if (one->lt_oid < two->lt_oid)
+ return -1;
+ if (one->lt_oid > two->lt_oid)
+ return 1;
+
+ if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start)
+ return -1;
+ if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end)
+ return 1;
+
+ return 0; /* they are the same object and overlap */
+}
+
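+/* Merge two overlapping requests: widen the extent to cover both and keep
+ * the stronger (PW) lock mode. */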
+static void lt_merge(struct ll_lock_tree_node *dst,
+ struct ll_lock_tree_node *src)
+{
+ dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start,
+ src->lt_policy.l_extent.start);
+ dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end,
+ src->lt_policy.l_extent.end);
+
+ /* XXX could be a real call to the dlm to find superset modes */
+ if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW)
+ dst->lt_mode = LCK_PW;
+}
+
+static void lt_insert(struct ll_lock_tree *tree,
+ struct ll_lock_tree_node *node)
+{
+ struct ll_lock_tree_node *walk;
+ rb_node_t **p, *parent;
+ ENTRY;
+
+restart:
+ p = &tree->lt_root.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ walk = rb_entry(parent, struct ll_lock_tree_node, lt_node);
+ switch (lt_compare(node, walk)) {
+ case -1:
+ p = &(*p)->rb_left;
+ break;
+ case 1:
+ p = &(*p)->rb_right;
+ break;
+ case 0:
+ lt_merge(node, walk);
+ rb_erase(&walk->lt_node, &tree->lt_root);
+ OBD_FREE(walk, sizeof(*walk));
+ goto restart;
+ break;
+ default:
+ LBUG();
+ break;
+ }
+ }
+ rb_link_node(&node->lt_node, parent, p);
+ rb_insert_color(&node->lt_node, &tree->lt_root);
+ EXIT;
+}
+
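+/* Return the left-most (lowest ordered) node in the tree, or NULL if the
+ * tree is empty. */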
+static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree)
+{
+ rb_node_t *rbnode;
+ struct ll_lock_tree_node *node = NULL;
+
+ for ( rbnode = tree->lt_root.rb_node; rbnode != NULL;
+ rbnode = rbnode->rb_left) {
+ if (rbnode->rb_left == NULL) {
+ node = rb_entry(rbnode, struct ll_lock_tree_node,
+ lt_node);
+ break;
+ }
+ }
+ RETURN(node);
+}
+
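+/* Release every lock on the locked list and free any nodes still left in
+ * the tree (e.g. after a failed ll_tree_lock()). */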
+int ll_tree_unlock(struct ll_lock_tree *tree)
+{
+ struct ll_lock_tree_node *node;
+ struct list_head *pos, *n;
+ struct inode *inode;
+ int rc = 0;
+ ENTRY;
+
+ list_for_each_safe(pos, n, &tree->lt_locked_list) {
+ node = list_entry(pos, struct ll_lock_tree_node,
+ lt_locked_item);
+
+ inode = node->lt_inode;
+ rc = ll_extent_unlock(tree->lt_fd, inode,
+ ll_i2info(inode)->lli_smd, node->lt_mode,
+ &node->lt_lockh);
+ if (rc != 0) {
+ /* XXX better message */
+ CERROR("couldn't unlock %d\n", rc);
+ }
+ list_del(&node->lt_locked_item);
+ OBD_FREE(node, sizeof(*node));
+ }
+
+ while ((node = lt_least_node(tree))) {
+ rb_erase(&node->lt_node, &tree->lt_root);
+ OBD_FREE(node, sizeof(*node));
+ }
+
+ RETURN(rc);
+}
+
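+/* Acquire all locks in the tree in sorted order. The vmas backing
+ * [buf, buf + count) are added to the tree first, so the locks needed by
+ * the fault path are taken together with the explicit one; on failure any
+ * locks already taken are dropped. */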
+int ll_tree_lock(struct ll_lock_tree *tree,
+ struct ll_lock_tree_node *first_node,
+ const char *buf, size_t count, int ast_flags)
+{
+ struct ll_lock_tree_node *node;
+ int rc = 0;
+ ENTRY;
+
+ tree->lt_root.rb_node = NULL;
+ INIT_LIST_HEAD(&tree->lt_locked_list);
+ if (first_node != NULL)
+ lt_insert(tree, first_node);
+
+ /* This avoids a subtle deadlock: client1 reads file1 into a buffer
+ * mmapped from file2 while, at the same time, client2 reads file2
+ * into a buffer mmapped from file1. */
+ rc = lt_get_mmap_locks(tree, (unsigned long)buf, count);
+ if (rc)
+ GOTO(out, rc);
+
+ while ((node = lt_least_node(tree))) {
+ struct inode *inode = node->lt_inode;
+ rc = ll_extent_lock(tree->lt_fd, inode,
+ ll_i2info(inode)->lli_smd, node->lt_mode,
+ &node->lt_policy, &node->lt_lockh,
+ ast_flags);
+ if (rc != 0)
+ GOTO(out, rc);
+
+ rb_erase(&node->lt_node, &tree->lt_root);
+ list_add_tail(&node->lt_locked_item, &tree->lt_locked_list);
+ }
+ RETURN(rc);
+out:
+ ll_tree_unlock(tree);
+ RETURN(rc);
+}
+
+static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma)
+{
+ /* we only want to hold PW locks if the mmap() can generate
+ * writes back to the file and that only happens in shared
+ * writable vmas */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return LCK_PW;
+ return LCK_PR;
+}
+
+static void policy_from_vma(ldlm_policy_data_t *policy,
+ struct vm_area_struct *vma, unsigned long addr,
+ size_t count)
+{
+ policy->l_extent.start = ((addr - vma->vm_start) & PAGE_CACHE_MASK) +
+ (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+ policy->l_extent.end = (policy->l_extent.start + count - 1) |
+ (PAGE_CACHE_SIZE - 1);
+}
+
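+/* Find the first shared vma backed by Lustre (vm_ops->nopage == ll_nopage)
+ * that overlaps [addr, addr + count), or NULL if there is none. */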
+static struct vm_area_struct * our_vma(unsigned long addr, size_t count)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *ret = NULL;
+ ENTRY;
+
+ spin_lock(&mm->page_table_lock);
+ for(vma = find_vma(mm, addr);
+ vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+ if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage &&
+ vma->vm_flags & VM_SHARED) {
+ ret = vma;
+ break;
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+ RETURN(ret);
+}
+
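+/* For each Lustre vma overlapped by the user buffer, add a node covering
+ * the file extent that the buffer maps to, so ll_tree_lock() takes those
+ * locks up front. */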
+int lt_get_mmap_locks(struct ll_lock_tree *tree,
+ unsigned long addr, size_t count)
+{
+ struct vm_area_struct *vma;
+ struct ll_lock_tree_node *node;
+ ldlm_policy_data_t policy;
+ struct inode *inode;
+ ENTRY;
+
+ if (count == 0)
+ RETURN(0);
+
+ /* we need to look up vmas on page aligned addresses */
+ count += addr & (PAGE_SIZE - 1);
+ addr &= PAGE_MASK;
+
+ while ((vma = our_vma(addr, count)) != NULL) {
+ LASSERT(vma->vm_file);
+
+ inode = vma->vm_file->f_dentry->d_inode;
+ policy_from_vma(&policy, vma, addr, count);
+ node = ll_node_from_inode(inode, policy.l_extent.start,
+ policy.l_extent.end,
+ mode_from_vma(vma));
+ if (IS_ERR(node)) {
+ CERROR("not enough mem for lock_tree_node!\n");
+ RETURN(-ENOMEM);
+ }
+ lt_insert(tree, node);
+
+ if (vma->vm_end - addr >= count)
+ break;
+ count -= vma->vm_end - addr;
+ addr = vma->vm_end;
+ }
+ RETURN(0);
+}
+
+/* FIXME: there is a pagefault race that goes as follows (2.4 only):
+ * 1. A user process on node A accesses a portion of a mapped file,
+ * resulting in a page fault. The pagefault handler invokes the
+ * ll_nopage function, which reads the page into memory.
+ * 2. A user process on node B writes to the same portion of the file
+ * (either via mmap or write()), which causes node A to cancel the
+ * lock and truncate the page.
+ * 3. Node A then executes the rest of do_no_page(), entering the
+ * now-invalid page into the PTEs.
+ *
+ * Making the whole of do_no_page() a hook, so that both the page cache
+ * and the page table installation are covered by the DLM lock, would
+ * eliminate this race.
+ *
+ * In 2.6, the truncate_count in struct address_space covers this race.
+ */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int *type)
+#else
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+ int unused)
+#endif
+{
+ struct file *filp = vma->vm_file;
+ struct ll_file_data *fd = filp->private_data;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct lustre_handle lockh = { 0 };
+ ldlm_policy_data_t policy;
+ ldlm_mode_t mode;
+ struct page *page = NULL;
+ __u64 kms, old_mtime;
+ unsigned long pgoff, size, rand_read, seq_read;
+ int rc = 0;
+ ENTRY;
+
+ if (ll_i2info(inode)->lli_smd == NULL) {
+ CERROR("No lsm on fault?\n");
+ RETURN(NULL);
+ }
+
+ /* start and end the lock on the first and last bytes in the page */
+ policy_from_vma(&policy, vma, address, PAGE_CACHE_SIZE);
+
+ CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
+ vma, inode->i_ino, policy.l_extent.start,
+ policy.l_extent.end);
+
+ mode = mode_from_vma(vma);
+ old_mtime = LTIME_S(inode->i_mtime);
+
+ rc = ll_extent_lock(fd, inode, ll_i2info(inode)->lli_smd, mode, &policy,
+ &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU);
+ if (rc != 0)
+ RETURN(NULL);
+
+ if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
+ CWARN("binary changed. inode %lu\n", inode->i_ino);
+
+ /* XXX changing the inode size without holding i_sem! there is a
+ * race with the truncate path (see ll_extent_lock) */
+ kms = lov_merge_size(ll_i2info(inode)->lli_smd, 1);
+ pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+ size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ if (pgoff >= size)
+ ll_glimpse_size(inode);
+ else
+ inode->i_size = kms;
+
+ /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+ * the kernel does not read ahead into pages not covered by the
+ * DLM lock in filemap_nopage. we do our own readahead in
+ * ll_readpage.
+ */
+ rand_read = vma->vm_flags & VM_RAND_READ;
+ seq_read = vma->vm_flags & VM_SEQ_READ;
+ vma->vm_flags &= ~ VM_SEQ_READ;
+ vma->vm_flags |= VM_RAND_READ;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ page = filemap_nopage(vma, address, type);
+#else
+ page = filemap_nopage(vma, address, unused);
+#endif
+ vma->vm_flags &= ~VM_RAND_READ;
+ vma->vm_flags |= (rand_read | seq_read);
+
+ ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+ RETURN(page);
+}
+
+/* To avoid cancelling the locks that cover an mmapped region under lock
+ * cache (LRU) pressure, we track the number of mapped vmas in lli_mmap_cnt.
+ * ll_vm_open(): when the first vma is linked, split the locks from the lru.
+ * ll_vm_close(): when the last vma is unlinked, join all of this file's
+ * locks back onto the lru.
+ *
+ * XXX for performance we don't check whether the vma and lock extents
+ * actually overlap.
+ */
+static void ll_vm_open(struct vm_area_struct * vma)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ENTRY;
+
+ LASSERT(vma->vm_file);
+
+ spin_lock(&lli->lli_lock);
+ LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0);
+
+ atomic_inc(&lli->lli_mmap_cnt);
+ if (atomic_read(&lli->lli_mmap_cnt) == 1) {
+ struct lov_stripe_md *lsm = lli->lli_smd;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int count;
+
+ spin_unlock(&lli->lli_lock);
+ count = obd_join_lru(sbi->ll_osc_exp, lsm, 0);
+ VMA_DEBUG(vma, "split %d unused locks from lru", count);
+ } else {
+ spin_unlock(&lli->lli_lock);
+ }
+}
+
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ENTRY;
+
+ LASSERT(vma->vm_file);
+
+ spin_lock(&lli->lli_lock);
+ LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0);
+
+ atomic_dec(&lli->lli_mmap_cnt);
+ if (atomic_read(&lli->lli_mmap_cnt) == 0) {
+ struct lov_stripe_md *lsm = lli->lli_smd;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int count;
+
+ spin_unlock(&lli->lli_lock);
+ count = obd_join_lru(sbi->ll_osc_exp, lsm, 1);
+ VMA_DEBUG(vma, "join %d unused locks to lru", count);
+ } else {
+ spin_unlock(&lli->lli_lock);
+ }
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+static int ll_populate(struct vm_area_struct *area, unsigned long address,
+ unsigned long len, pgprot_t prot, unsigned long pgoff,
+ int nonblock)
+{
+ int rc = 0;
+ ENTRY;
+
+ /* always pass nonblock as true to avoid page read-ahead */
+ rc = filemap_populate(area, address, len, prot, pgoff, 1);
+ RETURN(rc);
+}
+#endif
+
+/* return the user space pointer that maps to a file offset via a vma */
+static inline unsigned long file_to_user(struct vm_area_struct *vma,
+ __u64 byte)
+{
+ return vma->vm_start +
+ (byte - ((__u64)vma->vm_pgoff << PAGE_SHIFT));
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+/* [first, last] are the byte offsets affected.
+ * vm_{start, end} are user addresses of the first byte of the mapping and
+ * the next byte beyond it
+ * vm_pgoff is the page index of the first byte in the mapping */
+static void teardown_vmas(struct vm_area_struct *vma, __u64 first,
+ __u64 last)
+{
+ unsigned long address, len;
+ for (; vma ; vma = vma->vm_next_share) {
+ if (last >> PAGE_SHIFT < vma->vm_pgoff)
+ continue;
+ if (first >> PAGE_SHIFT >= (vma->vm_pgoff +
+ ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
+ continue;
+
+ /* XXX to avoid unmapping the COW pages of a running binary,
+ * don't unmap private writable mappings here, even though
+ * that leaves private mappings slightly stale.
+ *
+ * the clean way is to check the mapping of every page and
+ * only unmap the non-COW pages, just as unmap_mapping_range()
+ * does with even_cows == 0 in kernel 2.6.
+ */
+ if (!(vma->vm_flags & VM_SHARED) &&
+ (vma->vm_flags & VM_WRITE))
+ continue;
+
+ address = max((unsigned long)vma->vm_start,
+ file_to_user(vma, first));
+ len = min((unsigned long)vma->vm_end,
+ file_to_user(vma, last) + 1) - address;
+
+ VMA_DEBUG(vma, "zapping vma [first="LPU64" last="LPU64" "
+ "address=%ld len=%ld]\n", first, last, address, len);
+ LASSERT(len > 0);
+ ll_zap_page_range(vma, address, len);
+ }
+}
+#endif
+
+/* XXX put nice comment here. talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first,
+ __u64 last)
+{
+ int rc = -ENOENT;
+ ENTRY;
+
+ LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ if (mapping_mapped(mapping)) {
+ rc = 0;
+ unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
+ last - first + 1, 0);
+ }
+#else
+ spin_lock(&mapping->i_shared_lock);
+ if (mapping->i_mmap != NULL) {
+ rc = 0;
+ teardown_vmas(mapping->i_mmap, first, last);
+ }
+ if (mapping->i_mmap_shared != NULL) {
+ rc = 0;
+ teardown_vmas(mapping->i_mmap_shared, first, last);
+ }
+ spin_unlock(&mapping->i_shared_lock);
+#endif
+ RETURN(rc);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+ .nopage = ll_nopage,
+ .open = ll_vm_open,
+ .close = ll_vm_close,
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ .populate = ll_populate,
+#endif
+};
+
+int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ int rc;
+ ENTRY;
+
+ rc = generic_file_mmap(file, vma);
+ if (rc == 0) {
+ vma->vm_ops = &ll_file_vm_ops;
+ vma->vm_ops->open(vma);
+ /* update the inode's size and mtime */
+ rc = ll_glimpse_size(file->f_dentry->d_inode);
+ }
+
+ RETURN(rc);
+}
spin_unlock(&sbi->ll_lock);
return;
}
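+/* Common 2.4/2.6 writepage: look up (or create) the async page for this
+ * page, queue it for write -- or write it synchronously if it cannot be
+ * queued -- and mark it urgent; on error re-dirty the page so the write
+ * is retried later. */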
+int ll_writepage(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_export *exp;
+ struct ll_async_page *llap;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(!PageDirty(page));
+ LASSERT(PageLocked(page));
+
+ exp = ll_i2obdexp(inode);
+ if (exp == NULL)
+ GOTO(out, rc = -EINVAL);
+
+ llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
+ if (IS_ERR(llap))
+ GOTO(out, rc = PTR_ERR(llap));
+
+ page_cache_get(page);
+ if (llap->llap_write_queued) {
+ LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
+ rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
+ llap->llap_cookie,
+ ASYNC_READY | ASYNC_URGENT);
+ } else {
+ rc = queue_or_sync_write(exp, inode, llap,
+ PAGE_SIZE, ASYNC_READY | ASYNC_URGENT);
+ }
+ if (rc)
+ page_cache_release(page);
+out:
+ if (rc) {
+ if (!lli->lli_async_rc)
+ lli->lli_async_rc = rc;
+ /* re-dirty page on error so it retries write */
+ SetPageDirty(page);
+ ClearPageLaunder(page);
+ unlock_page(page);
+ }
+ RETURN(rc);
+}
/*
* for now we do our readpage the same on both 2.4 and 2.5. The kernel's
}
if (rc == 0) {
-#if 0
CWARN("ino %lu page %lu (%llu) not covered by "
"a lock (mmap?). check debug logs.\n",
inode->i_ino, page->index,
(long long)page->index << PAGE_CACHE_SHIFT);
-#endif
}
rc = ll_issue_page_read(exp, llap, oig, 0);
#include "llite_internal.h"
#include <linux/lustre_compat25.h>
-static int ll_writepage_24(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- struct ll_inode_info *lli = ll_i2info(inode);
- struct obd_export *exp;
- struct ll_async_page *llap;
- int rc = 0;
- ENTRY;
-
- LASSERT(!PageDirty(page));
- LASSERT(PageLocked(page));
-
- exp = ll_i2obdexp(inode);
- if (exp == NULL)
- GOTO(out, rc = -EINVAL);
-
- llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
- if (IS_ERR(llap))
- GOTO(out, rc = PTR_ERR(llap));
-
- page_cache_get(page);
- if (llap->llap_write_queued) {
- LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
- rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
- llap->llap_cookie,
- ASYNC_READY | ASYNC_URGENT);
- } else {
- llap->llap_write_queued = 1;
- rc = obd_queue_async_io(exp, lli->lli_smd, NULL,
- llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
- 0, ASYNC_READY | ASYNC_URGENT);
- if (rc == 0)
- LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
- else
- llap->llap_write_queued = 0;
- }
- if (rc)
- page_cache_release(page);
-out:
- if (rc) {
- if (!lli->lli_async_rc)
- lli->lli_async_rc = rc;
- /* re-dirty page on error so it retries write */
- SetPageDirty(page);
- ClearPageLaunder(page);
- unlock_page(page);
- }
- RETURN(rc);
-}
-
static int ll_direct_IO_24(int rw,
#ifdef HAVE_DIO_FILE
struct file *file,
struct address_space_operations ll_aops = {
.readpage = ll_readpage,
.direct_IO = ll_direct_IO_24,
- .writepage = ll_writepage_24,
+ .writepage = ll_writepage,
.prepare_write = ll_prepare_write,
.commit_write = ll_commit_write,
.removepage = ll_removepage,
static int ll_writepage_26(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- struct ll_inode_info *lli = ll_i2info(inode);
- struct obd_export *exp;
- struct ll_async_page *llap;
- int rc;
- ENTRY;
-
- LASSERT(!PageDirty(page));
- LASSERT(PageLocked(page));
-
- exp = ll_i2obdexp(inode);
- if (exp == NULL)
- GOTO(out, rc = -EINVAL);
-
- llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
- if (IS_ERR(llap))
- GOTO(out, rc = PTR_ERR(llap));
-
- page_cache_get(page);
- if (llap->llap_write_queued) {
- LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
- rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
- llap->llap_cookie,
- ASYNC_READY | ASYNC_URGENT);
- } else {
- llap->llap_write_queued = 1;
- rc = obd_queue_async_io(exp, lli->lli_smd, NULL,
- llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
- 0, ASYNC_READY | ASYNC_URGENT);
- if (rc == 0)
- LL_CDEBUG_PAGE(D_PAGE, page, "mmap write queued\n");
- else
- llap->llap_write_queued = 0;
- }
- if (rc)
- page_cache_release(page);
-out:
- if (rc) {
- if (!lli->lli_async_rc)
- lli->lli_async_rc = rc;
- /* re-dirty page on error so it retries write */
- SetPageDirty(page);
- unlock_page(page);
- } else {
- set_page_writeback(page);
- }
- RETURN(rc);
+ return ll_writepage(page);
}
/* It is safe to not check anything in invalidatepage/releasepage below
RETURN(rc);
}
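+/* Fan the join/split request out to each stripe's OST object and return
+ * the total number of locks affected. */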
+static int lov_join_lru(struct obd_export *exp,
+ struct lov_stripe_md *lsm, int join)
+{
+ struct lov_obd *lov;
+ struct lov_oinfo *loi;
+ int i, count = 0;
+ ENTRY;
+
+ ASSERT_LSM_MAGIC(lsm);
+ if (!exp || !exp->exp_obd)
+ RETURN(-ENODEV);
+
+ lov = &exp->exp_obd->u.lov;
+ for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
+ struct lov_stripe_md submd;
+ int rc = 0;
+
+ if (lov->tgts[loi->loi_ost_idx].active == 0)
+ CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+
+ submd.lsm_object_id = loi->loi_id;
+ submd.lsm_stripe_count = 0;
+ rc = obd_join_lru(lov->tgts[loi->loi_ost_idx].ltd_exp,
+ &submd, join);
+ if (rc < 0) {
+ CERROR("join lru failed. objid: "LPX64" subobj: "LPX64
+ " ostidx: %d rc: %d\n", lsm->lsm_object_id,
+ loi->loi_id, loi->loi_ost_idx, rc);
+ RETURN(rc);
+ } else {
+ count += rc;
+ }
+ }
+ RETURN(count);
+}
+
#define LOV_U64_MAX ((__u64)~0ULL)
#define LOV_SUM_MAX(tot, add) \
do { \
.o_change_cbdata = lov_change_cbdata,
.o_cancel = lov_cancel,
.o_cancel_unused = lov_cancel_unused,
+ .o_join_lru = lov_join_lru,
.o_iocontrol = lov_iocontrol,
.o_get_info = lov_get_info,
.o_set_info = lov_set_info,
LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, join_lru);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, san_preprw);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
#endif
#endif
-static void osc_set_data_with_check(struct lustre_handle *lockh, void *data)
+static void osc_set_data_with_check(struct lustre_handle *lockh, void *data,
+ int flags)
{
struct ldlm_lock *lock = ldlm_handle2lock(lockh);
}
#endif
lock->l_ast_data = data;
+ lock->l_flags |= (flags & LDLM_FL_NO_LRU);
l_unlock(&lock->l_resource->lr_namespace->ns_lock);
LDLM_LOCK_PUT(lock);
}
rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode,
lockh);
if (rc == 1) {
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
if (*flags & LDLM_FL_HAS_INTENT) {
/* I would like to be able to ASSERT here that rss <=
* kms, but I can't, for reasons which are explained in
* lock_match. I want a second opinion. */
ldlm_lock_addref(lockh, LCK_PR);
ldlm_lock_decref(lockh, LCK_PW);
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
RETURN(ELDLM_OK);
}
}
policy, mode, lockh);
if (rc) {
//if (!(*flags & LDLM_FL_TEST_LOCK))
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
RETURN(rc);
}
/* If we're trying to read, we also search for an existing PW lock. The
/* FIXME: This is not incredibly elegant, but it might
* be more elegant than adding another parameter to
* lock_match. I want a second opinion. */
- osc_set_data_with_check(lockh, data);
+ osc_set_data_with_check(lockh, data, *flags);
ldlm_lock_addref(lockh, LCK_PR);
ldlm_lock_decref(lockh, LCK_PW);
}
opaque);
}
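+/* Join or split the LRU for all locks on this object's resource in the
+ * client's DLM namespace. */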
+static int osc_join_lru(struct obd_export *exp,
+ struct lov_stripe_md *lsm, int join)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+
+ return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join);
+}
+
static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
unsigned long max_age)
{
.o_change_cbdata = osc_change_cbdata,
.o_cancel = osc_cancel,
.o_cancel_unused = osc_cancel_unused,
+ .o_join_lru = osc_join_lru,
.o_iocontrol = osc_iocontrol,
.o_get_info = osc_get_info,
.o_set_info = osc_set_info,
.o_change_cbdata = osc_change_cbdata,
.o_cancel = osc_cancel,
.o_cancel_unused = osc_cancel_unused,
+ .o_join_lru = osc_join_lru,
.o_iocontrol = osc_iocontrol,
.o_import_event = osc_import_event,
.o_llog_init = osc_llog_init,
bin_PROGRAMS = mcreate munlink
endif # TESTS
+mmap_sanity_SOURCES= mmap_sanity.c
stat_SOURCES = stat.c stat_fs.h
# mkdirdeep_LDADD=-L$(top_builddir)/portals/utils -lptlctl $(LIBREADLINE)
#include <sys/socket.h>
#include <netdb.h>
#include <string.h>
+#include <sys/wait.h>
-char *dir = NULL, *node = NULL, *dir2 = NULL;
+char *dir = NULL, *dir2 = NULL;
long page_size;
char mmap_sanity[256];
static void usage(void)
{
- printf("Usage: mmap_sanity -d dir [-n node | -m dir2]\n");
+ printf("Usage: mmap_sanity -d dir [-m dir2]\n");
printf(" dir lustre mount point\n");
- printf(" node another client\n");
printf(" dir2 another mount point\n");
exit(127);
}
-#define MMAP_NOTIFY_PORT 7676
-static int mmap_notify(char *target, char *str, int delay)
-{
- unsigned short port = MMAP_NOTIFY_PORT;
- int socket_type = SOCK_DGRAM;
- struct sockaddr_in server;
- struct hostent *hp;
- int len, sockfd, rc = 0;
-
- if (target == NULL)
- return 0;
-
- sockfd = socket(AF_INET, socket_type, 0);
- if (sockfd < 0) {
- perror("socket()");
- return errno;
- }
-
- if ((hp = gethostbyname(target)) == NULL) {
- perror(target);
- rc = errno;
- goto out_close;
- }
-
- memset(&server,0,sizeof(server));
- memcpy(&(server.sin_addr), hp->h_addr, hp->h_length);
- server.sin_family = AF_INET;
- server.sin_port = htons(port);
-
- len = sizeof(server);
- if (delay)
- sleep(delay);
-
- rc = sendto(sockfd, str, strlen(str), 0,
- (struct sockaddr *)&server, len);
- if (rc < 0) {
- perror("sendto()");
- rc = errno;
- } else
- rc = 0;
-
-out_close:
- close(sockfd);
- return rc;
-}
-
-static int mmap_wait(char *str, int timeout)
-{
- unsigned short port = MMAP_NOTIFY_PORT;
- int socket_type = SOCK_DGRAM;
- struct sockaddr_in local, from;
- char host[256];
- struct hostent *hp;
- fd_set rfds;
- struct timeval tv;
- int sockfd, rc = 0;
-
- if (dir2 != NULL)
- return 0;
-
- memset(host, 0, sizeof(host));
- if (gethostname(host, sizeof(host))) {
- perror("gethostname()");
- return errno;
- }
-
- if ((hp = gethostbyname(host)) == NULL) {
- perror(host);
- return errno;
- }
-
- local.sin_family = AF_INET;
- memcpy(&(local.sin_addr), hp->h_addr, hp->h_length);
- local.sin_port = htons(port);
-
- sockfd = socket(AF_INET, socket_type, 0);
- if (sockfd < 0) {
- perror("socket()");
- return errno;
- }
-
- rc = bind(sockfd, (struct sockaddr *)&local, sizeof(local));
- if (rc < 0) {
- perror("bind()");
- rc = errno;
- goto out_close;
- }
-
- FD_ZERO(&rfds);
- FD_SET(sockfd, &rfds);
- tv.tv_sec = timeout ? timeout : 5;
- tv.tv_usec = 0;
-
- rc = select(sockfd + 1, &rfds, NULL, NULL, &tv);
- if (rc) { /* got data */
- char buffer[1024];
- int fromlen =sizeof(from);
-
- memset(buffer, 0, sizeof(buffer));
- rc = recvfrom(sockfd, buffer, sizeof(buffer), 0,
- (struct sockaddr *)&from, &fromlen);
- if (rc <= 0) {
- perror("recvfrom()");
- rc = errno;
- goto out_close;
- }
- rc = 0;
-
- if (strncmp(str, buffer, strlen(str)) != 0) {
- fprintf(stderr, "expected string mismatch!\n");
- rc = EINVAL;
- }
- } else { /* timeout */
- fprintf(stderr, "timeout!\n");
- rc = ETIME;
- }
-
-out_close:
- close(sockfd);
- return rc;
-}
-
static int remote_tst(int tc, char *mnt);
-static int mmap_run(char *host, int tc)
+static int mmap_run(int tc)
{
pid_t child;
- char nodearg[256], command[256];
int rc = 0;
child = fork();
if (dir2 != NULL) {
rc = remote_tst(tc, dir2);
} else {
- sprintf(nodearg, "-w %s", node);
- sprintf(command, "%s -d %s -n %s -c %d",
- mmap_sanity, dir, host, tc);
- rc = execlp("pdsh", "pdsh", "-S", nodearg, command, NULL);
- if (rc)
- perror("execlp()");
+ rc = EINVAL;
+ fprintf(stderr, "invalid argument!\n");
}
_exit(rc);
}
-static int mmap_initialize(char *myself, int tc)
+static int mmap_initialize(char *myself)
{
char buf[1024], *file;
int fdr, fdw, count, rc = 0;
perror("sysconf(_SC_PAGESIZE)");
return errno;
}
- if (tc)
- return 0;
/* copy myself to lustre for another client */
fdr = open(myself, O_RDONLY);
return rc;
}
-static void mmap_finalize(int tc)
+static void mmap_finalize()
{
- if (tc)
- return;
unlink(mmap_sanity);
}
/* cocurrent mmap operations on two nodes */
static int mmap_tst3(char *mnt)
{
- char *ptr, mmap_file[256], host[256];
+ char *ptr, mmap_file[256];
int region, fd, rc = 0;
region = page_size * 100;
goto out_close;
}
- if (gethostname(host, sizeof(host))) {
- perror("gethostname()");
- rc = errno;
- goto out_unmap;
- }
-
- rc = mmap_run(host, 3);
+ rc = mmap_run(3);
if (rc)
goto out_unmap;
- rc = mmap_wait("mmap done", 10);
memset(ptr, 'a', region);
-
sleep(2); /* wait for remote test finish */
out_unmap:
munmap(ptr, region);
goto out_close;
}
memset(ptr, 'b', region);
-
- rc = mmap_notify(node, "mmap done", 1);
- if (rc)
- goto out_unmap;
-
memset(ptr, 'c', region);
-out_unmap:
munmap(ptr, region);
out_close:
close(fd);
* client2 write to file_4b from mmap()ed file_4a. */
static int mmap_tst4(char *mnt)
{
- char *ptr, filea[256], fileb[256], host[256];
+ char *ptr, filea[256], fileb[256];
int region, fdr, fdw, rc = 0;
region = page_size * 100;
goto out_close;
}
- if (gethostname(host, sizeof(host))) {
- perror("gethostname()");
- rc = errno;
- goto out_unmap;
- }
-
- rc = mmap_run(host, 4);
- if (rc)
- goto out_unmap;
-
- rc = mmap_wait("mmap done", 10);
+ rc = mmap_run(4);
if (rc)
goto out_unmap;
goto out_close;
}
- rc = mmap_notify(node, "mmap done", 1);
- if (rc)
- goto out_unmap;
-
memset(ptr, '2', region);
rc = write(fdw, ptr, region);
} else
rc = 0;
-out_unmap:
munmap(ptr, region);
out_close:
if (fdr >= 0)
return rc;
}
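+/* Drop unused DLM locks by writing "clear" to every matching
+ * /proc/fs/lustre/ldlm/namespaces/.../lru_size file; the writes are done
+ * in a forked child. */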
+static int cancel_lru_locks(char *prefix)
+{
+ char cmd[256], line[1024];
+ FILE *file;
+ pid_t child;
+ int len = 1024, rc = 0;
+
+ child = fork();
+ if (child < 0)
+ return errno;
+ else if (child) {
+ int status;
+ rc = waitpid(child, &status, 0);
+ if (rc == child)
+ rc = 0;
+ return rc;
+ }
+
+ if (prefix)
+ sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/%s_*/lru_size", prefix);
+ else
+ sprintf(cmd, "ls /proc/fs/lustre/ldlm/namespaces/*/lru_size");
+
+ file = popen(cmd, "r");
+ if (file == NULL) {
+ perror("popen()");
+ return errno;
+ }
+
+ while (fgets(line, len, file)) {
+ FILE *f;
+
+ if (!strlen(line))
+ continue;
+ /* trim newline character */
+ *(line + strlen(line) - 1) = '\0';
+ f = fopen(line, "w");
+ if (f == NULL) {
+ perror("fopen()");
+ rc = errno;
+ break;
+ }
+ rc = fwrite("clear", strlen("clear") + 1, 1, f);
+ if (rc < 1) {
+ perror("fwrite()");
+ rc = errno;
+ fclose(f);
+ break;
+ }
+ fclose(f);
+ }
+
+ pclose(file);
+ _exit(rc);
+}
+
+/* don't deadlock when reading/writing a file to/from a buffer that is
+ * mmapped from that same file */
+static int mmap_tst5(char *mnt)
+{
+ char *ptr, mmap_file[256];
+ int region, fd, off, rc = 0;
+
+ region = page_size * 40;
+ off = page_size * 10;
+ sprintf(mmap_file, "%s/%s", mnt, "mmap_file5");
+
+ if (unlink(mmap_file) && errno != ENOENT) {
+ perror("unlink()");
+ return errno;
+ }
+
+ fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+ if (fd < 0) {
+ perror(mmap_file);
+ return errno;
+ }
+ ftruncate(fd, region);
+
+ ptr = mmap(NULL, region, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if (ptr == MAP_FAILED) {
+ perror("mmap()");
+ rc = errno;
+ goto out_close;
+ }
+ memset(ptr, 'a', region);
+
+ /* cancel unused locks */
+ rc = cancel_lru_locks("OSC");
+ if (rc)
+ goto out_unmap;
+
+ /* the read/write region of the file and the mmapped buffer should overlap */
+ rc = read(fd, ptr + off, off * 2);
+ if (rc != off * 2) {
+ perror("read()");
+ rc = errno;
+ goto out_unmap;
+ }
+ rc = write(fd, ptr + off, off * 2);
+ if (rc != off * 2) {
+ perror("write()");
+ rc = errno;
+ }
+ rc = 0;
+out_unmap:
+ munmap(ptr, region);
+out_close:
+ close(fd);
+ unlink(mmap_file);
+ return rc;
+}
+
+/* mmap write to a file from client1, then mmap read from client2 */
+static int mmap_tst6(char *mnt)
+{
+ char mmap_file[256], mmap_file2[256];
+ char *ptr = NULL, *ptr2 = NULL;
+ int fd = 0, fd2 = 0, rc = 0;
+
+ sprintf(mmap_file, "%s/%s", mnt, "mmap_file6");
+ sprintf(mmap_file2, "%s/%s", dir2, "mmap_file6");
+ if (unlink(mmap_file) && errno != ENOENT) {
+ perror("unlink()");
+ return errno;
+ }
+
+ fd = open(mmap_file, O_CREAT|O_RDWR, 0600);
+ if (fd < 0) {
+ perror(mmap_file);
+ return errno;
+ }
+ ftruncate(fd, page_size);
+
+ fd2 = open(mmap_file2, O_RDWR, 0600);
+ if (fd2 < 0) {
+ perror(mmap_file2);
+ rc = errno;
+ goto out;
+ }
+
+ ptr = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if (ptr == MAP_FAILED) {
+ perror("mmap()");
+ rc = errno;
+ goto out;
+ }
+
+ ptr2 = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, 0);
+ if (ptr2 == MAP_FAILED) {
+ perror("mmap()");
+ rc = errno;
+ goto out;
+ }
+
+ rc = cancel_lru_locks("OSC");
+ if (rc)
+ goto out;
+
+ memcpy(ptr, "blah", strlen("blah"));
+ if (strncmp(ptr, ptr2, strlen("blah"))) {
+ fprintf(stderr, "client2 mmap mismatch!\n");
+ rc = EFAULT;
+ goto out;
+ }
+ memcpy(ptr2, "foo", strlen("foo"));
+ if (strncmp(ptr, ptr2, strlen("foo"))) {
+ fprintf(stderr, "client1 mmap mismatch!\n");
+ rc = EFAULT;
+ }
+out:
+ if (ptr2)
+ munmap(ptr2, page_size);
+ if (ptr)
+ munmap(ptr, page_size);
+ if (fd2 > 0)
+ close(fd2);
+ if (fd > 0)
+ close(fd);
+ unlink(mmap_file);
+ return rc;
+}
+
static int remote_tst(int tc, char *mnt)
{
int rc = 0;
case 4:
rc = remote_tst4(mnt);
break;
- case 1:
- case 2:
default:
fprintf(stderr, "wrong test case number %d\n", tc);
rc = EINVAL;
{ 3, "mmap test3: cocurrent mmap ops on two nodes", mmap_tst3, 2 },
{ 4, "mmap test4: c1 write to f1 from mmaped f2, "
"c2 write to f1 from mmaped f1", mmap_tst4, 2 },
+ { 5, "mmap test5: read/write a file to/from a buffer "
+ "mmapped from that same file", mmap_tst5, 1 },
+ { 6, "mmap test6: check mmap write/read content on two nodes",
+ mmap_tst6, 2 },
{ 0, NULL, 0, 0 }
};
{
extern char *optarg;
struct test_case *test;
- int c, rc = 0, tc = 0;
+ int c, rc = 0;
for(;;) {
- c = getopt(argc, argv, "d:n:c:m:");
+ c = getopt(argc, argv, "d:m:");
if ( c == -1 )
break;
case 'd':
dir = optarg;
break;
- case 'n':
- node = optarg;
- break;
- case 'c':
- tc = atoi(optarg);
- break;
case 'm':
dir2 = optarg;
break;
if (dir == NULL)
usage();
- if (dir2 != NULL && node != NULL)
- usage();
- if (mmap_initialize(argv[0], tc) != 0) {
+ if (mmap_initialize(argv[0]) != 0) {
fprintf(stderr, "mmap_initialize failed!\n");
return EINVAL;
}
- if (tc) {
- rc = remote_tst(tc, dir);
- goto out;
- }
-
for (test = tests; test->tc; test++) {
char *rs = "skip";
rc = 0;
- if (test->node_cnt == 1 || node != NULL || dir2 != NULL) {
+ if (test->node_cnt == 1 || dir2 != NULL) {
rc = test->test_fn(dir);
rs = rc ? "fail" : "pass";
}
if (rc)
break;
}
-out:
- mmap_finalize(tc);
+
+ mmap_finalize();
return rc;
}
}
run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======"
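+# Copy dbench and its libraries onto Lustre and run it from a chroot there,
+# so the binary and shared libraries are mmapped and executed from Lustre.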
+test_71() {
+ cp `which dbench` $DIR
+
+ [ ! -f $DIR/dbench ] && echo "dbench not installed, skip this test" && return 0
+
+ TGT=$DIR/client.txt
+ SRC=${SRC:-/usr/lib/dbench/client.txt}
+ [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+ SRC=/usr/lib/dbench/client_plain.txt
+ [ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+
+ echo "copying necessary lib to $DIR"
+ if [ -d /lib64 ]; then
+ mkdir $DIR/lib64
+ cp /lib64/libc* $DIR/lib64
+ cp /lib64/ld-* $DIR/lib64
+ else
+ mkdir $DIR/lib
+ cp /lib/libc* $DIR/lib
+ cp /lib/ld-* $DIR/lib
+ fi
+
+ echo "chroot $DIR /dbench -c client.txt 2"
+ chroot $DIR /dbench -c client.txt 2
+ RC=$?
+
+ rm -f $DIR/dbench
+ rm -f $TGT
+ rm -fr $DIR/lib
+ rm -fr $DIR/lib64
+
+ return $RC
+}
+run_test 71 "Running dbench on lustre (should not segfault) ===="
+
# on the LLNL clusters, runas will still pick up root's $TMP settings,
# which will not be writable for the runas user, and then you get a CVS
# error message with a corrupt path string (CVS bug) and panic.
run_test 15 "test out-of-space with multiple writers ==========="
test_16() {
- fsx -R -W -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
+ fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
}
run_test 16 "2500 iterations of dual-mount fsx ================="
test_18() {
./mmap_sanity -d $MOUNT1 -m $MOUNT2
+}
+run_test 18 "mmap sanity check ================================="
+
+test_18() {
+ ./mmap_sanity -d $MOUNT1 -m $MOUNT2
sync; sleep 1; sync
}
#run_test 18 "mmap sanity check ================================="