LU-1030 osc: new IO engine implementation
[fs/lustre-release.git] / lustre / llite / vvp_io.c
index 4a65b9e..f534184 100644
@@ -36,6 +36,7 @@
  * Implementation of cl_io for VVP layer.
  *
  *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
  */
 
 #define DEBUG_SUBSYSTEM S_LLITE
@@ -347,44 +348,8 @@ static int vvp_io_setattr_trunc(const struct lu_env *env,
                                 const struct cl_io_slice *ios,
                                 struct inode *inode, loff_t size)
 {
-        struct vvp_io        *vio   = cl2vvp_io(env, ios);
-        struct cl_io         *io    = ios->cis_io;
-        struct cl_object     *obj   = ios->cis_obj;
-        pgoff_t               start = cl_index(obj, size);
-        int                   result;
-
-        DOWN_WRITE_I_ALLOC_SEM(inode);
-
-        result = vvp_do_vmtruncate(inode, size);
-
-        /*
-         * If a page is partially truncated, keep it owned across truncate to
-         * prevent... races.
-         *
-         * XXX this properly belongs to osc, because races in question are OST
-         * specific.
-         */
-        if (cl_offset(obj, start) != size) {
-                struct cl_object_header *hdr;
-
-                hdr = cl_object_header(obj);
-                cfs_spin_lock(&hdr->coh_page_guard);
-                vio->cui_partpage = cl_page_lookup(hdr, start);
-                cfs_spin_unlock(&hdr->coh_page_guard);
-
-                if (vio->cui_partpage != NULL)
-                        /*
-                         * Wait for the transfer completion for a partially
-                         * truncated page to avoid dead-locking an OST with
-                         * the concurrent page-wise overlapping WRITE and
-                         * PUNCH requests. BUG:17397.
-                         *
-                         * Partial page is disowned in vvp_io_trunc_end().
-                         */
-                        cl_page_own(env, io, vio->cui_partpage);
-        } else
-                vio->cui_partpage = NULL;
-        return result;
+       DOWN_WRITE_I_ALLOC_SEM(inode);
+       return 0;
 }
 
 static int vvp_io_setattr_time(const struct lu_env *env,
@@ -434,23 +399,15 @@ static int vvp_io_setattr_start(const struct lu_env *env,
 static void vvp_io_setattr_end(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
-        struct vvp_io        *vio   = cl2vvp_io(env, ios);
         struct cl_io         *io    = ios->cis_io;
         struct inode         *inode = ccc_object_inode(io->ci_obj);
 
         if (!cl_io_is_trunc(io))
                 return;
-        if (vio->cui_partpage != NULL) {
-                cl_page_disown(env, ios->cis_io, vio->cui_partpage);
-                cl_page_put(env, vio->cui_partpage);
-                vio->cui_partpage = NULL;
-        }
 
-        /*
-         * Do vmtruncate again, to remove possible stale pages populated by
-         * competing read threads. bz20645.
-         */
-        vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+       /* Truncate pages in memory; they must already be clean, because the
+        * osc layer has been told to destroy the covering osc_extents. */
+       vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
 }
 
 static void vvp_io_setattr_fini(const struct lu_env *env,
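With this change the VVP setattr/truncate path no longer handles partially truncated pages itself: vvp_io_setattr_trunc() only takes i_alloc_sem for writing, and the in-memory pages are dropped later in vvp_io_setattr_end(), after the osc layer has destroyed the covering extents. Below is a minimal sketch of the writer side of that handshake using a plain kernel rw_semaphore; the function name, the alloc_sem parameter, and the inline release are illustrative assumptions, not the Lustre code (in the patch the semaphore is the inode's i_alloc_sem taken via DOWN_WRITE_I_ALLOC_SEM(), and its release is not shown in these hunks).

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/rwsem.h>

/* Sketch only: truncate takes the semaphore for writing so that no
 * page_mkwrite() (which takes it for reading) can dirty pages while the
 * size change and page purge are in progress. */
static void sketch_truncate(struct inode *inode,
                            struct rw_semaphore *alloc_sem, loff_t new_size)
{
        down_write(alloc_sem);                  /* block new mkwrite faults */
        i_size_write(inode, new_size);          /* publish the new size */
        truncate_inode_pages(inode->i_mapping, new_size); /* drop pages */
        up_write(alloc_sem);
}

The matching reader side is page_mkwrite in vvp_io_fault_start() below.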
@@ -721,6 +678,11 @@ static int vvp_io_fault_start(const struct lu_env *env,
 
         /* must return locked page */
         if (fio->ft_mkwrite) {
+               /* Take i_alloc_sem for reading to exclude a concurrent
+                * truncate; otherwise we could add dirty pages into the
+                * osc cache while a truncate is in progress. */
+               DOWN_READ_I_ALLOC_SEM(inode);
+
                 LASSERT(cfio->ft_vmpage != NULL);
                 lock_page(cfio->ft_vmpage);
         } else {
@@ -766,15 +728,22 @@ static int vvp_io_fault_start(const struct lu_env *env,
                          * started before the page is really made dirty, we
                          * still have chance to detect it. */
                         result = cl_page_cache_add(env, io, page, CRT_WRITE);
-                        if (result < 0) {
-                                cl_page_unassume(env, io, page);
-                                cl_page_put(env, page);
-
-                                /* we're in big trouble, what can we do now? */
-                                if (result == -EDQUOT)
-                                        result = -ENOSPC;
-                                GOTO(out, result);
-                        }
+                       LASSERT(cl_page_is_owned(page, io));
+
+                       vmpage = NULL;
+                       if (result < 0) {
+                               cl_page_unmap(env, io, page);
+                               cl_page_discard(env, io, page);
+                               cl_page_disown(env, io, page);
+
+                               cl_page_put(env, page);
+
+                               /* we're in big trouble, what can we do now? */
+                               if (result == -EDQUOT)
+                                       result = -ENOSPC;
+                               GOTO(out, result);
+                       } else
+                               cl_page_disown(env, io, page);
                 }
         }
 
@@ -795,18 +764,23 @@ static int vvp_io_fault_start(const struct lu_env *env,
 
 out:
         /* return unlocked vmpage to avoid deadlocking */
-        unlock_page(vmpage);
+       if (vmpage != NULL)
+               unlock_page(vmpage);
+       if (fio->ft_mkwrite)
+               UP_READ_I_ALLOC_SEM(inode);
 #ifdef HAVE_VM_OP_FAULT
-        cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+       cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
 #endif
-        return result;
+       return result;
 }
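The page_mkwrite path above is the reader side of the same handshake: DOWN_READ_I_ALLOC_SEM() is taken before the page can be dirtied and UP_READ_I_ALLOC_SEM() is dropped on the way out, so a fault can never add dirty pages to the osc cache while a truncate holds the semaphore for writing. A companion sketch under the same assumptions (function and parameter names are invented for illustration, not the Lustre code):

#include <linux/pagemap.h>
#include <linux/rwsem.h>

/* Sketch only: the fault side takes the semaphore for reading, so faults
 * may run concurrently with each other but never with the truncate writer
 * sketched above. */
static int sketch_mkwrite(struct page *page, struct rw_semaphore *alloc_sem)
{
        down_read(alloc_sem);   /* exclude a concurrent truncate */
        lock_page(page);        /* the fault handler must own the page lock */
        /* ...mark the page writable and queue it as dirty here... */
        unlock_page(page);
        up_read(alloc_sem);
        return 0;
}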
 
-static void vvp_io_fsync_end(const struct lu_env *env,
-                            const struct cl_io_slice *ios)
+static int vvp_io_fsync_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
 {
-       /* never try to verify there is no dirty pages in sync range
-        * because page_mkwrite() can generate new dirty pages any time. */
+       /* Ideally we would set the TOWRITE tag on every dirty page in the
+        * radix tree and later verify that each tagged page was written
+        * out, but doing that here is racy, so we do not try. */
+       return 0;
 }
 
 static int vvp_io_read_page(const struct lu_env *env,
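For CIT_FSYNC the hook moves from cio_end to cio_start and intentionally does nothing: as the new comment notes, tagging every dirty page and then checking that each tagged page was written would race with page_mkwrite() creating new dirty pages at any time. For reference, this is roughly what such tagging looks like in stock kernel writeback, on kernels that provide tag_pages_for_writeback(); it is a sketch of the mechanism the comment alludes to, not something this patch does:

#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Sketch only: copy the DIRTY tag to the TOWRITE tag for the range, then
 * write only TOWRITE-tagged pages; pages dirtied afterwards are left for
 * the next writeback pass instead of being waited for. */
static void sketch_tag_range(struct address_space *mapping,
                             pgoff_t start, pgoff_t end)
{
        tag_pages_for_writeback(mapping, start, end);
        /* a writer loop would now look pages up by PAGECACHE_TAG_TOWRITE */
}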
@@ -874,7 +848,7 @@ static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
         queue = &io->ci_queue;
         cl_2queue_init_page(queue, page);
 
-        result = cl_io_submit_sync(env, io, crt, queue, CRP_NORMAL, 0);
+       result = cl_io_submit_sync(env, io, crt, queue, 0);
         LASSERT(cl_page_is_owned(page, io));
 
         if (crt == CRT_READ)
@@ -1049,6 +1023,7 @@ static int vvp_io_commit_write(const struct lu_env *env,
                         }
                         if (need_clip)
                                 cl_page_clip(env, pg, 0, to);
+                       clear_page_dirty_for_io(vmpage);
                         result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
                         if (result)
                                 CERROR("Write page %lu of inode %p failed %d\n",
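The synchronous-write fallback in vvp_io_commit_write() now calls clear_page_dirty_for_io() before handing the page to vvp_page_sync_io(), matching the usual kernel writeback protocol: the dirty bit is cleared under the page lock before I/O starts, so a redirty that happens while the transfer is in flight makes the page dirty again and it gets written on a later pass rather than being lost. A generic sketch of that protocol (not the Lustre submission path; the helper name is an assumption):

#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>

/* Sketch only: clear the dirty bit under the page lock before the
 * transfer; if someone redirties the page while it is in flight, the
 * dirty bit is simply set again and a later pass writes it out. */
static int sketch_write_one_page(struct page *page)
{
        lock_page(page);
        if (clear_page_dirty_for_io(page)) {
                set_page_writeback(page);
                /* ...submit the page to storage here... */
                end_page_writeback(page);
        }
        unlock_page(page);
        return 0;
}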
@@ -1108,7 +1083,7 @@ static const struct cl_io_operations vvp_io_ops = {
                         .cio_end       = ccc_io_end
                 },
                [CIT_FSYNC] = {
-                       .cio_end    = vvp_io_fsync_end,
+                       .cio_start  = vvp_io_fsync_start,
                        .cio_fini   = vvp_io_fini
                },
                 [CIT_MISC] = {