merge b_devel into HEAD, which will become 0.7.3

[fs/lustre-release.git] / lustre / llite / file.c
diff --git a/lustre/llite/file.c b/lustre/llite/file.c

index 943ba1b..67d18fd 100644 (file)
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -32,8 +32,7 @@
  #include <linux/lustre_compat25.h>
  #endif
  
-int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
-extern int ll_setattr(struct dentry *de, struct iattr *attr);
+#include "llite_internal.h"
  
  static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
                          struct file *file)
@@ -135,28 +134,21 @@ int ll_file_release(struct inode *inode, struct file *file)
          lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE);
          fd = (struct ll_file_data *)file->private_data;
          if (!fd) /* no process opened the file after an mcreate */
-                RETURN(rc = 0);
+                RETURN(0);
  
          /* we might not be able to get a valid handle on this file
           * again so we really want to flush our write cache.. */
-        if (S_ISREG(inode->i_mode)) {
-                filemap_fdatasync(inode->i_mapping);
-                filemap_fdatawait(inode->i_mapping);
-
-                if (lsm != NULL) {
-                        memset(&oa, 0, sizeof(oa));
-                        oa.o_id = lsm->lsm_object_id;
-                        oa.o_mode = S_IFREG;
-                        oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
-
-                        memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE);
-                        oa.o_valid |= OBD_MD_FLHANDLE;
+        if (S_ISREG(inode->i_mode) && lsm) {
+                write_inode_now(inode, 0);
+                obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+                memcpy(obdo_handle(&oa), &fd->fd_ost_och, FD_OSTDATA_SIZE);
+                oa.o_valid |= OBD_MD_FLHANDLE;
  
-                        rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
-                        if (rc)
-                                CERROR("inode %lu object close failed: rc = "
-                                       "%d\n", inode->i_ino, rc);
-                }
+                rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                if (rc)
+                        CERROR("inode %lu object close failed: rc %d\n",
+                               inode->i_ino, rc);
          }
  
          rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
@@ -206,16 +198,16 @@ static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
                  RETURN(-ENOMEM);
          oa->o_id = lsm->lsm_object_id;
          oa->o_mode = S_IFREG;
-        oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
-                       OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+        oa->o_valid = OBD_MD_FLID;
+        obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
          rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
          if (rc)
                  GOTO(out, rc);
  
          file->f_flags &= ~O_LOV_DELAY_CREATE;
-        obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
-                                 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-
+        obdo_refresh_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                      OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                                      OBD_MD_FLCTIME);
          EXIT;
  out:
          obdo_free(oa);
@@ -236,24 +228,33 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
          struct obdo *oa;
          struct iattr iattr;
          struct mdc_op_data op_data;
-        int rc, err, lmm_size = 0;;
+        struct obd_trans_info oti = { 0 };
+        int rc, err, lmm_size = 0;
          ENTRY;
  
          oa = obdo_alloc();
          if (!oa)
                  RETURN(-ENOMEM);
  
+        LASSERT(S_ISREG(inode->i_mode));
          oa->o_mode = S_IFREG | 0600;
          oa->o_id = inode->i_ino;
+        oa->o_generation = inode->i_generation;
          /* Keep these 0 for now, because chown/chgrp does not change the
           * ownership on the OST, and we don't want to allow BA OST NFS
           * users to access these objects by mistake. */
          oa->o_uid = 0;
          oa->o_gid = 0;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
-                OBD_MD_FLUID | OBD_MD_FLGID;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
+                OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
+#ifdef ENABLE_ORPHANS
+        oa->o_valid |= OBD_MD_FLCOOKIE;
+#endif
  
-        rc = obd_create(conn, oa, &lsm, NULL);
+        obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
+                        OBD_MD_FLCTIME | (inode->i_size ? OBD_MD_FLSIZE : 0));
+
+        rc = obd_create(conn, oa, &lsm, &oti);
          if (rc) {
                  CERROR("error creating objects for inode %lu: rc = %d\n",
                         inode->i_ino, rc);
@@ -263,7 +264,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
                  }
                  GOTO(out_oa, rc);
          }
-        obdo_to_inode(inode, oa, OBD_MD_FLBLKSZ);
+        obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ);
  
          LASSERT(lsm && lsm->lsm_object_id);
          rc = obd_packmd(conn, &lmm, lsm);
@@ -278,11 +279,18 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
  
          ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
  
-        rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data,
-                         &iattr, lmm, lmm_size, &req);
+#if 0
+#warning FIXME: next line is for debugging purposes only
+        obd_log_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, oti.oti_numcookies,
+                       oti.oti_logcookies, OBD_LLOG_FL_SENDNOW);
+#endif
+
+        rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data, &iattr,
+                         lmm, lmm_size, oti.oti_logcookies,
+                         oti.oti_numcookies * sizeof(oti.oti_onecookie), &req);
          ptlrpc_req_finished(req);
  
-        obd_free_diskmd (conn, &lmm);
+        obd_free_diskmd(conn, &lmm);
  
          /* If we couldn't complete mdc_open() and store the stripe MD on the
           * MDS, we need to destroy the objects now or they will be leaked.
@@ -297,13 +305,21 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
  
          EXIT;
  out_oa:
+        oti_free_cookies(&oti);
          obdo_free(oa);
          return rc;
  
  out_destroy:
-        obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
          oa->o_id = lsm->lsm_object_id;
-        oa->o_valid |= OBD_MD_FLID;
+        oa->o_valid = OBD_MD_FLID;
+        obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
+#if 0
+        err = obd_log_cancel(conn, lsm, oti.oti_numcookies, oti.oti_logcookies,
+                             OBD_LLOG_FL_SENDNOW);
+        if (err)
+                CERROR("error cancelling inode %lu log cookies: rc %d\n",
+                       inode->i_ino, err);
+#endif
          err = obd_destroy(conn, oa, lsm, NULL);
          obd_free_memmd(conn, &lsm);
          if (err)
@@ -327,8 +343,6 @@ out_destroy:
   * before returning in the O_LOV_DELAY_CREATE case and dropping it here
   * or in ll_file_release(), but I'm not sure that is desirable/necessary.
   */
-extern int ll_it_open_error(int phase, struct lookup_intent *it);
-
  int ll_file_open(struct inode *inode, struct file *file)
  {
          struct ll_sb_info *sbi = ll_i2sbi(inode);
@@ -346,9 +360,10 @@ int ll_file_open(struct inode *inode, struct file *file)
          if (inode->i_sb->s_root == file->f_dentry)
                  RETURN(0);
  
+        it = file->f_it;
          lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
-        LL_GET_INTENT(file->f_dentry, it);
-        rc = ll_it_open_error(IT_OPEN_OPEN, it);
+
+        rc = ll_it_open_error(DISP_OPEN_OPEN, it);
          if (rc)
                  RETURN(rc);
  
@@ -363,7 +378,8 @@ int ll_file_open(struct inode *inode, struct file *file)
  
          lsm = lli->lli_smd;
          if (lsm == NULL) {
-                if (file->f_flags & O_LOV_DELAY_CREATE) {
+                if (file->f_flags & O_LOV_DELAY_CREATE ||
+                    !(file->f_mode & FMODE_WRITE)) {
                          CDEBUG(D_INODE, "delaying object creation\n");
                          RETURN(0);
                  }
@@ -418,7 +434,7 @@ int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
                  OBD_MD_FLCTIME;
  
          if (ostdata != NULL) {
-                memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
+                memcpy(obdo_handle(&oa), ostdata, FD_OSTDATA_SIZE);
                  oa.o_valid |= OBD_MD_FLHANDLE;
          }
  
@@ -455,8 +471,8 @@ int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
                   (aft != 0 || after < before) &&
                   oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
  
-        obdo_to_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
-                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME));
+        obdo_refresh_inode(inode, &oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                       OBD_MD_FLMTIME | OBD_MD_FLCTIME);
          if (inode->i_blksize < PAGE_CACHE_SIZE)
                  inode->i_blksize = PAGE_CACHE_SIZE;
  
@@ -477,102 +493,6 @@ int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
          RETURN(0);
  }
  
-/*
- * some callers, notably truncate, really don't want i_size set based
- * on the the size returned by the getattr, or lock acquisition in
- * the future.
- */
-int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
-                   struct lov_stripe_md *lsm,
-                   int mode, struct ldlm_extent *extent,
-                   struct lustre_handle *lockh)
-{
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int rc, flags = 0;
-        ENTRY;
-
-        LASSERT(lockh->cookie == 0);
-
-        /* XXX phil: can we do this?  won't it screw the file size up? */
-        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
-            (sbi->ll_flags & LL_SBI_NOLCK))
-                RETURN(0);
-
-        CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
-               inode->i_ino, extent->start, extent->end);
-
-        rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
-                         sizeof(extent), mode, &flags, ll_extent_lock_callback,
-                         inode, lockh);
-
-        RETURN(rc);
-}
-
-/*
- * this grabs a lock and manually implements behaviour that makes it look like
- * the OST is returning the file size with each lock acquisition.
- */
-int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
-                   struct lov_stripe_md *lsm, int mode,
-                   struct ldlm_extent *extent, struct lustre_handle *lockh)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct ldlm_extent size_lock;
-        struct lustre_handle match_lockh = {0};
-        int flags, rc, matched;
-        ENTRY;
-
-        rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
-        if (rc != ELDLM_OK)
-                RETURN(rc);
-
-        if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
-                RETURN(0);
-
-        rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
-        if (rc) {
-                ll_extent_unlock(fd, inode, lsm, mode, lockh);
-                RETURN(rc);
-        }
-
-        size_lock.start = inode->i_size;
-        size_lock.end = OBD_OBJECT_EOF;
-
-        /* XXX I bet we should be checking the lock ignore flags.. */
-        flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
-        matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
-                            &size_lock, sizeof(size_lock), LCK_PR, &flags,
-                            inode, &match_lockh);
-
-        /* hey, alright, we hold a size lock that covers the size we
-         * just found, its not going to change for a while.. */
-        if (matched == 1) {
-                set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
-                obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
-                           &match_lockh);
-        }
-
-        RETURN(0);
-}
-
-int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
-                struct lov_stripe_md *lsm, int mode,
-                struct lustre_handle *lockh)
-{
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int rc;
-        ENTRY;
-
-        /* XXX phil: can we do this?  won't it screw the file size up? */
-        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
-            (sbi->ll_flags & LL_SBI_NOLCK))
-                RETURN(0);
-
-        rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
-
-        RETURN(rc);
-}
-
  static inline void ll_remove_suid(struct inode *inode)
  {
          unsigned int mode;
@@ -591,22 +511,10 @@ static inline void ll_remove_suid(struct inode *inode)
  #if 0
  static void ll_update_atime(struct inode *inode)
  {
-#ifdef USE_ATIME
-        struct iattr attr;
-
-        attr.ia_atime = LTIME_S(CURRENT_TIME);
-        attr.ia_valid = ATTR_ATIME;
-
-        if (inode->i_atime == attr.ia_atime) return;
          if (IS_RDONLY(inode)) return;
-        if (IS_NOATIME(inode)) return;
  
-        /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */
-        ll_inode_setattr(inode, &attr, 0);
-#else
          /* update atime, but don't explicitly write it out just this change */
          inode->i_atime = CURRENT_TIME;
-#endif
  }
  #endif
  
@@ -676,19 +584,19 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
  
          /* start writeback on dirty pages in the extent when its PW */
          for (i = start, j = start % count;
-                        lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
+             lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
                  if (j == count) {
                          i += skip;
                          j = 0;
                  }
                  /* its unlikely, but give us a chance to bail when we're out */
-                PGCACHE_WRLOCK(inode->i_mapping);
+                ll_pgcache_lock(inode->i_mapping);
                  if (list_empty(&inode->i_mapping->dirty_pages)) {
                          CDEBUG(D_INODE, "dirty list empty\n");
-                        PGCACHE_WRUNLOCK(inode->i_mapping);
+                        ll_pgcache_unlock(inode->i_mapping);
                          break;
                  }
-                PGCACHE_WRUNLOCK(inode->i_mapping);
+                ll_pgcache_unlock(inode->i_mapping);
  
                  if (need_resched())
                          schedule();
@@ -702,10 +610,10 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
                  }
                  if (PageDirty(page)) {
                          CDEBUG(D_INODE, "writing page %p\n", page);
-                        PGCACHE_WRLOCK(inode->i_mapping);
+                        ll_pgcache_lock(inode->i_mapping);
                          list_del(&page->list);
                          list_add(&page->list, &inode->i_mapping->locked_pages);
-                        PGCACHE_WRUNLOCK(inode->i_mapping);
+                        ll_pgcache_unlock(inode->i_mapping);
  
                          /* this writepage might write out pages outside
                           * this extent, but that's ok, the pages are only
@@ -730,19 +638,19 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
          LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
          LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
          for (i = start, j = start % count ; i < end ; j++, i++) {
-                if ( j == count ) {
+                if (j == count) {
                          i += skip;
                          j = 0;
                  }
-                PGCACHE_WRLOCK(inode->i_mapping);
+                ll_pgcache_lock(inode->i_mapping);
                  if (list_empty(&inode->i_mapping->dirty_pages) &&
                       list_empty(&inode->i_mapping->clean_pages) &&
                       list_empty(&inode->i_mapping->locked_pages)) {
                          CDEBUG(D_INODE, "nothing left\n");
-                        PGCACHE_WRUNLOCK(inode->i_mapping);
+                        ll_pgcache_unlock(inode->i_mapping);
                          break;
                  }
-                PGCACHE_WRUNLOCK(inode->i_mapping);
+                ll_pgcache_unlock(inode->i_mapping);
                  if (need_resched())
                          schedule();
                  page = find_get_page(inode->i_mapping, i);
@@ -755,15 +663,16 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
                          truncate_complete_page(page);
  #else
                          truncate_complete_page(page->mapping, page);
-#endif                
+#endif
                  unlock_page(page);
                  page_cache_release(page);
          }
          EXIT;
  }
  
-int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
-                            void *data, int flag)
+static int ll_extent_lock_callback(struct ldlm_lock *lock,
+                                   struct ldlm_lock_desc *new, void *data,
+                                   int flag)
  {
          struct inode *inode = data;
          struct ll_inode_info *lli = ll_i2info(inode);
@@ -771,7 +680,10 @@ int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
          int rc;
          ENTRY;
  
-        LASSERT(inode != NULL);
+        if ((unsigned long)inode < 0x1000) {
+                LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
+                LBUG();
+        }
  
          switch (flag) {
          case LDLM_CB_BLOCKING:
@@ -785,9 +697,15 @@ int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
                   * could know to write-back or simply throw away the pages
                   * based on if the cancel comes from a desire to, say,
                   * read or truncate.. */
-                LASSERT((unsigned long)inode > 0x1000);
-                LASSERT((unsigned long)lli > 0x1000);
-                LASSERT((unsigned long)lli->lli_smd > 0x1000);
+                if ((unsigned long)lli->lli_smd < 0x1000) {
+                        /* note that lli is part of the inode itself, so it
+                         * is valid if as checked the inode pointer above. */
+                        CERROR("inode %lu, sb %p, lli %p, lli_smd %p\n",
+                               inode->i_ino, inode->i_sb, lli, lli->lli_smd);
+                        LDLM_ERROR(lock, "cancel lock on bad inode %p", inode);
+                        LBUG();
+                }
+
                  ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
                  break;
          default:
@@ -797,6 +715,102 @@ int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
          RETURN(0);
  }
  
+/*
+ * some callers, notably truncate, really don't want i_size set based
+ * on the the size returned by the getattr, or lock acquisition in
+ * the future.
+ */
+int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
+                   struct lov_stripe_md *lsm,
+                   int mode, struct ldlm_extent *extent,
+                   struct lustre_handle *lockh)
+{
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        int rc, flags = 0;
+        ENTRY;
+
+        LASSERT(lockh->cookie == 0);
+
+        /* XXX phil: can we do this?  won't it screw the file size up? */
+        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
+            (sbi->ll_flags & LL_SBI_NOLCK))
+                RETURN(0);
+
+        CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
+               inode->i_ino, extent->start, extent->end);
+
+        rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
+                         sizeof(extent), mode, &flags, ll_extent_lock_callback,
+                         inode, lockh);
+
+        RETURN(rc);
+}
+
+/*
+ * this grabs a lock and manually implements behaviour that makes it look like
+ * the OST is returning the file size with each lock acquisition.
+ */
+int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
+                   struct lov_stripe_md *lsm, int mode,
+                   struct ldlm_extent *extent, struct lustre_handle *lockh)
+{
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ldlm_extent size_lock;
+        struct lustre_handle match_lockh = {0};
+        int flags, rc, matched;
+        ENTRY;
+
+        rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
+        if (rc != ELDLM_OK)
+                RETURN(rc);
+
+        if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
+                RETURN(0);
+
+        rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
+        if (rc) {
+                ll_extent_unlock(fd, inode, lsm, mode, lockh);
+                RETURN(rc);
+        }
+
+        size_lock.start = inode->i_size;
+        size_lock.end = OBD_OBJECT_EOF;
+
+        /* XXX I bet we should be checking the lock ignore flags.. */
+        flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
+        matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
+                            &size_lock, sizeof(size_lock), LCK_PR, &flags,
+                            inode, &match_lockh);
+
+        /* hey, alright, we hold a size lock that covers the size we
+         * just found, its not going to change for a while.. */
+        if (matched == 1) {
+                set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
+                obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
+                           &match_lockh);
+        }
+
+        RETURN(0);
+}
+
+int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
+                struct lov_stripe_md *lsm, int mode,
+                struct lustre_handle *lockh)
+{
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        int rc;
+        ENTRY;
+
+        /* XXX phil: can we do this?  won't it screw the file size up? */
+        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
+            (sbi->ll_flags & LL_SBI_NOLCK))
+                RETURN(0);
+
+        rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
+
+        RETURN(rc);
+}
+
  static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
                              loff_t *ppos)
  {
@@ -819,6 +833,10 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
  
          lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
                              count);
+
+        if (!lsm)
+                RETURN(0);
+
          /* grab a -> eof extent to push extending writes out of node's caches
           * so we can see them at the getattr after lock acquisition.  this will
           * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt
@@ -852,8 +870,8 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
  /*
   * Write to a file (through the page cache).
   */
-static ssize_t
-ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
+static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
+                             loff_t *ppos)
  {
          struct ll_file_data *fd = file->private_data;
          struct inode *inode = file->f_dentry->d_inode;
@@ -868,6 +886,7 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
                 inode->i_ino, inode->i_generation, inode, count, *ppos);
  
+        SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
          /*
           * sleep doing some writeback work of this mount's dirty data
           * if the VM thinks we're low on memory.. other dirtying code
@@ -875,12 +894,14 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
           * careful not to hold locked pages while they do so.  like
           * ll_prepare_write.  *cough*
           */
-        LL_CHECK_DIRTY(inode->i_sb);
+        ll_check_dirty(inode->i_sb);
  
          /* POSIX, but surprised the VFS doesn't check this already */
          if (count == 0)
                  RETURN(0);
  
+        LASSERT(lsm);
+
          if (file->f_flags & O_APPEND) {
                  extent.start = 0;
                  extent.end = OBD_OBJECT_EOF;
@@ -943,7 +964,8 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file,
          lsm = lli->lli_smd;
          if (lsm) {
                  up(&lli->lli_open_sem);
-                CERROR("stripe already exists for ino %lu\n", inode->i_ino);
+                CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
+                       inode->i_ino);
                  /* If we haven't already done the open, do so now */
                  if (file->f_flags & O_LOV_DELAY_CREATE) {
                          int rc2 = ll_osc_open(conn, inode, file, lsm);
@@ -987,6 +1009,7 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
          struct ll_file_data *fd = file->private_data;
          struct lustre_handle *conn;
          int flags;
+
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
                 inode->i_generation, inode, cmd);
  
@@ -1077,8 +1100,8 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
  
  int ll_fsync(struct file *file, struct dentry *dentry, int data)
  {
-        int ret;
          struct inode *inode = dentry->d_inode;
+        int rc;
          ENTRY;
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
                 inode->i_generation, inode);
@@ -1090,17 +1113,17 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data)
           * still holding the PW lock that covered the dirty pages.  XXX we
           * should probably get a reference on it, though, just to be clear.
           */
-        ret = filemap_fdatasync(dentry->d_inode->i_mapping);
-        if ( ret == 0 )
-                ret = filemap_fdatawait(dentry->d_inode->i_mapping);
+        rc = filemap_fdatasync(inode->i_mapping);
+        if (rc == 0)
+                rc = filemap_fdatawait(inode->i_mapping);
  
-        RETURN(ret);
+        RETURN(rc);
  }
  
-int ll_inode_revalidate(struct dentry *dentry)
+int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
  {
          struct inode *inode = dentry->d_inode;
-        struct lov_stripe_md *lsm = NULL;
+        struct lov_stripe_md *lsm;
          ENTRY;
  
          if (!inode) {
@@ -1118,70 +1141,41 @@ int ll_inode_revalidate(struct dentry *dentry)
             below when the lock is marked CB_PENDING.  That RPC may not
             go out because someone else may be in another RPC waiting for
             that lock*/
-        if (!(dentry->d_it && dentry->d_it->it_lock_mode) &&
-            !ll_have_md_lock(dentry)) {
+        if (!(it && it->it_lock_mode) && !ll_have_md_lock(dentry)) {
+                struct lustre_md md;
                  struct ptlrpc_request *req = NULL;
                  struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
                  struct ll_fid fid;
-                struct mds_body *body;
-                struct lov_mds_md *lmm;
                  unsigned long valid = 0;
-                int eadatalen = 0, rc;
+                int rc;
+                int ealen = 0;
  
-                /* Why don't we update all valid MDS fields here, if we're
-                 * doing an RPC anyways?  -phil */
                  if (S_ISREG(inode->i_mode)) {
-                        eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
+                        ealen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
                          valid |= OBD_MD_FLEASIZE;
                  }
                  ll_inode2fid(&fid, inode);
-                rc = mdc_getattr(&sbi->ll_mdc_conn, &fid,
-                                 valid, eadatalen, &req);
+                rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid, ealen, &req);
                  if (rc) {
                          CERROR("failure %d inode %lu\n", rc, inode->i_ino);
                          RETURN(-abs(rc));
                  }
-
-                body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
-                LASSERT (body != NULL);         /* checked by mdc_getattr() */
-                LASSERT_REPSWABBED (req, 0);    /* swabbed by mdc_getattr() */
-
-                if (S_ISREG(inode->i_mode) &&
-                    (body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))) {
-                        CERROR("MDS sent back size for regular file\n");
-                        body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
-                }
+                rc = mdc_req2lustre_md(req, 0, &sbi->ll_osc_conn, &md);
  
                  /* XXX Too paranoid? */
-                if ((body->valid ^ valid) & OBD_MD_FLEASIZE)
+                if ((md.body->valid ^ valid) & OBD_MD_FLEASIZE)
                          CERROR("Asked for %s eadata but got %s\n",
                                 (valid & OBD_MD_FLEASIZE) ? "some" : "no",
-                               (body->valid & OBD_MD_FLEASIZE) ? "some":"none");
-
-                if (S_ISREG(inode->i_mode) &&
-                    (body->valid & OBD_MD_FLEASIZE)) {
-                        if (body->eadatasize == 0) { /* no EA data */
-                                CERROR("OBD_MD_FLEASIZE set but no data\n");
-                                RETURN(-EPROTO);
-                        }
-                        /* Only bother with this if inode's lsm not set? */
-                        lmm = lustre_msg_buf(req->rq_repmsg,1,body->eadatasize);
-                        LASSERT(lmm != NULL);       /* mdc_getattr() checked */
-                        LASSERT_REPSWABBED(req, 1); /* mdc_getattr() swabbed */
-
-                        rc = obd_unpackmd (&sbi->ll_osc_conn,
-                                           &lsm, lmm, body->eadatasize);
-                        if (rc < 0) {
-                                CERROR("Error %d unpacking eadata\n", rc);
-                                ptlrpc_req_finished(req);
-                                RETURN(rc);
-                        }
-                        LASSERT(rc >= sizeof(*lsm));
+                               (md.body->valid & OBD_MD_FLEASIZE) ? "some":
+                               "none");
+                if (rc) {
+                        ptlrpc_req_finished(req);
+                        RETURN(rc);
                  }
  
-                ll_update_inode(inode, body, lsm);
-                if (lsm != NULL && ll_i2info(inode)->lli_smd != lsm)
-                        obd_free_memmd(&sbi->ll_osc_conn, &lsm);
+                ll_update_inode(inode, md.body, md.lsm);
+                if (md.lsm != NULL && ll_i2info(inode)->lli_smd != md.lsm)
+                        obd_free_memmd(&sbi->ll_osc_conn, &md.lsm);
  
                  ptlrpc_req_finished(req);
          }
@@ -1211,19 +1205,20 @@ int ll_inode_revalidate(struct dentry *dentry)
  }
  
  #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-static int ll_getattr(struct vfsmount *mnt, struct dentry *de,
+int ll_getattr(struct vfsmount *mnt, struct dentry *de,
+                      struct lookup_intent *it, 
                        struct kstat *stat)
  {
          int res = 0;
          struct inode *inode = de->d_inode;
  
+        res = ll_inode_revalidate_it(de, it);
          lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
-        res = ll_inode_revalidate(de);
+
          if (res)
                  return res;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        stat->dev = inode->i_dev;
-#endif
+
+        stat->dev = inode->i_sb->s_dev;
          stat->ino = inode->i_ino;
          stat->mode = inode->i_mode;
          stat->nlink = inode->i_nlink;
@@ -1234,6 +1229,8 @@ static int ll_getattr(struct vfsmount *mnt, struct dentry *de,
          stat->mtime = inode->i_mtime;
          stat->ctime = inode->i_ctime;
          stat->size = inode->i_size;
+        stat->blksize = inode->i_blksize;
+        stat->blocks = inode->i_blocks;
          return 0;
  }
  #endif
@@ -1254,9 +1251,9 @@ struct inode_operations ll_file_inode_operations = {
          setattr:    ll_setattr,
          truncate:   ll_truncate,
  #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        getattr: ll_getattr,
+        getattr_it: ll_getattr,
  #else
-        revalidate: ll_inode_revalidate,
+        revalidate_it: ll_inode_revalidate_it,
  #endif
  };
  
@@ -1264,8 +1261,8 @@ struct inode_operations ll_special_inode_operations = {
          setattr_raw:    ll_setattr_raw,
          setattr:    ll_setattr,
  #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        getattr:    ll_getattr,
+        getattr_it:    ll_getattr,
  #else
-        revalidate: ll_inode_revalidate,
+        revalidate_it: ll_inode_revalidate_it,
  #endif
  };