Whamcloud - gitweb
Fix eric's extremely well-spotted locking bug. It's not clear that we even
[fs/lustre-release.git] / lustre / llite / rw.c
index e1341a3..823d18a 100644 (file)
@@ -1,7 +1,9 @@
-/*
- * Lustre Light I/O Page Cache
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Copyright (C) 2002, Cluster File Systems, Inc. 
+ * Lustre Lite I/O Page Cache
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
  */
 
 #include <linux/config.h>
@@ -9,6 +11,7 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
+#include <linux/iobuf.h>
 #include <linux/errno.h>
 #include <linux/locks.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 
-#define DEBUG_SUBSYSTEM S_LLIGHT
+#define DEBUG_SUBSYSTEM S_LLITE
 
-#include <linux/obd_support.h>
-#include <linux/lustre_lib.h>
-#include <linux/lustre_idl.h>
 #include <linux/lustre_mds.h>
-#include <linux/lustre_light.h>
-
-int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,10))
-/*
- * Add a page to the dirty page list.
- */
-void __set_page_dirty(struct page *page)
-{
-        struct address_space *mapping;
-        spinlock_t *pg_lock;
-
-        pg_lock = PAGECACHE_LOCK(page);
-        spin_lock(pg_lock);
-
-        mapping = page->mapping;
-        spin_lock(&mapping->page_lock);
-
-        list_del(&page->list);
-        list_add(&page->list, &mapping->dirty_pages);
-
-        spin_unlock(&mapping->page_lock);
-        spin_unlock(pg_lock);
-
-        if (mapping->host)
-                mark_inode_dirty_pages(mapping->host);
-}
-#else
-/*
- * Add a page to the dirty page list.
- */
-void set_page_dirty(struct page *page)
-{
-       if (!test_and_set_bit(PG_dirty, &page->flags)) {
-               struct address_space *mapping = page->mapping;
-
-               if (mapping) {
-                       spin_lock(&pagecache_lock);
-                       list_del(&page->list);
-                       list_add(&page->list, &mapping->dirty_pages);
-                       spin_unlock(&pagecache_lock);
-
-                       if (mapping->host)
-                               mark_inode_dirty_pages(mapping->host);
-               }
-       }
-}
-#endif
-
-inline struct obdo * ll_oa_from_inode(struct inode *inode, int valid)
-{
-        struct ll_inode_info *oinfo = ll_i2info(inode);
-       struct obdo *oa = obdo_alloc();
-        if ( !oa ) {
-               printk(__FUNCTION__ ": no memory to allocate obdo!\n"); 
-                return NULL;
-        }
-       oa->o_valid = valid;
-
-        if ( oa->o_valid & OBD_MD_FLID )
-                oa->o_id = oinfo->lli_objid;
-        if ( oa->o_valid & OBD_MD_FLATIME )
-                oa->o_atime = inode->i_atime;
-        if ( oa->o_valid & OBD_MD_FLMTIME )
-                oa->o_mtime = inode->i_mtime;
-        if ( oa->o_valid & OBD_MD_FLCTIME )
-                oa->o_ctime = inode->i_ctime;
-        if ( oa->o_valid & OBD_MD_FLSIZE )
-                oa->o_size = inode->i_size;
-        if ( oa->o_valid & OBD_MD_FLBLOCKS )   /* allocation of space */
-                oa->o_blocks = inode->i_blocks;
-        if ( oa->o_valid & OBD_MD_FLBLKSZ )
-                oa->o_blksize = inode->i_blksize;
-        if ( oa->o_valid & OBD_MD_FLMODE )
-                oa->o_mode = inode->i_mode;
-        if ( oa->o_valid & OBD_MD_FLUID )
-                oa->o_uid = inode->i_uid;
-        if ( oa->o_valid & OBD_MD_FLGID )
-                oa->o_gid = inode->i_gid;
-        if ( oa->o_valid & OBD_MD_FLFLAGS )
-                oa->o_flags = inode->i_flags;
-        if ( oa->o_valid & OBD_MD_FLNLINK )
-                oa->o_nlink = inode->i_nlink;
-        if ( oa->o_valid & OBD_MD_FLGENER ) 
-                oa->o_generation = inode->i_generation;
-
-        CDEBUG(D_INFO, "src inode %ld, dst obdo %ld valid 0x%08x\n",
-               inode->i_ino, (long)oa->o_id, oa->o_valid);
-        obdo_from_inode(oa, inode);
-       
-       /* this will transfer metadata for the logical object to 
-          the oa: that metadata could contain the constituent objects
-       */
-       if (ll_has_inline(inode)) {
-                CDEBUG(D_INODE, "copying inline data from inode to obdo\n");
-                memcpy(oa->o_inline, oinfo->lli_inline, OBD_INLINESZ);
-                oa->o_obdflags |= OBD_FL_INLINEDATA;
-                oa->o_valid |= OBD_MD_FLINLINE;
-        }
-       return oa;
-} /* ll_oa_from_inode */
-
-
+#include <linux/lustre_lite.h>
+#include <linux/lustre_lib.h>
 
-/*
- * Remove page from dirty list
- */
-void __set_page_clean(struct page *page)
-{
-       struct address_space *mapping = page->mapping;
-       struct inode *inode;
-       
-       if (!mapping)
-               return;
-
-       list_del(&page->list);
-       list_add(&page->list, &mapping->clean_pages);
-
-       inode = mapping->host;
-       if (list_empty(&mapping->dirty_pages)) { 
-               CDEBUG(D_INODE, "inode clean\n");
-               inode->i_state &= ~I_DIRTY_PAGES;
-       }
-       EXIT;
-}
 
 /* SYNCHRONOUS I/O to object storage for an inode */
+/* Describes all PAGE_SIZE bytes of @page, at the file offset implied by
+ * page->index, in a single brw_page and passes it to obd_brw() together with
+ * the inode's stripe metadata and a callback-data object from ll_init_cb().
+ * @rw is forwarded to obd_brw() unchanged; a non-zero @create sets
+ * OBD_BRW_CREATE in the page flags.
+ * Returns -ENOMEM if ll_init_cb() fails, otherwise whatever obd_brw() returns.
+ * NOTE(review): cbd is never released here; presumably ll_sync_io_cb or
+ * obd_brw() owns it afterwards -- confirm. */
 static int ll_brw(int rw, struct inode *inode, struct page *page, int create)
 {
-        obd_count        num_obdo = 1;
-        obd_count        bufs_per_obdo = 1;
-        struct obdo     *oa;
-        obd_size         count = PAGE_SIZE;
-        obd_off          offset = ((obd_off)page->index) << PAGE_SHIFT;
-        obd_flag         flags = create ? OBD_BRW_CREATE : 0;
-        int              err;
-
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lov_stripe_md *md = lli->lli_smd;
+        struct brw_page pg;
+        int err;
+        struct io_cb_data *cbd = ll_init_cb();
         ENTRY;
 
-        oa = ll_oa_from_inode(inode, OBD_MD_FLNOTOBD);
-       if (!oa) { 
-               return -ENOMEM;
-       }
-        err = obd_brw(rw, IID(inode), num_obdo, &oa, &bufs_per_obdo,
-                               &page, &count, &offset, &flags);
+        if (!cbd)
+                RETURN(-ENOMEM);
 
-        obdo_free(oa);
-        EXIT;
-        return err;
-} /* ll_brw */
-
-extern void set_page_clean(struct page *);
+        pg.pg = page;
+        pg.count = PAGE_SIZE;
+        pg.off = ((obd_off)page->index) << PAGE_SHIFT;
+        pg.flag = create ? OBD_BRW_CREATE : 0;
 
+        err = obd_brw(rw, ll_i2obdconn(inode), md, 1, &pg, ll_sync_io_cb, cbd);
 
+        RETURN(err);
+} /* ll_brw */
 
 /* returns the page unlocked, but with a reference */
-int ll_readpage(struct file *file, struct page *page)
+/* readpage entry of ll_aops.  LBUGs if @page is not locked on entry.  A page
+ * that starts at or beyond i_size is zero-filled instead of being read; a page
+ * that is already up to date is logged and left alone; anything else is read
+ * synchronously through ll_brw().  The page is marked up to date only when
+ * rc == 0 and is always unlocked before returning.  @file is unused.
+ * NOTE(review): the return value is 0 even when ll_brw() fails, so the
+ * failure is visible only as a page that is not up to date -- confirm that
+ * this is intended. */
+static int ll_readpage(struct file *file, struct page *page)
 {
-       struct inode *inode = page->mapping->host;
-        int rc;
-
+        struct inode *inode = page->mapping->host;
+        obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
+        int rc = 0;
         ENTRY;
 
-       if ( ((inode->i_size + PAGE_CACHE_SIZE -1)>>PAGE_SHIFT) 
-            <= page->index) {
-               memset(kmap(page), 0, PAGE_CACHE_SIZE);
-               kunmap(page);
-               goto readpage_out;
-       }
+        if (!PageLocked(page))
+                LBUG();
 
-       if (Page_Uptodate(page)) {
-               EXIT;
-               goto readpage_out;
-       }
+        if (inode->i_size <= offset) {
+                memset(kmap(page), 0, PAGE_SIZE);
+                kunmap(page);
+                GOTO(readpage_out, rc);
+        }
+
+        if (Page_Uptodate(page)) {
+                CERROR("Explain this please?\n");
+                GOTO(readpage_out, rc);
+        }
 
         rc = ll_brw(OBD_BRW_READ, inode, page, 0);
-        if ( rc ) {
-               EXIT; 
-               return rc;
-        } 
+        EXIT;
 
  readpage_out:
-       SetPageUptodate(page);
-       obd_unlock_page(page);
-        EXIT;
+        if (!rc)
+                SetPageUptodate(page);
+        UnlockPage(page);
         return 0;
 } /* ll_readpage */
 
 
-int ll_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
+/* prepare_write entry of ll_aops: gets @page ready for a write covering the
+ * byte range @from..@to within the page.  kmap()s the page and LBUGs if it is
+ * not locked, then:
+ *  - page already up to date: nothing to read;
+ *  - whole page overwritten (from == 0 && to == PAGE_SIZE): return 0 without
+ *    marking the page up to date;
+ *  - page starts at or beyond i_size: zero-fill it;
+ *  - otherwise: read the current contents synchronously via ll_brw().
+ * On the remaining paths the page is marked up to date when rc == 0, and rc is
+ * returned.  No path in this function calls kunmap(); the matching kunmap() is
+ * in ll_commit_write().  @file is unused. */
+static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
+                            unsigned to)
 {
         struct inode *inode = page->mapping->host;
         obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
         int rc = 0;
-        ENTRY; 
-        
-       kmap(page);
-        if (Page_Uptodate(page)) { 
-                EXIT;
-               goto prepare_done;
-        }
+        char *addr;
+        ENTRY;
+
+        addr = kmap(page);
+        if (!PageLocked(page))
+                LBUG();
+
+        if (Page_Uptodate(page))
+                GOTO(prepare_done, rc);
 
-        if ( (from <= offset) && (to >= offset + PAGE_SIZE) ) {
-                EXIT;
-                return 0;
+        /* We're completely overwriting an existing page, so _don't_ set it up
+         * to date until commit_write */
+        if (from == 0 && to == PAGE_SIZE)
+                RETURN(0);
+
+        /* We are writing to a new page, no need to read old data */
+        if (inode->i_size <= offset) {
+                memset(addr, 0, PAGE_SIZE);
+                goto prepare_done;
         }
-        
+
         rc = ll_brw(OBD_BRW_READ, inode, page, 0);
-        if ( !rc ) {
-                SetPageUptodate(page);
-        } 
 
- prepare_done:
-       set_page_dirty(page);
         EXIT;
+ prepare_done:
+        if (!rc)
+                SetPageUptodate(page);
+
         return rc;
 }
 
 /* returns the page unlocked, but with a reference */
-int ll_writepage(struct page *page)
+/* writepage entry of ll_aops.  LBUGs if @page is not locked, writes the whole
+ * page synchronously with ll_brw(OBD_BRW_WRITE, ..., create = 1), calls
+ * set_page_clean() on success or logs the error on failure, then unlocks the
+ * page and returns ll_brw()'s result.
+ * NOTE(review): this function unlocks with unlock_page() while ll_readpage()
+ * uses UnlockPage(); confirm both spellings exist on every kernel this file
+ * is built against. */
+static int ll_writepage(struct page *page)
 {
         struct inode *inode = page->mapping->host;
         int err;
         ENTRY;
 
-       err = ll_brw(OBD_BRW_WRITE, inode, page, 1);
+        if (!PageLocked(page))
+                LBUG();
+
+        err = ll_brw(OBD_BRW_WRITE, inode, page, 1);
         if ( !err ) {
-                SetPageUptodate(page);
-               set_page_clean(page);
-       } else {
-               printk(__FUNCTION__ ": ll_brw failure %d\n", err);
-       }
-        EXIT;
-       return err;
+                //SetPageUptodate(page);
+                set_page_clean(page);
+        } else {
+                CERROR("ll_brw failure %d\n", err);
+        }
+        unlock_page(page);
+        RETURN(err);
 }
 
-/* SYNCHRONOUS I/O to object storage for an inode -- object attr will be updated too */
-int ll_commit_write(struct file *file, struct page *page, 
-                   unsigned from, unsigned to)
+
+/* SYNCHRONOUS I/O to object storage for an inode -- the in-core i_size will be
+ * updated too.
+ *
+ * commit_write entry of ll_aops.  Marks @page up to date, LBUGs if it is not
+ * locked, then writes the first @to bytes of the page (pg.count = to, at the
+ * page's own file offset; @from is only logged) through obd_brw().  Afterwards
+ * the page is kunmap()ed and i_size is raised to the end of the written range
+ * if that lies beyond the current size.
+ * Returns -ENOMEM if ll_init_cb() fails, otherwise obd_brw()'s result.
+ * NOTE(review): the -ENOMEM return happens before kunmap() is reached.
+ * NOTE(review): i_size is raised even when obd_brw() reports an error --
+ * confirm that both are intended. */
+static int ll_commit_write(struct file *file, struct page *page,
+                           unsigned from, unsigned to)
 {
-       int create = 1;
-       struct inode *inode = page->mapping->host;
-        obd_count        num_obdo = 1;
-        obd_count        bufs_per_obdo = 1;
-        struct obdo     *oa;
-        obd_size         count = to;
-        obd_off          offset = (((obd_off)page->index) << PAGE_SHIFT) + to;
-        obd_flag         flags = create ? OBD_BRW_CREATE : 0;
-        int              err;
-       struct iattr     iattr;
+        int create = 1;
+        struct inode *inode = page->mapping->host;
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lov_stripe_md *md = lli->lli_smd;
+        struct brw_page pg;
+        int err;
+        loff_t size;
+        struct io_cb_data *cbd = ll_init_cb();
+
+        pg.pg = page;
+        pg.count = to;
+        pg.off = (((obd_off)page->index) << PAGE_SHIFT);
+        pg.flag = create ? OBD_BRW_CREATE : 0;
 
         ENTRY;
-        oa = ll_oa_from_inode(inode, OBD_MD_FLNOTOBD);
-       if (! oa ) { 
-               return -ENOMEM;
-       }
+        if (!cbd)
+                RETURN(-ENOMEM);
 
-       CDEBUG(D_INODE, "commit_page writing (at %d) to %d, count %Ld\n", 
-              from, to, count);
+        SetPageUptodate(page);
 
-        err = obd_brw(OBD_BRW_WRITE, IID(inode), num_obdo, &oa, &bufs_per_obdo,
-                     &page, &count, &offset, &flags);
-        if ( !err ) {
-                SetPageUptodate(page);
-               set_page_clean(page);
-       }
+        if (!PageLocked(page))
+                LBUG();
+
+        CDEBUG(D_INODE, "commit_page writing (at %d) to %d, count %Ld\n",
+               from, to, (unsigned long long)pg.count);
+
+        err = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode), md,
+                      1, &pg, ll_sync_io_cb, cbd);
         kunmap(page);
 
-       if (offset > inode->i_size) {
-               iattr.ia_valid = ATTR_SIZE;
-               iattr.ia_size = offset;
-               /* do NOT truncate */
-               err = ll_inode_setattr(inode, &iattr, 0);
-               if (err) {
-                       printk(__FUNCTION__ ": failed - %d.\n", err);
-                       obdo_free(oa);
-                       EXIT;
-                       return -EIO;
-               }
-       }
-
-        obdo_free(oa);
-        EXIT;
-        return err;
-} /* ll_brw */
+        size = pg.off + pg.count;
+        /* do NOT truncate when writing in the middle of a file */
+        if (size > inode->i_size)
+                inode->i_size = size;
+
+        RETURN(err);
+} /* ll_commit_write */
 
+/* Cuts the storage object behind @inode back to inode->i_size.  When the inode
+ * has no stripe metadata yet, only mtime/ctime are refreshed.  Otherwise an
+ * LCK_PW lock is taken through ll_size_lock() (LBUG on failure), obd_punch()
+ * is called for the range starting at i_size with an all-ones end, the
+ * attributes returned in the obdo are copied into the inode on success, and
+ * the lock is dropped with ll_size_unlock().  Failures of obd_punch() and
+ * ll_size_unlock() are only logged; the function returns nothing.
+ * NOTE(review): the early return for a missing md skips the EXIT that pairs
+ * with ENTRY. */
 void ll_truncate(struct inode *inode)
 {
-        struct obdo *oa;
+        struct obdo oa = {0};
+        struct lov_stripe_md *md = ll_i2info(inode)->lli_smd;
+        struct lustre_handle *lockhs = NULL;
         int err;
         ENTRY;
 
-       oa = ll_oa_from_inode(inode, OBD_MD_FLNOTOBD);
-        if ( !oa ) {
-                printk(__FUNCTION__ ": no memory to allocate obdo!\n");
-               return; 
-        } 
-       
-       CDEBUG(D_INFO, "calling punch for %ld (%Lu bytes at 0)\n",
-              (long)oa->o_id, oa->o_size);
-       err = obd_punch(IID(inode), oa, oa->o_size, 0);
-       obdo_free(oa);
+        if (!md) {
+                /* object not yet allocated */
+                inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+                return;
+        }
 
+        oa.o_id = md->lmd_object_id;
+        oa.o_size = inode->i_size;
+
+        CDEBUG(D_INFO, "calling punch for %ld (all bytes after %Ld)\n",
+               (long)oa.o_id, (unsigned long long)oa.o_size);
+
+        err = ll_size_lock(inode, md, oa.o_size, LCK_PW, &lockhs);
         if (err) {
-                printk(__FUNCTION__ ": obd_truncate fails (%d)\n", err);
+                CERROR("ll_size_lock failed: %d\n", err);
+                /* FIXME: What to do here?  It's too late to back out... */
+                LBUG();
         }
+
+        oa.o_valid = OBD_MD_FLID;
+        /* truncate == punch from the new size to the end of the object:
+           the end argument is set to -1 (all ones) for that. */
+        err = obd_punch(ll_i2obdconn(inode), &oa, md, inode->i_size,
+                        0xffffffffffffffff);
+        if (err)
+                CERROR("obd_truncate fails (%d)\n", err);
+        else
+                obdo_to_inode(inode, &oa, oa.o_valid);
+
+        err = ll_size_unlock(inode, md, LCK_PW, lockhs);
+        if (err)
+                CERROR("ll_size_unlock failed: %d\n", err);
+
         EXIT;
-       return; 
+        return;
 } /* ll_truncate */
 
+/* direct_IO entry of ll_aops: transfers the iobuf->nr_pages pages of @iobuf
+ * to or from the inode's storage object with one synchronous obd_brw() call.
+ * Page i is placed at file offset (blocknr + i) << PAGE_SHIFT.
+ *
+ * @rw:        WRITE selects OBD_BRW_WRITE, anything else OBD_BRW_READ
+ * @blocknr:   index of the first page-sized block of the transfer
+ * @blocksize: must equal PAGE_SIZE
+ *
+ * Returns the number of bytes covered (nr_pages * PAGE_SIZE) when obd_brw()
+ * returns 0, obd_brw()'s result when it is non-zero, -EINVAL for an
+ * unsupported blocksize, and -ENOMEM when an allocation fails or the inode has
+ * no usable stripe metadata (that last code is kept for compatibility with
+ * existing callers).
+ *
+ * All validation happens before anything is allocated, and the callback data
+ * is allocated last, so no error path leaves an allocation behind that was
+ * not handed to obd_brw() and OBD_FREE() is never given a NULL pointer.
+ */
+int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
+                 unsigned long blocknr, int blocksize)
+{
+        obd_count        bufs_per_obdo = iobuf->nr_pages;
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lov_stripe_md *md = lli->lli_smd;
+        struct brw_page *pga;
+        struct io_cb_data *cbd;
+        int              rc = 0;
+        int i;
+
+        ENTRY;
+        if (blocksize != PAGE_SIZE) {
+                CERROR("direct_IO blocksize != PAGE_SIZE\n");
+                RETURN(-EINVAL);
+        }
+
+        if (!md || !md->lmd_object_id)
+                RETURN(-ENOMEM);
+
+        OBD_ALLOC(pga, sizeof(*pga) * bufs_per_obdo);
+        if (!pga)
+                RETURN(-ENOMEM);
+
+        /* Allocated last: from here on the only way out is through obd_brw(),
+         * which receives cbd. */
+        cbd = ll_init_cb();
+        if (!cbd)
+                GOTO(out, rc = -ENOMEM);
+
+        /* NB: we can't use iobuf->maplist[i]->index for the offset
+         * instead of "blocknr" because ->index contains garbage.
+         */
+        for (i = 0; i < bufs_per_obdo; i++, blocknr++) {
+                pga[i].pg = iobuf->maplist[i];
+                pga[i].count = PAGE_SIZE;
+                pga[i].off = (obd_off)blocknr << PAGE_SHIFT;
+                pga[i].flag = OBD_BRW_CREATE;
+        }
+
+        rc = obd_brw(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
+                     ll_i2obdconn(inode), md, bufs_per_obdo, pga,
+                     ll_sync_io_cb, cbd);
+        if (rc == 0)
+                rc = bufs_per_obdo * PAGE_SIZE;
+
+out:
+        OBD_FREE(pga, sizeof(*pga) * bufs_per_obdo);
+        RETURN(rc);
+}
+
+
+/* Unfinished stub for writing out an inode's pages.  The obd_brw() call is
+ * compiled out with #if 0 and bufs_per_obdo stays 0, so at present the
+ * function only takes and immediately drops pagecache_lock, issues three
+ * zero-length OBD_ALLOC requests, frees them again, and returns 0 -- or
+ * -ENOMEM if any of the three pointers comes back NULL.
+ * NOTE(review): the disabled code uses `i` and `iobuf`, neither of which is
+ * declared in this function, and passes obd_brw() a different argument list
+ * from the live callers in this file; it needs rewriting before it is enabled.
+ * NOTE(review): on the -ENOMEM path OBD_FREE is applied to pointers that may
+ * be NULL -- confirm OBD_FREE tolerates that. */
+int ll_flush_inode_pages(struct inode * inode)
+{
+        obd_count        bufs_per_obdo = 0;
+        obd_size         *count = NULL;
+        obd_off          *offset = NULL;
+        obd_flag         *flags = NULL;
+        int              err = 0;
+
+        ENTRY;
+
+        spin_lock(&pagecache_lock);
+
+        spin_unlock(&pagecache_lock);
+
+
+        OBD_ALLOC(count, sizeof(*count) * bufs_per_obdo);
+        OBD_ALLOC(offset, sizeof(*offset) * bufs_per_obdo);
+        OBD_ALLOC(flags, sizeof(*flags) * bufs_per_obdo);
+        if (!count || !offset || !flags)
+                GOTO(out, err=-ENOMEM);
+
+#if 0
+        for (i = 0 ; i < bufs_per_obdo ; i++) {
+                count[i] = PAGE_SIZE;
+                offset[i] = ((obd_off)(iobuf->maplist[i])->index) << PAGE_SHIFT;
+                flags[i] = OBD_BRW_CREATE;
+        }
+
+        err = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode),
+                      ll_i2info(inode)->lli_smd, bufs_per_obdo,
+                      iobuf->maplist, count, offset, flags, NULL, NULL);
+        if (err == 0)
+                err = bufs_per_obdo * 4096;
+#endif
+ out:
+        OBD_FREE(flags, sizeof(*flags) * bufs_per_obdo);
+        OBD_FREE(count, sizeof(*count) * bufs_per_obdo);
+        OBD_FREE(offset, sizeof(*offset) * bufs_per_obdo);
+        RETURN(err);
+}
+
+
+
+/* Address-space operations table wiring up the page-cache hooks defined in
+ * this file.  direct_IO is only hooked up when LINUX_VERSION_CODE is greater
+ * than 2.4.17; sync_page uses block_sync_page and bmap is explicitly NULL.
+ * NOTE(review): ll_direct_IO itself is defined without the same version
+ * guard -- confirm it still builds on kernels where the hook is absent. */
 struct address_space_operations ll_aops = {
         readpage: ll_readpage,
         writepage: ll_writepage,
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,4,17))
+        direct_IO: ll_direct_IO,
+#endif
         sync_page: block_sync_page,
-        prepare_write: ll_prepare_write, 
+        prepare_write: ll_prepare_write,
         commit_write: ll_commit_write,
         bmap: NULL
 };
-