Whamcloud - gitweb
LU-12275 sec: deal with encrypted object size
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_io.c
index 0cffae6..961e10d 100644 (file)
@@ -666,10 +666,12 @@ static struct page *osd_get_page(const struct lu_env *env, struct dt_object *dt,
                page = find_or_create_page(inode->i_mapping,
                                           offset >> PAGE_SHIFT, gfp_mask);
 
-               if (likely(page))
-                       LASSERT(!test_bit(PG_private_2, &page->flags));
-               else
+               if (likely(page)) {
+                       LASSERT(!PagePrivate2(page));
+                       wait_on_page_writeback(page);
+               } else {
                        lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1);
+               }
 
                return page;
        }
@@ -678,34 +680,29 @@ static struct page *osd_get_page(const struct lu_env *env, struct dt_object *dt,
                /* consult with pagecache, but do not create new pages */
                /* this is normally used once */
                page = find_lock_page(inode->i_mapping, offset >> PAGE_SHIFT);
-               if (page)
+               if (page) {
+                       wait_on_page_writeback(page);
                        return page;
+               }
        }
 
        LASSERT(oti->oti_dio_pages);
        cur = oti->oti_dio_pages_used;
+       page = oti->oti_dio_pages[cur];
 
-       if (unlikely(!oti->oti_dio_pages[cur])) {
+       if (unlikely(!page)) {
                LASSERT(cur < PTLRPC_MAX_BRW_PAGES);
                page = alloc_page(gfp_mask);
                if (!page)
                        return NULL;
                oti->oti_dio_pages[cur] = page;
+               SetPagePrivate2(page);
+               lock_page(page);
        }
 
-       page = oti->oti_dio_pages[cur];
-       LASSERT(!test_bit(PG_private_2, &page->flags));
-       set_bit(PG_private_2, &page->flags);
-       oti->oti_dio_pages_used++;
-
-       LASSERT(!PageLocked(page));
-       lock_page(page);
-
-       LASSERT(!page->mapping);
-       LASSERT(!PageWriteback(page));
        ClearPageUptodate(page);
-
        page->index = offset >> PAGE_SHIFT;
+       oti->oti_dio_pages_used++;
 
        return page;
 }
@@ -758,11 +755,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
 
                /* if the page isn't cached, then reset uptodate
                 * to prevent reuse */
-               if (test_bit(PG_private_2, &page->flags)) {
-                       clear_bit(PG_private_2, &page->flags);
-                       ClearPageUptodate(page);
-                       if (lnb[i].lnb_locked)
-                               unlock_page(page);
+               if (PagePrivate2(page)) {
                        oti->oti_dio_pages_used--;
                } else {
                        if (lnb[i].lnb_locked)
@@ -770,7 +763,6 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
                        if (pagevec_add(&pvec, page) == 0)
                                pagevec_release(&pvec);
                }
-               dt_object_put(env, dt);
 
                lnb[i].lnb_page = NULL;
        }
@@ -865,8 +857,7 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
 
 bypass_checks:
        if (!cache && unlikely(!oti->oti_dio_pages)) {
-               OBD_ALLOC(oti->oti_dio_pages,
-                         sizeof(struct page *) * PTLRPC_MAX_BRW_PAGES);
+               OBD_ALLOC_PTR_ARRAY(oti->oti_dio_pages, PTLRPC_MAX_BRW_PAGES);
                if (!oti->oti_dio_pages)
                        return -ENOMEM;
        }
@@ -881,10 +872,6 @@ bypass_checks:
                        GOTO(cleanup, rc = -ENOMEM);
 
                lnb->lnb_locked = 1;
-               wait_on_page_writeback(lnb->lnb_page);
-               BUG_ON(PageWriteback(lnb->lnb_page));
-
-               lu_object_get(&dt->do_lu);
        }
 
 #if 0
@@ -1133,13 +1120,13 @@ static int osd_declare_write_commit(const struct lu_env *env,
        enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
        ENTRY;
 
-        LASSERT(handle != NULL);
-        oh = container_of0(handle, struct osd_thandle, ot_super);
-        LASSERT(oh->ot_handle == NULL);
+       LASSERT(handle != NULL);
+       oh = container_of(handle, struct osd_thandle, ot_super);
+       LASSERT(oh->ot_handle == NULL);
 
-        newblocks = npages;
+       newblocks = npages;
 
-        /* calculate number of extents (probably better to pass nb) */
+       /* calculate number of extents (probably better to pass nb) */
        for (i = 0; i < npages; i++) {
                if (i && lnb[i].lnb_file_offset !=
                    lnb[i - 1].lnb_file_offset + lnb[i - 1].lnb_len)
@@ -1234,26 +1221,31 @@ static int osd_declare_write_commit(const struct lu_env *env,
 
 /* Check if a block is allocated or not */
 static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
-                            struct niobuf_local *lnb, int npages,
-                            struct thandle *thandle)
+                           struct niobuf_local *lnb, int npages,
+                           struct thandle *thandle, __u64 user_size)
 {
-        struct osd_thread_info *oti = osd_oti_get(env);
-        struct osd_iobuf *iobuf = &oti->oti_iobuf;
-        struct inode *inode = osd_dt_obj(dt)->oo_inode;
-        struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
-        loff_t isize;
-        int rc = 0, i;
+       struct osd_thread_info *oti = osd_oti_get(env);
+       struct osd_iobuf *iobuf = &oti->oti_iobuf;
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
+       loff_t disk_size;
+       int rc = 0, i;
 
-        LASSERT(inode);
+       LASSERT(inode);
 
        rc = osd_init_iobuf(osd, iobuf, 1, npages);
        if (unlikely(rc != 0))
                RETURN(rc);
 
-       isize = i_size_read(inode);
+       disk_size = i_size_read(inode);
+       /* if disk_size is already bigger than specified user_size,
+        * ignore user_size
+        */
+       if (disk_size > user_size)
+               user_size = 0;
        dquot_initialize(inode);
 
-        for (i = 0; i < npages; i++) {
+       for (i = 0; i < npages; i++) {
                if (lnb[i].lnb_rc == -ENOSPC &&
                    (lnb[i].lnb_flags & OBD_BRW_MAPPED)) {
                        /* Allow the write to proceed if overwriting an
@@ -1273,8 +1265,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                LASSERT(PageLocked(lnb[i].lnb_page));
                LASSERT(!PageWriteback(lnb[i].lnb_page));
 
-               if (lnb[i].lnb_file_offset + lnb[i].lnb_len > isize)
-                       isize = lnb[i].lnb_file_offset + lnb[i].lnb_len;
+               if (lnb[i].lnb_file_offset + lnb[i].lnb_len > disk_size)
+                       disk_size = lnb[i].lnb_file_offset + lnb[i].lnb_len;
 
                /*
                 * Since write and truncate are serialized by oo_sem, even
@@ -1286,28 +1278,31 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                SetPageUptodate(lnb[i].lnb_page);
 
                osd_iobuf_add_page(iobuf, &lnb[i]);
-        }
+       }
+       /* if file has grown, take user_size into account */
+       if (user_size && disk_size > user_size)
+               disk_size = user_size;
 
        osd_trans_exec_op(env, thandle, OSD_OT_WRITE);
 
-        if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) {
-                rc = -ENOSPC;
-        } else if (iobuf->dr_npages > 0) {
+       if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) {
+               rc = -ENOSPC;
+       } else if (iobuf->dr_npages > 0) {
                rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages,
                                                 iobuf->dr_npages,
                                                 iobuf->dr_blocks, 1);
-        } else {
-                /* no pages to write, no transno is needed */
-                thandle->th_local = 1;
-        }
+       } else {
+               /* no pages to write, no transno is needed */
+               thandle->th_local = 1;
+       }
 
        if (likely(rc == 0)) {
                spin_lock(&inode->i_lock);
-               if (isize > i_size_read(inode)) {
-                       i_size_write(inode, isize);
-                       LDISKFS_I(inode)->i_disksize = isize;
+               if (disk_size > i_size_read(inode)) {
+                       i_size_write(inode, disk_size);
+                       LDISKFS_I(inode)->i_disksize = disk_size;
                        spin_unlock(&inode->i_lock);
-                       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+                       osd_dirty_inode(inode, I_DIRTY_DATASYNC);
                } else {
                        spin_unlock(&inode->i_lock);
                }
@@ -1326,9 +1321,11 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                for (i = 0; i < npages; i++) {
                        if (lnb[i].lnb_page == NULL)
                                continue;
-                       LASSERT(PageLocked(lnb[i].lnb_page));
-                       generic_error_remove_page(inode->i_mapping,
-                                                 lnb[i].lnb_page);
+                       if (!PagePrivate2(lnb[i].lnb_page)) {
+                               LASSERT(PageLocked(lnb[i].lnb_page));
+                               generic_error_remove_page(inode->i_mapping,
+                                                         lnb[i].lnb_page);
+                       }
                }
        }
 
@@ -1411,7 +1408,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
                /* early release to let others read data during the bulk */
                for (i = 0; i < iobuf->dr_npages; i++) {
                        LASSERT(PageLocked(iobuf->dr_pages[i]));
-                       unlock_page(iobuf->dr_pages[i]);
+                       if (!PagePrivate2(iobuf->dr_pages[i]))
+                               unlock_page(iobuf->dr_pages[i]);
                }
        }
 
@@ -1498,19 +1496,27 @@ int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs)
 static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
                        struct lu_buf *buf, loff_t *pos)
 {
-        struct inode *inode = osd_dt_obj(dt)->oo_inode;
-        int           rc;
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       int rc;
 
-        /* Read small symlink from inode body as we need to maintain correct
-         * on-disk symlinks for ldiskfs.
-         */
-        if (S_ISLNK(dt->do_lu.lo_header->loh_attr) &&
-            (buf->lb_len < sizeof(LDISKFS_I(inode)->i_data)))
-                rc = osd_ldiskfs_readlink(inode, buf->lb_buf, buf->lb_len);
-        else
-                rc = osd_ldiskfs_read(inode, buf->lb_buf, buf->lb_len, pos);
+       /* Read small symlink from inode body as we need to maintain correct
+        * on-disk symlinks for ldiskfs.
+        */
+       if (S_ISLNK(dt->do_lu.lo_header->loh_attr)) {
+               loff_t size = i_size_read(inode);
+
+               if (buf->lb_len < size)
+                       return -EOVERFLOW;
 
-        return rc;
+               if (size < sizeof(LDISKFS_I(inode)->i_data))
+                       rc = osd_ldiskfs_readlink(inode, buf->lb_buf, size);
+               else
+                       rc = osd_ldiskfs_read(inode, buf->lb_buf, size, pos);
+       } else {
+               rc = osd_ldiskfs_read(inode, buf->lb_buf, buf->lb_len, pos);
+       }
+
+       return rc;
 }
 
 static inline int osd_extents_enabled(struct super_block *sb,
@@ -1588,10 +1594,10 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
        ENTRY;
 
        LASSERT(buf != NULL);
-        LASSERT(handle != NULL);
+       LASSERT(handle != NULL);
 
-        oh = container_of0(handle, struct osd_thandle, ot_super);
-        LASSERT(oh->ot_handle == NULL);
+       oh = container_of(handle, struct osd_thandle, ot_super);
+       LASSERT(oh->ot_handle == NULL);
 
        size = buf->lb_len;
        bits = sb->s_blocksize_bits;
@@ -1676,7 +1682,7 @@ static int osd_ldiskfs_writelink(struct inode *inode, char *buffer, int buflen)
        LDISKFS_I(inode)->i_disksize = buflen;
        i_size_write(inode, buflen);
        spin_unlock(&inode->i_lock);
-       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+       osd_dirty_inode(inode, I_DIRTY_DATASYNC);
 
        return 0;
 }
@@ -1691,12 +1697,12 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
         loff_t              new_size  = i_size_read(inode);
         unsigned long       block;
         int                 blocksize = 1 << inode->i_blkbits;
+       struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
         int                 err = 0;
         int                 size;
         int                 boffs;
         int                 dirty_inode = 0;
-       struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
-       bool create, sparse;
+       bool create, sparse, sync = false;
 
        if (write_NUL) {
                /*
@@ -1708,13 +1714,14 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
                ++bufsize;
        }
 
+       dirty_inode = test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA, &ei->i_flags);
+
        /* sparse checking is racy, but sparse is very rare case, leave as is */
        sparse = (new_size > 0 && (inode->i_blocks >> (inode->i_blkbits - 9)) <
                  ((new_size - 1) >> inode->i_blkbits) + 1);
 
        while (bufsize > 0) {
                int credits = handle->h_buffer_credits;
-               bool sync;
                unsigned long last_block = (new_size == 0) ? 0 :
                                           (new_size - 1) >> inode->i_blkbits;
 
@@ -1751,8 +1758,10 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
                        bh = __ldiskfs_bread(handle, inode, block, flags);
                        create = true;
                } else {
-                       if (sync)
+                       if (sync) {
                                up(&ei->i_append_sem);
+                               sync = false;
+                       }
                        create = false;
                }
                if (IS_ERR_OR_NULL(bh)) {
@@ -1781,8 +1790,10 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
                         boffs, size, (unsigned long)bh->b_size);
                if (create) {
                        memset(bh->b_data, 0, bh->b_size);
-                       if (sync)
+                       if (sync) {
                                up(&ei->i_append_sem);
+                               sync = false;
+                       }
                }
                memcpy(bh->b_data + boffs, buf, size);
                err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
@@ -1795,8 +1806,11 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
                 bufsize -= size;
                 buf += size;
         }
-        if (bh)
-                brelse(bh);
+       if (sync)
+               up(&ei->i_append_sem);
+
+       if (bh)
+               brelse(bh);
 
        if (write_NUL)
                --new_size;
@@ -1805,14 +1819,14 @@ static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
                spin_lock(&inode->i_lock);
                if (new_size > i_size_read(inode))
                        i_size_write(inode, new_size);
-               if (i_size_read(inode) > LDISKFS_I(inode)->i_disksize) {
-                       LDISKFS_I(inode)->i_disksize = i_size_read(inode);
+               if (i_size_read(inode) > ei->i_disksize) {
+                       ei->i_disksize = i_size_read(inode);
                        dirty_inode = 1;
                }
                spin_unlock(&inode->i_lock);
-               if (dirty_inode)
-                       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
         }
+       if (dirty_inode)
+               osd_dirty_inode(inode, I_DIRTY_DATASYNC);
 
         if (err == 0)
                 *offs = offset;
@@ -1860,6 +1874,69 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
        return result;
 }
 
+static int osd_declare_fallocate(const struct lu_env *env,
+                                struct dt_object *dt, struct thandle *th)
+{
+       struct osd_thandle *oh;
+       struct inode *inode;
+       int rc;
+       ENTRY;
+
+       LASSERT(th);
+       oh = container_of(th, struct osd_thandle, ot_super);
+
+       osd_trans_declare_op(env, oh, OSD_OT_PREALLOC,
+                            osd_dto_credits_noquota[DTO_WRITE_BLOCK]);
+       inode = osd_dt_obj(dt)->oo_inode;
+       LASSERT(inode);
+
+       rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
+                                  i_projid_read(inode), 0, oh, osd_dt_obj(dt),
+                                  NULL, OSD_QID_BLK);
+       RETURN(rc);
+}
+
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+                        __u64 start, __u64 end, int mode, struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode *inode = obj->oo_inode;
+       int rc = 0;
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct dentry *dentry = &info->oti_obj_dentry;
+       struct file *file = &info->oti_file;
+
+       ENTRY;
+       /*
+        * Only standard preallocation is supported: mode == 0, optionally
+        * with FALLOC_FL_KEEP_SIZE. Any other mode bit is rejected.
+        */
+       if (mode & ~FALLOC_FL_KEEP_SIZE)
+               RETURN(-EOPNOTSUPP);
+
+       LASSERT(dt_object_exists(dt));
+       LASSERT(osd_invariant(obj));
+       LASSERT(inode != NULL);
+       dquot_initialize(inode);
+
+       LASSERT(th);
+
+       osd_trans_exec_op(env, th, OSD_OT_PREALLOC);
+
+       /*
+        * f_op->fallocate() takes a struct file, not an inode; fill in a stub
+        */
+       dentry->d_inode = inode;
+       dentry->d_sb = inode->i_sb;
+       file->f_path.dentry = dentry;
+       file->f_mapping = inode->i_mapping;
+       file->f_op = inode->i_fop;
+       file->f_inode = inode;
+       rc = file->f_op->fallocate(file, mode, start, end - start);
+
+       RETURN(rc);
+}
+
 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
                              __u64 start, __u64 end, struct thandle *th)
 {
@@ -1907,7 +1984,6 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt,
        bool grow = false;
        ENTRY;
 
-       LASSERT(end == OBD_OBJECT_EOF);
        LASSERT(dt_object_exists(dt));
        LASSERT(osd_invariant(obj));
        LASSERT(inode != NULL);
@@ -1942,10 +2018,12 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt,
                GOTO(out, rc);
        }
 
+       inode_lock(inode);
        /* add to orphan list to ensure truncate completion
         * if this transaction succeed. ldiskfs_truncate()
         * will take the inode out of the list */
        rc = ldiskfs_orphan_add(oh->ot_handle, inode);
+       inode_unlock(inode);
        if (rc != 0)
                GOTO(out, rc);
 
@@ -2080,6 +2158,8 @@ const struct dt_body_operations osd_body_ops = {
        .dbo_punch                      = osd_punch,
        .dbo_fiemap_get                 = osd_fiemap_get,
        .dbo_ladvise                    = osd_ladvise,
+       .dbo_declare_fallocate          = osd_declare_fallocate,
+       .dbo_fallocate                  = osd_fallocate,
 };
 
 /**
@@ -2124,13 +2204,14 @@ int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, bool shared)
        else
                down_write(&obj->oo_ext_idx_sem);
        al->tl_shared = shared;
+       lu_object_get(&obj->oo_dt.do_lu);
 
        list_add(&al->tl_list, &oh->ot_trunc_locks);
 
        return 0;
 }
 
-void osd_trunc_unlock_all(struct list_head *list)
+void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
 {
        struct osd_access_lock *al, *tmp;
        list_for_each_entry_safe(al, tmp, list, tl_list) {
@@ -2138,6 +2219,7 @@ void osd_trunc_unlock_all(struct list_head *list)
                        up_read(&al->tl_obj->oo_ext_idx_sem);
                else
                        up_write(&al->tl_obj->oo_ext_idx_sem);
+               osd_object_put(env, al->tl_obj);
                list_del(&al->tl_list);
                OBD_FREE_PTR(al);
        }
@@ -2160,7 +2242,9 @@ void osd_execute_truncate(struct osd_object *obj)
                return;
        }
 
+       inode_lock(inode);
        ldiskfs_truncate(inode);
+       inode_unlock(inode);
 
        /*
         * For a partial-page truncate, flush the page to disk immediately to