X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_io.c;h=eb6b2a0056bc24cf46e6f1ef722cf312050756a0;hb=refs%2Fchanges%2F77%2F4777%2F8;hp=7b479615ac7fabeb1b05b5b0bc1b2aabb22c79bc;hpb=81b8dc81c5fe85278656ab12dc84389aed54b244;p=fs%2Flustre-release.git diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 7b47961..eb6b2a0 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -29,7 +27,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Whamcloud, Inc. + * Copyright (c) 2012, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -51,11 +49,6 @@ /* prerequisite for linux/xattr.h */ #include -/* ext_depth() */ -#include -#include -#include - /* * struct OBD_{ALLOC,FREE}*() * OBD_FAIL_CHECK @@ -64,51 +57,73 @@ #include "osd_internal.h" +/* ext_depth() */ +#include + #ifndef HAVE_PAGE_CONSTANT #define mapping_cap_page_constant_write(mapping) 0 #define SetPageConstant(page) do {} while (0) #define ClearPageConstant(page) do {} while (0) #endif -#ifndef HAS_GENERIC_ERROR_REMOVE_PAGE -int generic_error_remove_page(struct address_space *mapping, struct page *page) +static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, + int rw, int line, int pages) { - if (mapping == NULL) - return -EINVAL; - - if (mapping != page->mapping) - return -EIO; - /* - * Only punch for normal data pages for now. - * Handling other types like directories would need more auditing. - */ - if (!S_ISREG(mapping->host->i_mode)) - return -EIO; + int blocks, i; - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - truncate_complete_page(mapping, page); - return 0; -} -#endif + LASSERTF(iobuf->dr_elapsed_valid == 0, + "iobuf %p, reqs %d, rw %d, line %d\n", iobuf, + cfs_atomic_read(&iobuf->dr_numreqs), iobuf->dr_rw, + iobuf->dr_init_at); + LASSERT(pages <= PTLRPC_MAX_BRW_PAGES); -static void osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf,int rw) -{ cfs_waitq_init(&iobuf->dr_wait); cfs_atomic_set(&iobuf->dr_numreqs, 0); - iobuf->dr_max_pages = PTLRPC_MAX_BRW_PAGES; iobuf->dr_npages = 0; iobuf->dr_error = 0; iobuf->dr_dev = d; iobuf->dr_frags = 0; iobuf->dr_elapsed = 0; /* must be counted before, so assert */ - LASSERT(iobuf->dr_elapsed_valid == 0); iobuf->dr_rw = rw; + iobuf->dr_init_at = line; + + blocks = pages * (PAGE_CACHE_SIZE >> osd_sb(d)->s_blocksize_bits); + if (iobuf->dr_bl_buf.lb_len >= blocks * sizeof(iobuf->dr_blocks[0])) { + LASSERT(iobuf->dr_pg_buf.lb_len >= + pages * sizeof(iobuf->dr_pages[0])); + return 0; + } + + /* start with 1MB for 4K blocks */ + i = 256; + while (i <= PTLRPC_MAX_BRW_PAGES && i < pages) + i <<= 1; + + CDEBUG(D_OTHER, "realloc %u for %u (%u) pages\n", + (unsigned)(pages * sizeof(iobuf->dr_pages[0])), i, pages); + pages = i; + blocks = pages * (PAGE_CACHE_SIZE >> osd_sb(d)->s_blocksize_bits); + iobuf->dr_max_pages = 0; + CDEBUG(D_OTHER, "realloc %u for %u blocks\n", + (unsigned)(blocks * sizeof(iobuf->dr_blocks[0])), blocks); + + lu_buf_realloc(&iobuf->dr_bl_buf, blocks * sizeof(iobuf->dr_blocks[0])); + iobuf->dr_blocks = iobuf->dr_bl_buf.lb_buf; + if (unlikely(iobuf->dr_blocks == NULL)) + return -ENOMEM; + + lu_buf_realloc(&iobuf->dr_pg_buf, pages * sizeof(iobuf->dr_pages[0])); + iobuf->dr_pages = iobuf->dr_pg_buf.lb_buf; + if (unlikely(iobuf->dr_pages == NULL)) + return -ENOMEM; + + iobuf->dr_max_pages = pages; + + return 0; } +#define osd_init_iobuf(dev, iobuf, rw, pages) \ + __osd_init_iobuf(dev, iobuf, rw, __LINE__, pages) static void osd_iobuf_add_page(struct osd_iobuf *iobuf, struct page *page) { @@ -132,6 +147,10 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf) } } +#ifndef REQ_WRITE /* pre-2.6.35 */ +#define __REQ_WRITE BIO_RW +#endif + #ifdef HAVE_BIO_ENDIO_2ARG #define DIO_RETURN(a) static void dio_complete_routine(struct bio *bio, int error) @@ -166,7 +185,7 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) } /* the check is outside of the cycle for performance reason -bzzz */ - if (!cfs_test_bit(BIO_RW, &bio->bi_rw)) { + if (!test_bit(__REQ_WRITE, &bio->bi_rw)) { bio_for_each_segment(bvl, bio, i) { if (likely(error == 0)) SetPageUptodate(bvl->bv_page); @@ -190,11 +209,19 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) if (error != 0 && iobuf->dr_error == 0) iobuf->dr_error = error; - if (cfs_atomic_dec_and_test(&iobuf->dr_numreqs)) { - iobuf->dr_elapsed = jiffies - iobuf->dr_start_time; - iobuf->dr_elapsed_valid = 1; - cfs_waitq_signal(&iobuf->dr_wait); - } + /* + * set dr_elapsed before dr_numreqs turns to 0, otherwise + * it's possible that service thread will see dr_numreqs + * is zero, but dr_elapsed is not set yet, leading to lost + * data in this processing and an assertion in a subsequent + * call to OSD. + */ + if (cfs_atomic_read(&iobuf->dr_numreqs) == 1) { + iobuf->dr_elapsed = jiffies - iobuf->dr_start_time; + iobuf->dr_elapsed_valid = 1; + } + if (cfs_atomic_dec_and_test(&iobuf->dr_numreqs)) + cfs_waitq_signal(&iobuf->dr_wait); /* Completed bios used to be chained off iobuf->dr_bios and freed in * filter_clear_dreq(). It was then possible to exhaust the biovec-256 @@ -251,7 +278,7 @@ static int can_be_merged(struct bio *bio, sector_t sector) static int osd_do_bio(struct osd_device *osd, struct inode *inode, struct osd_iobuf *iobuf) { - int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits; + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; struct page **pages = iobuf->dr_pages; int npages = iobuf->dr_npages; unsigned long *blocks = iobuf->dr_blocks; @@ -339,10 +366,10 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, osd_submit_bio(iobuf->dr_rw, bio); } - /* allocate new bio, limited by max BIO size, b=9945 */ - bio = bio_alloc(GFP_NOIO, max(BIO_MAX_PAGES, - (npages - page_idx) * - blocks_per_page)); + /* allocate new bio */ + bio = bio_alloc(GFP_NOIO, min(BIO_MAX_PAGES, + (npages - page_idx) * + blocks_per_page)); if (bio == NULL) { CERROR("Can't allocate bio %u*%u = %u pages\n", (npages - page_idx), blocks_per_page, @@ -353,6 +380,7 @@ static int osd_do_bio(struct osd_device *osd, struct inode *inode, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_sector = sector; + bio->bi_rw = (iobuf->dr_rw == 0) ? READ : WRITE; bio->bi_end_io = dio_complete_routine; bio->bi_private = iobuf; @@ -391,13 +419,13 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, *nrpages = 0; while (len > 0) { - int poff = offset & (CFS_PAGE_SIZE - 1); - int plen = CFS_PAGE_SIZE - poff; + int poff = offset & (PAGE_CACHE_SIZE - 1); + int plen = PAGE_CACHE_SIZE - poff; if (plen > len) plen = len; - lnb->offset = offset; - /* lnb->lnb_page_offset = poff; */ + lnb->lnb_file_offset = offset; + lnb->lnb_page_offset = poff; lnb->len = plen; /* lb->flags = rnb->flags; */ lnb->flags = 0; @@ -423,7 +451,7 @@ struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) LASSERT(inode); - page = find_or_create_page(inode->i_mapping, offset >> CFS_PAGE_SHIFT, + page = find_or_create_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, GFP_NOFS | __GFP_HIGHMEM); if (unlikely(page == NULL)) lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1); @@ -434,7 +462,6 @@ struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) /* * there are following "locks": * journal_start - * i_alloc_sem * i_mutex * page lock @@ -468,7 +495,7 @@ int osd_bufs_get(const struct lu_env *env, struct dt_object *d, loff_t pos, * needs to keep the pages all aligned properly. */ lnb->dentry = (void *) obj; - lnb->page = osd_get_page(d, lnb->offset, rw); + lnb->page = osd_get_page(d, lnb->lnb_file_offset, rw); if (lnb->page == NULL) GOTO(cleanup, rc = -ENOMEM); @@ -533,18 +560,20 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); - osd_init_iobuf(osd, iobuf, 0); + rc = osd_init_iobuf(osd, iobuf, 0, npages); + if (unlikely(rc != 0)) + RETURN(rc); - isize = i_size_read(inode); - maxidx = ((isize + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT) - 1; + isize = i_size_read(inode); + maxidx = ((isize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 1; if (osd->od_writethrough_cache) cache = 1; if (isize > osd->od_readcache_max_filesize) cache = 0; - cfs_gettimeofday(&start); - for (i = 0; i < npages; i++) { + do_gettimeofday(&start); + for (i = 0; i < npages; i++) { if (cache == 0) generic_error_remove_page(inode->i_mapping, @@ -557,7 +586,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, */ ClearPageUptodate(lnb[i].page); - if (lnb[i].len == CFS_PAGE_SIZE) + if (lnb[i].len == PAGE_CACHE_SIZE) continue; if (maxidx >= lnb[i].page->index) { @@ -566,26 +595,25 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, long off; char *p = kmap(lnb[i].page); - off = lnb[i].offset; + off = lnb[i].lnb_page_offset; + if (off) + memset(p, 0, off); + off = (lnb[i].lnb_page_offset + lnb[i].len) & + ~CFS_PAGE_MASK; if (off) - memset(p, 0, off); - off = lnb[i].offset + lnb[i].len; - off &= ~CFS_PAGE_MASK; - if (off) - memset(p + off, 0, CFS_PAGE_SIZE - off); + memset(p + off, 0, PAGE_CACHE_SIZE - off); kunmap(lnb[i].page); - } - } - cfs_gettimeofday(&end); - timediff = cfs_timeval_sub(&end, &start, NULL); - lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); + } + } + do_gettimeofday(&end); + timediff = cfs_timeval_sub(&end, &start, NULL); + lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); if (iobuf->dr_npages) { - rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, - oti->oti_created, - 0, NULL); + rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, + iobuf->dr_npages, + iobuf->dr_blocks, + 0, NULL); if (likely(rc == 0)) { rc = osd_do_bio(osd, inode, iobuf); /* do IO stats for preparation reads */ @@ -595,6 +623,31 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, RETURN(rc); } +/* Check if a block is allocated or not */ +static int osd_is_mapped(struct inode *inode, obd_size offset) +{ + sector_t (*fs_bmap)(struct address_space *, sector_t); + + fs_bmap = inode->i_mapping->a_ops->bmap; + + /* We can't know if we are overwriting or not */ + if (unlikely(fs_bmap == NULL)) + return 0; + + if (i_size_read(inode) == 0) + return 0; + + /* Beyond EOF, must not be mapped */ + if (((i_size_read(inode) - 1) >> inode->i_blkbits) < + (offset >> inode->i_blkbits)) + return 0; + + if (fs_bmap(inode->i_mapping, offset >> inode->i_blkbits) == 0) + return 0; + + return 1; +} + static int osd_declare_write_commit(const struct lu_env *env, struct dt_object *dt, struct niobuf_local *lnb, int npages, @@ -607,20 +660,41 @@ static int osd_declare_write_commit(const struct lu_env *env, int depth; int i; int newblocks; - int old; + int rc = 0; + int flags = 0; + bool ignore_quota = false; + long long quota_space = 0; + ENTRY; LASSERT(handle != NULL); oh = container_of0(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); - old = oh->ot_credits; newblocks = npages; /* calculate number of extents (probably better to pass nb) */ - for (i = 1; i < npages; i++) - if (lnb[i].offset != - lnb[i - 1].offset + lnb[i - 1].len) - extents++; + for (i = 0; i < npages; i++) { + if (i && lnb[i].lnb_file_offset != + lnb[i - 1].lnb_file_offset + lnb[i - 1].len) + extents++; + + if (!osd_is_mapped(inode, lnb[i].lnb_file_offset)) + quota_space += PAGE_CACHE_SIZE; + + /* ignore quota for the whole request if any page is from + * client cache or written by root. + * + * XXX once we drop the 1.8 client support, the checking + * for whether page is from cache can be simplified as: + * !(lnb[i].flags & OBD_BRW_SYNC) + * + * XXX we could handle this on per-lnb basis as done by + * grant. */ + if ((lnb[i].flags & OBD_BRW_NOQUOTA) || + (lnb[i].flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == + OBD_BRW_FROM_GRANT) + ignore_quota = true; + } /* * each extent can go into new leaf causing a split @@ -644,6 +718,12 @@ static int osd_declare_write_commit(const struct lu_env *env, oh->ot_credits += depth * extents; } + /* quota space for metadata blocks */ + quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd)); + + /* quota space should be reported in 1K blocks */ + quota_space = toqb(quota_space); + /* each new block can go in different group (bitmap + gd) */ /* we can't dirty more bitmap blocks than exist */ @@ -658,26 +738,25 @@ static int osd_declare_write_commit(const struct lu_env *env, else oh->ot_credits += newblocks; - RETURN(0); -} - -/* Check if a block is allocated or not */ -static int osd_is_mapped(struct inode *inode, obd_size offset) -{ - sector_t (*fs_bmap)(struct address_space *, sector_t); - - fs_bmap = inode->i_mapping->a_ops->bmap; + /* make sure the over quota flags were not set */ + lnb[0].flags &= ~(OBD_BRW_OVER_USRQUOTA | OBD_BRW_OVER_GRPQUOTA); - /* We can't know if we are overwriting or not */ - if (fs_bmap == NULL) - return 0; + rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, + quota_space, oh, true, true, &flags, + ignore_quota); - if (fs_bmap(inode->i_mapping, offset >> inode->i_blkbits) == 0) - return 0; + /* we need only to store the overquota flags in the first lnb for + * now, once we support multiple objects BRW, this code needs be + * revised. */ + if (flags & QUOTA_FL_OVER_USRQUOTA) + lnb[0].flags |= OBD_BRW_OVER_USRQUOTA; + if (flags & QUOTA_FL_OVER_GRPQUOTA) + lnb[0].flags |= OBD_BRW_OVER_GRPQUOTA; - return 1; + RETURN(rc); } +/* Check if a block is allocated or not */ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, struct niobuf_local *lnb, int npages, struct thandle *thandle) @@ -691,12 +770,16 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); - osd_init_iobuf(osd, iobuf, 1); - isize = i_size_read(inode); + rc = osd_init_iobuf(osd, iobuf, 1, npages); + if (unlikely(rc != 0)) + RETURN(rc); + + isize = i_size_read(inode); + ll_vfs_dq_init(inode); for (i = 0; i < npages; i++) { if (lnb[i].rc == -ENOSPC && - osd_is_mapped(inode, lnb[i].offset)) { + osd_is_mapped(inode, lnb[i].lnb_file_offset)) { /* Allow the write to proceed if overwriting an * existing block */ lnb[i].rc = 0; @@ -713,8 +796,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, LASSERT(PageLocked(lnb[i].page)); LASSERT(!PageWriteback(lnb[i].page)); - if (lnb[i].offset + lnb[i].len > isize) - isize = lnb[i].offset + lnb[i].len; + if (lnb[i].lnb_file_offset + lnb[i].len > isize) + isize = lnb[i].lnb_file_offset + lnb[i].len; /* * Since write and truncate are serialized by oo_sem, even @@ -732,10 +815,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, rc = -ENOSPC; } else if (iobuf->dr_npages > 0) { rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, - oti->oti_created, - 1, NULL); + iobuf->dr_npages, + iobuf->dr_blocks, + 1, NULL); } else { /* no pages to write, no transno is needed */ thandle->th_local = 1; @@ -745,7 +827,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, if (isize > i_size_read(inode)) { i_size_write(inode, isize); LDISKFS_I(inode)->i_disksize = isize; - inode->i_sb->s_op->dirty_inode(inode); + ll_dirty_inode(inode, I_DIRTY_DATASYNC); } rc = osd_do_bio(osd, inode, iobuf); @@ -779,24 +861,26 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); - osd_init_iobuf(osd, iobuf, 0); + rc = osd_init_iobuf(osd, iobuf, 0, npages); + if (unlikely(rc != 0)) + RETURN(rc); - if (osd->od_read_cache) - cache = 1; - if (i_size_read(inode) > osd->od_readcache_max_filesize) - cache = 0; + if (osd->od_read_cache) + cache = 1; + if (i_size_read(inode) > osd->od_readcache_max_filesize) + cache = 0; - cfs_gettimeofday(&start); - for (i = 0; i < npages; i++) { + do_gettimeofday(&start); + for (i = 0; i < npages; i++) { - if (i_size_read(inode) <= lnb[i].offset) + if (i_size_read(inode) <= lnb[i].lnb_file_offset) /* If there's no more data, abort early. * lnb->rc == 0, so it's easy to detect later. */ break; if (i_size_read(inode) < - lnb[i].offset + lnb[i].len - 1) - lnb[i].rc = i_size_read(inode) - lnb[i].offset; + lnb[i].lnb_file_offset + lnb[i].len - 1) + lnb[i].rc = i_size_read(inode) - lnb[i].lnb_file_offset; else lnb[i].rc = lnb[i].len; m += lnb[i].len; @@ -810,19 +894,18 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, LPROC_OSD_CACHE_MISS, 1); osd_iobuf_add_page(iobuf, lnb[i].page); } - if (cache == 0) - generic_error_remove_page(inode->i_mapping,lnb[i].page); - } - cfs_gettimeofday(&end); - timediff = cfs_timeval_sub(&end, &start, NULL); - lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); + if (cache == 0) + generic_error_remove_page(inode->i_mapping,lnb[i].page); + } + do_gettimeofday(&end); + timediff = cfs_timeval_sub(&end, &start, NULL); + lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); if (iobuf->dr_npages) { - rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, - oti->oti_created, - 0, NULL); + rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, + iobuf->dr_npages, + iobuf->dr_blocks, + 0, NULL); rc = osd_do_bio(osd, inode, iobuf); /* IO stats will be done in osd_bufs_put() */ @@ -862,20 +945,22 @@ int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs) int err; /* prevent reading after eof */ - cfs_spin_lock(&inode->i_lock); - if (i_size_read(inode) < *offs + size) { - size = i_size_read(inode) - *offs; - cfs_spin_unlock(&inode->i_lock); - if (size < 0) { - CDEBUG(D_EXT2, "size %llu is too short to read @%llu\n", - i_size_read(inode), *offs); - return -EBADR; - } else if (size == 0) { - return 0; - } - } else { - cfs_spin_unlock(&inode->i_lock); - } + spin_lock(&inode->i_lock); + if (i_size_read(inode) < *offs + size) { + loff_t diff = i_size_read(inode) - *offs; + spin_unlock(&inode->i_lock); + if (diff < 0) { + CDEBUG(D_EXT2, "size %llu is too short to read @%llu\n", + i_size_read(inode), *offs); + return -EBADR; + } else if (diff == 0) { + return 0; + } else { + size = diff; + } + } else { + spin_unlock(&inode->i_lock); + } blocksize = 1 << inode->i_blkbits; osize = size; @@ -915,7 +1000,7 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, * on-disk symlinks for ldiskfs. */ if (S_ISLNK(dt->do_lu.lo_header->loh_attr) && - (buf->lb_len <= sizeof(LDISKFS_I(inode)->i_data))) + (buf->lb_len < sizeof(LDISKFS_I(inode)->i_data))) rc = osd_ldiskfs_readlink(inode, buf->lb_buf, buf->lb_len); else rc = osd_ldiskfs_read(inode, buf->lb_buf, buf->lb_len, pos); @@ -929,50 +1014,48 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, { struct osd_thandle *oh; int credits; + struct inode *inode; + int rc; + ENTRY; LASSERT(handle != NULL); oh = container_of0(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); - /* XXX: size == 0 or INT_MAX indicating a catalog header update or - * llog write, see comment in mdd_declare_llog_record(). - * - * This hack will be removed with llog over OSD landing - */ - if (size == DECLARE_LLOG_REWRITE) - credits = 2; - else if (size == DECLARE_LLOG_WRITE) - credits = 6; - else - credits = osd_dto_credits_noquota[DTO_WRITE_BLOCK]; + credits = osd_dto_credits_noquota[DTO_WRITE_BLOCK]; - OSD_DECLARE_OP(oh, write); - oh->ot_credits += credits; + osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits); - if (osd_dt_obj(dt)->oo_inode == NULL) - return 0; + inode = osd_dt_obj(dt)->oo_inode; - osd_declare_qid(dt, oh, USRQUOTA, osd_dt_obj(dt)->oo_inode->i_uid, - osd_dt_obj(dt)->oo_inode); - osd_declare_qid(dt, oh, GRPQUOTA, osd_dt_obj(dt)->oo_inode->i_gid, - osd_dt_obj(dt)->oo_inode); - return 0; + /* we may declare write to non-exist llog */ + if (inode == NULL) + RETURN(0); + + /* dt_declare_write() is usually called for system objects, such + * as llog or last_rcvd files. We needn't enforce quota on those + * objects, so always set the lqi_space as 0. */ + rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh, + true, true, NULL, false); + RETURN(rc); } static int osd_ldiskfs_writelink(struct inode *inode, char *buffer, int buflen) { + /* LU-2634: clear the extent format for fast symlink */ + ldiskfs_clear_inode_flag(inode, LDISKFS_INODE_EXTENTS); memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen); LDISKFS_I(inode)->i_disksize = buflen; i_size_write(inode, buflen); - inode->i_sb->s_op->dirty_inode(inode); + ll_dirty_inode(inode, I_DIRTY_DATASYNC); return 0; } -static int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, - loff_t *offs, handle_t *handle) +int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, + int write_NUL, loff_t *offs, handle_t *handle) { struct buffer_head *bh = NULL; loff_t offset = *offs; @@ -984,6 +1067,15 @@ static int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, int boffs; int dirty_inode = 0; + if (write_NUL) { + /* + * long symlink write does not count the NUL terminator in + * bufsize, we write it, and the inode's file size does not + * count the NUL terminator as well. + */ + ((char *)buf)[bufsize] = '\0'; + ++bufsize; + } while (bufsize > 0) { if (bh != NULL) brelse(bh); @@ -1022,18 +1114,20 @@ static int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, if (bh) brelse(bh); + if (write_NUL) + --new_size; /* correct in-core and on-disk sizes */ if (new_size > i_size_read(inode)) { - cfs_spin_lock(&inode->i_lock); - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (i_size_read(inode) > LDISKFS_I(inode)->i_disksize) { - LDISKFS_I(inode)->i_disksize = i_size_read(inode); - dirty_inode = 1; - } - cfs_spin_unlock(&inode->i_lock); - if (dirty_inode) - inode->i_sb->s_op->dirty_inode(inode); + spin_lock(&inode->i_lock); + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (i_size_read(inode) > LDISKFS_I(inode)->i_disksize) { + LDISKFS_I(inode)->i_disksize = i_size_read(inode); + dirty_inode = 1; + } + spin_unlock(&inode->i_lock); + if (dirty_inode) + ll_dirty_inode(inode, I_DIRTY_DATASYNC); } if (err == 0) @@ -1046,12 +1140,10 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, struct thandle *handle, struct lustre_capa *capa, int ignore_quota) { - struct inode *inode = osd_dt_obj(dt)->oo_inode; - struct osd_thandle *oh; - ssize_t result; -#ifdef HAVE_QUOTA_SUPPORT - cfs_cap_t save = cfs_curproc_cap_pack(); -#endif + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct osd_thandle *oh; + ssize_t result; + int is_link; LASSERT(dt_object_exists(dt)); @@ -1059,31 +1151,26 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, return -EACCES; LASSERT(handle != NULL); + LASSERT(inode != NULL); + ll_vfs_dq_init(inode); /* XXX: don't check: one declared chunk can be used many times */ - /* OSD_EXEC_OP(handle, write); */ + /* osd_trans_exec_op(env, handle, OSD_OT_WRITE); */ oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle->h_transaction != NULL); -#ifdef HAVE_QUOTA_SUPPORT - if (ignore_quota) - cfs_cap_raise(CFS_CAP_SYS_RESOURCE); - else - cfs_cap_lower(CFS_CAP_SYS_RESOURCE); -#endif - /* Write small symlink to inode body as we need to maintain correct - * on-disk symlinks for ldiskfs. - */ - if (S_ISLNK(dt->do_lu.lo_header->loh_attr) && - (buf->lb_len < sizeof(LDISKFS_I(inode)->i_data))) - result = osd_ldiskfs_writelink(inode, buf->lb_buf, buf->lb_len); - else - result = osd_ldiskfs_write_record(inode, buf->lb_buf, - buf->lb_len, pos, - oh->ot_handle); -#ifdef HAVE_QUOTA_SUPPORT - cfs_curproc_cap_unpack(save); -#endif + /* Write small symlink to inode body as we need to maintain correct + * on-disk symlinks for ldiskfs. + * Note: the buf->lb_buf contains a NUL terminator while buf->lb_len + * does not count it in. + */ + is_link = S_ISLNK(dt->do_lu.lo_header->loh_attr); + if (is_link && (buf->lb_len < sizeof(LDISKFS_I(inode)->i_data))) + result = osd_ldiskfs_writelink(inode, buf->lb_buf, buf->lb_len); + else + result = osd_ldiskfs_write_record(inode, buf->lb_buf, + buf->lb_len, is_link, pos, + oh->ot_handle); if (result == 0) result = buf->lb_len; return result; @@ -1093,13 +1180,13 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, __u64 start, __u64 end, struct thandle *th) { struct osd_thandle *oh; + struct inode *inode; + int rc; ENTRY; LASSERT(th); oh = container_of(th, struct osd_thandle, ot_super); - OSD_DECLARE_OP(oh, punch); - /* * we don't need to reserve credits for whole truncate * it's not possible as truncate may need to free too many @@ -1108,10 +1195,15 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, * orphan list. if needed truncate will extend or restart * transaction */ - oh->ot_credits += osd_dto_credits_noquota[DTO_ATTR_SET_BASE]; - oh->ot_credits += 3; + osd_trans_declare_op(env, oh, OSD_OT_PUNCH, + osd_dto_credits_noquota[DTO_ATTR_SET_BASE] + 3); - RETURN(0); + inode = osd_dt_obj(dt)->oo_inode; + LASSERT(inode); + + rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh, + true, true, NULL, false); + RETURN(rc); } static int osd_punch(const struct lu_env *env, struct dt_object *dt, @@ -1123,28 +1215,35 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, struct inode *inode = obj->oo_inode; handle_t *h; tid_t tid; - int rc, rc2 = 0; + loff_t oldsize; + int rc = 0, rc2 = 0; ENTRY; LASSERT(end == OBD_OBJECT_EOF); LASSERT(dt_object_exists(dt)); LASSERT(osd_invariant(obj)); + LASSERT(inode != NULL); + ll_vfs_dq_init(inode); LASSERT(th); oh = container_of(th, struct osd_thandle, ot_super); LASSERT(oh->ot_handle->h_transaction != NULL); - OSD_EXEC_OP(th, punch); + osd_trans_exec_op(env, th, OSD_OT_PUNCH); tid = oh->ot_handle->h_transaction->t_tid; - rc = vmtruncate(inode, start); + oldsize=inode->i_size; + i_size_write(inode, start); + truncate_pagecache(inode, oldsize, start); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); /* * For a partial-page truncate, flush the page to disk immediately to * avoid data corruption during direct disk write. b=17397 */ - if (rc == 0 && (start & ~CFS_PAGE_MASK) != 0) + if ((start & ~CFS_PAGE_MASK) != 0) rc = filemap_fdatawrite_range(inode->i_mapping, start, start+1); h = journal_current_handle(); @@ -1178,6 +1277,7 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; file->f_dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = inode->i_fop; @@ -1212,8 +1312,8 @@ const struct dt_body_operations osd_body_ops = { .dbo_declare_write_commit = osd_declare_write_commit, .dbo_write_commit = osd_write_commit, .dbo_read_prep = osd_read_prep, - .do_declare_punch = osd_declare_punch, - .do_punch = osd_punch, + .dbo_declare_punch = osd_declare_punch, + .dbo_punch = osd_punch, .dbo_fiemap_get = osd_fiemap_get, };