LU-15117 ofd: don't take lock for dt_bufs_get()
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_io.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * lustre/osd/osd_io.c
32  *
33  * body operations
34  *
35  * Author: Nikita Danilov <nikita@clusterfs.com>
36  * Author: Alex Zhuravlev <bzzz@whamcloud.com>
37  *
38  */
39
40 #define DEBUG_SUBSYSTEM S_OSD
41
42 /* prerequisite for linux/xattr.h */
43 #include <linux/types.h>
44 /* prerequisite for linux/xattr.h */
45 #include <linux/fs.h>
46 #include <linux/mm.h>
47 #include <linux/swap.h>
48 #include <linux/pagevec.h>
49
50 /*
51  * struct OBD_{ALLOC,FREE}*()
52  * OBD_FAIL_CHECK
53  */
54 #include <obd_support.h>
55
56 #include "osd_internal.h"
57
58 /* ext_depth() */
59 #include <ldiskfs/ldiskfs_extents.h>
60 #include <ldiskfs/ldiskfs.h>
61
62 static inline bool osd_use_page_cache(struct osd_device *d)
63 {
64         /* do not use pagecache if write and read caching are disabled */
65         if (d->od_writethrough_cache + d->od_read_cache == 0)
66                 return false;
67         /* use pagecache by default */
68         return true;
69 }
70
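/*
 * Reset @iobuf for a new request of @pages pages; if the preallocated
 * page/block/lnb arrays are too small, grow them (doubling from 256
 * pages) so they can hold the whole request.
 */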
71 static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf,
72                             int rw, int line, int pages)
73 {
74         int blocks, i;
75
76         LASSERTF(iobuf->dr_elapsed_valid == 0,
77                  "iobuf %p, reqs %d, rw %d, line %d\n", iobuf,
78                  atomic_read(&iobuf->dr_numreqs), iobuf->dr_rw,
79                  iobuf->dr_init_at);
80         LASSERT(pages <= PTLRPC_MAX_BRW_PAGES);
81
82         init_waitqueue_head(&iobuf->dr_wait);
83         atomic_set(&iobuf->dr_numreqs, 0);
84         iobuf->dr_npages = 0;
85         iobuf->dr_error = 0;
86         iobuf->dr_dev = d;
87         iobuf->dr_frags = 0;
88         iobuf->dr_elapsed = ktime_set(0, 0);
89         /* must be counted before, so assert */
90         iobuf->dr_rw = rw;
91         iobuf->dr_init_at = line;
92
93         blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits);
94         if (iobuf->dr_bl_buf.lb_len >= blocks * sizeof(iobuf->dr_blocks[0])) {
95                 LASSERT(iobuf->dr_pg_buf.lb_len >=
96                         pages * sizeof(iobuf->dr_pages[0]));
97                 return 0;
98         }
99
100         /* start with 1MB for 4K blocks */
101         i = 256;
102         while (i <= PTLRPC_MAX_BRW_PAGES && i < pages)
103                 i <<= 1;
104
105         CDEBUG(D_OTHER, "realloc %u for %u (%u) pages\n",
106                (unsigned int)(pages * sizeof(iobuf->dr_pages[0])), i, pages);
107         pages = i;
108         blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits);
109         iobuf->dr_max_pages = 0;
110         CDEBUG(D_OTHER, "realloc %u for %u blocks\n",
111                (unsigned int)(blocks * sizeof(iobuf->dr_blocks[0])), blocks);
112
113         lu_buf_realloc(&iobuf->dr_bl_buf, blocks * sizeof(iobuf->dr_blocks[0]));
114         iobuf->dr_blocks = iobuf->dr_bl_buf.lb_buf;
115         if (unlikely(iobuf->dr_blocks == NULL))
116                 return -ENOMEM;
117
118         lu_buf_realloc(&iobuf->dr_pg_buf, pages * sizeof(iobuf->dr_pages[0]));
119         iobuf->dr_pages = iobuf->dr_pg_buf.lb_buf;
120         if (unlikely(iobuf->dr_pages == NULL))
121                 return -ENOMEM;
122
123         lu_buf_realloc(&iobuf->dr_lnb_buf,
124                        pages * sizeof(iobuf->dr_lnbs[0]));
125         iobuf->dr_lnbs = iobuf->dr_lnb_buf.lb_buf;
126         if (unlikely(iobuf->dr_lnbs == NULL))
127                 return -ENOMEM;
128
129         iobuf->dr_max_pages = pages;
130
131         return 0;
132 }
133 #define osd_init_iobuf(dev, iobuf, rw, pages) \
134         __osd_init_iobuf(dev, iobuf, rw, __LINE__, pages)
135
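/* append one page and its niobuf_local descriptor to the iobuf arrays */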
136 static void osd_iobuf_add_page(struct osd_iobuf *iobuf,
137                                struct niobuf_local *lnb)
138 {
139         LASSERT(iobuf->dr_npages < iobuf->dr_max_pages);
140         iobuf->dr_pages[iobuf->dr_npages] = lnb->lnb_page;
141         iobuf->dr_lnbs[iobuf->dr_npages] = lnb;
142         iobuf->dr_npages++;
143 }
144
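/*
 * Record per-device brw stats (number of DIO fragments and total I/O
 * time) once all bios of this iobuf have completed.
 */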
145 void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
146 {
147         int rw = iobuf->dr_rw;
148
149         if (iobuf->dr_elapsed_valid) {
150                 struct brw_stats *h = &d->od_brw_stats;
151
152                 iobuf->dr_elapsed_valid = 0;
153                 LASSERT(iobuf->dr_dev == d);
154                 LASSERT(iobuf->dr_frags > 0);
155                 lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_DIO_FRAGS+rw],
156                                       iobuf->dr_frags);
157                 lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_IO_TIME+rw],
158                                            ktime_to_ms(iobuf->dr_elapsed));
159         }
160 }
161
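/*
 * Per-bio completion callback: mark read pages uptodate, latch the
 * first error in the iobuf, and wake up the waiter once the last
 * in-flight bio for this iobuf has completed.
 */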
162 #ifdef HAVE_BIO_ENDIO_USES_ONE_ARG
163 static void dio_complete_routine(struct bio *bio)
164 {
165         int error = blk_status_to_errno(bio->bi_status);
166 #else
167 static void dio_complete_routine(struct bio *bio, int error)
168 {
169 #endif
170         struct osd_iobuf *iobuf = bio->bi_private;
171         struct bio_vec *bvl;
172
173         /* CAVEAT EMPTOR: possibly in IRQ context
174          * DO NOT record procfs stats here!!!
175          */
176
177         if (unlikely(iobuf == NULL)) {
178                 CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/>; you will probably have to reboot this node.\n");
179                 CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf)
180                        ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
181                        bio->bi_next, (unsigned long)bio->bi_flags,
182                        (unsigned int)bio->bi_opf, bio->bi_vcnt, bio_idx(bio),
183                        bio_sectors(bio) << 9, bio->bi_end_io,
184                        atomic_read(&bio->__bi_cnt),
185                        bio->bi_private);
186                 return;
187         }
188
189         /* the check is outside of the loop for performance reasons -bzzz */
190         if (!bio_data_dir(bio)) {
191                 DECLARE_BVEC_ITER_ALL(iter_all);
192
193                 bio_for_each_segment_all(bvl, bio, iter_all) {
194                         if (likely(error == 0))
195                                 SetPageUptodate(bvl_to_page(bvl));
196                         LASSERT(PageLocked(bvl_to_page(bvl)));
197                 }
198                 atomic_dec(&iobuf->dr_dev->od_r_in_flight);
199         } else {
200                 atomic_dec(&iobuf->dr_dev->od_w_in_flight);
201         }
202
203         /* any real error is good enough -bzzz */
204         if (error != 0 && iobuf->dr_error == 0)
205                 iobuf->dr_error = error;
206
207         /*
208          * set dr_elapsed before dr_numreqs turns to 0, otherwise
209          * it's possible that service thread will see dr_numreqs
210          * is zero, but dr_elapsed is not set yet, leading to lost
211          * data in this processing and an assertion in a subsequent
212          * call to OSD.
213          */
214         if (atomic_read(&iobuf->dr_numreqs) == 1) {
215                 ktime_t now = ktime_get();
216
217                 iobuf->dr_elapsed = ktime_sub(now, iobuf->dr_start_time);
218                 iobuf->dr_elapsed_valid = 1;
219         }
220         if (atomic_dec_and_test(&iobuf->dr_numreqs))
221                 wake_up(&iobuf->dr_wait);
222
223         /* Completed bios used to be chained off iobuf->dr_bios and freed in
224          * filter_clear_dreq().  It was then possible to exhaust the biovec-256
225          * mempool when serious on-disk fragmentation was encountered,
226          * deadlocking the OST.  The bios are now released as soon as complete
227          * so the pool cannot be exhausted while IOs are competing. b=10076
228          */
229         bio_put(bio);
230 }
231
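/*
 * Account one more in-flight bio for this iobuf and update the
 * per-device in-flight and disk I/O size histograms.
 */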
232 static void record_start_io(struct osd_iobuf *iobuf, int size)
233 {
234         struct osd_device *osd = iobuf->dr_dev;
235         struct brw_stats *h = &osd->od_brw_stats;
236
237         iobuf->dr_frags++;
238         atomic_inc(&iobuf->dr_numreqs);
239
240         if (iobuf->dr_rw == 0) {
241                 atomic_inc(&osd->od_r_in_flight);
242                 lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_RPC_HIST],
243                                  atomic_read(&osd->od_r_in_flight));
244                 lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_DISK_IOSIZE],
245                                            size);
246         } else if (iobuf->dr_rw == 1) {
247                 atomic_inc(&osd->od_w_in_flight);
248                 lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_RPC_HIST],
249                                  atomic_read(&osd->od_w_in_flight));
250                 lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_W_DISK_IOSIZE],
251                                            size);
252         } else {
253                 LBUG();
254         }
255 }
256
257 static void osd_submit_bio(int rw, struct bio *bio)
258 {
259         LASSERTF(rw == 0 || rw == 1, "%x\n", rw);
260 #ifdef HAVE_SUBMIT_BIO_2ARGS
261         submit_bio(rw ? WRITE : READ, bio);
262 #else
263         bio->bi_opf |= rw;
264         submit_bio(bio);
265 #endif
266 }
267
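/* return 1 if @sector immediately follows the last sector of @bio */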
268 static int can_be_merged(struct bio *bio, sector_t sector)
269 {
270         if (bio == NULL)
271                 return 0;
272
273         return bio_end_sector(bio) == sector ? 1 : 0;
274 }
275
276 #if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)
277 /*
278  * This function will change the data written, thus it should only be
279  * used when checking the data integrity feature
280  */
281 static void bio_integrity_fault_inject(struct bio *bio)
282 {
283         struct bio_vec *bvec;
284         DECLARE_BVEC_ITER_ALL(iter_all);
285         void *kaddr;
286         char *addr;
287
288         bio_for_each_segment_all(bvec, bio, iter_all) {
289                 struct page *page = bvec->bv_page;
290
291                 kaddr = kmap(page);
292                 addr = kaddr;
293                 *addr = ~(*addr);
294                 kunmap(page);
295                 break;
296         }
297 }
298
299 static int bio_dif_compare(__u16 *expected_guard_buf, void *bio_prot_buf,
300                            unsigned int sectors, int tuple_size)
301 {
302         __u16 *expected_guard;
303         __u16 *bio_guard;
304         int i;
305
306         expected_guard = expected_guard_buf;
307         for (i = 0; i < sectors; i++) {
308                 bio_guard = (__u16 *)bio_prot_buf;
309                 if (*bio_guard != *expected_guard) {
310                         CERROR(
311                                "unexpected guard tags on sector %d: expected guard %u, bio guard %u, sectors %u, tuple size %d\n",
312                                i, *expected_guard, *bio_guard, sectors,
313                                tuple_size);
314                         return -EIO;
315                 }
316                 expected_guard++;
317                 bio_prot_buf += tuple_size;
318         }
319         return 0;
320 }
321
322 static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
323                                      struct osd_iobuf *iobuf, int index)
324 {
325         struct blk_integrity *bi = bdev_get_integrity(bdev);
326         struct bio_integrity_payload *bip = bio->bi_integrity;
327         struct niobuf_local *lnb = NULL;
328         unsigned short sector_size = blk_integrity_interval(bi);
329         void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
330                 bip->bip_vec->bv_offset;
331         struct bio_vec *bv;
332         sector_t sector = bio_start_sector(bio);
333         unsigned int i, sectors, total;
334         DECLARE_BVEC_ITER_ALL(iter_all);
335         __u16 *expected_guard;
336         int rc;
337
338         total = 0;
339         bio_for_each_segment_all(bv, bio, iter_all) {
340                 for (i = index; i < iobuf->dr_npages; i++) {
341                         if (iobuf->dr_pages[i] == bv->bv_page) {
342                                 lnb = iobuf->dr_lnbs[i];
343                                 break;
344                         }
345                 }
346                 if (!lnb)
347                         continue;
348                 expected_guard = lnb->lnb_guards;
349                 sectors = bv->bv_len / sector_size;
350                 if (lnb->lnb_guard_rpc) {
351                         rc = bio_dif_compare(expected_guard, bio_prot_buf,
352                                              sectors, bi->tuple_size);
353                         if (rc)
354                                 return rc;
355                 }
356
357                 sector += sectors;
358                 bio_prot_buf += sectors * bi->tuple_size;
359                 total += sectors * bi->tuple_size;
360                 LASSERT(total <= bip_size(bio->bi_integrity));
361                 index++;
362                 lnb = NULL;
363         }
364         return 0;
365 }
366
367 static int osd_bio_integrity_handle(struct osd_device *osd, struct bio *bio,
368                                     struct osd_iobuf *iobuf,
369                                     int start_page_idx, bool fault_inject,
370                                     bool integrity_enabled)
371 {
372         struct super_block *sb = osd_sb(osd);
373         integrity_gen_fn *generate_fn = NULL;
374         integrity_vrfy_fn *verify_fn = NULL;
375         int rc;
376
377         ENTRY;
378
379         if (!integrity_enabled)
380                 RETURN(0);
381
382         rc = osd_get_integrity_profile(osd, &generate_fn, &verify_fn);
383         if (rc)
384                 RETURN(rc);
385
386         rc = bio_integrity_prep_fn(bio, generate_fn, verify_fn);
387         if (rc)
388                 RETURN(rc);
389
390         /* Verify and inject fault only when writing */
391         if (iobuf->dr_rw == 1) {
392                 if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_OST_INTEGRITY_CMP))) {
393                         rc = osd_bio_integrity_compare(bio, sb->s_bdev, iobuf,
394                                                        start_page_idx);
395                         if (rc)
396                                 RETURN(rc);
397                 }
398
399                 if (unlikely(fault_inject))
400                         bio_integrity_fault_inject(bio);
401         }
402
403         RETURN(0);
404 }
405
406 #ifdef HAVE_BIO_INTEGRITY_PREP_FN
407 #  ifdef HAVE_BIO_ENDIO_USES_ONE_ARG
408 static void dio_integrity_complete_routine(struct bio *bio)
409 #  else
410 static void dio_integrity_complete_routine(struct bio *bio, int error)
411 #  endif
412 {
413         struct osd_bio_private *bio_private = bio->bi_private;
414
415         bio->bi_private = bio_private->obp_iobuf;
416         osd_dio_complete_routine(bio, error);
417
418         OBD_FREE_PTR(bio_private);
419 }
420 #endif /* HAVE_BIO_INTEGRITY_PREP_FN */
421 #else  /* !CONFIG_BLK_DEV_INTEGRITY */
422 #define osd_bio_integrity_handle(osd, bio, iobuf, start_page_idx, \
423                                  fault_inject, integrity_enabled) 0
424 #endif /* CONFIG_BLK_DEV_INTEGRITY */
425
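/*
 * Set up the completion callback and private data of @bio; when
 * integrity is enabled an osd_bio_private is allocated so the callback
 * can recover the iobuf and the starting page index.
 */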
426 static int osd_bio_init(struct bio *bio, struct osd_iobuf *iobuf,
427                         bool integrity_enabled, int start_page_idx,
428                         struct osd_bio_private **pprivate)
429 {
430         ENTRY;
431
432         *pprivate = NULL;
433
434 #ifdef HAVE_BIO_INTEGRITY_PREP_FN
435         if (integrity_enabled) {
436                 struct osd_bio_private *bio_private = NULL;
437
438                 OBD_ALLOC_GFP(bio_private, sizeof(*bio_private), GFP_NOIO);
439                 if (bio_private == NULL)
440                         RETURN(-ENOMEM);
441                 bio->bi_end_io = dio_integrity_complete_routine;
442                 bio->bi_private = bio_private;
443                 bio_private->obp_start_page_idx = start_page_idx;
444                 bio_private->obp_iobuf = iobuf;
445                 *pprivate = bio_private;
446         } else
447 #endif
448         {
449                 bio->bi_end_io = dio_complete_routine;
450                 bio->bi_private = iobuf;
451         }
452
453         RETURN(0);
454 }
455
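/*
 * Mark the lnbs whose pages were fully covered by this I/O with
 * OBD_BRW_DONE so they are filtered out if the transaction is
 * restarted (see osd_declare_write_commit()).
 */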
456 static void osd_mark_page_io_done(struct osd_iobuf *iobuf,
457                                   struct inode *inode,
458                                   sector_t start_blocks,
459                                   sector_t count)
460 {
461         struct niobuf_local *lnb;
462         int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
463         pgoff_t pg_start, pg_end;
464
465         pg_start = start_blocks / blocks_per_page;
466         if (start_blocks % blocks_per_page)
467                 pg_start++;
468         if (count >= blocks_per_page)
469                 pg_end = (start_blocks + count -
470                           blocks_per_page) / blocks_per_page;
471         else
472                 return; /* nothing to mark */
473         for ( ; pg_start <= pg_end; pg_start++) {
474                 lnb = iobuf->dr_lnbs[pg_start];
475                 lnb->lnb_flags |= OBD_BRW_DONE;
476         }
477 }
478
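/*
 * Build and submit bios for the blocks already mapped in @iobuf,
 * merging physically contiguous blocks and zero-filling holes on read.
 * Only reads (and fault-injected writes) are waited for here; write
 * completion is handled later in osd_trans_stop().
 */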
479 static int osd_do_bio(struct osd_device *osd, struct inode *inode,
480                       struct osd_iobuf *iobuf, sector_t start_blocks,
481                       sector_t count)
482 {
483         int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
484         struct page **pages = iobuf->dr_pages;
485         int npages = iobuf->dr_npages;
486         sector_t *blocks = iobuf->dr_blocks;
487         struct super_block *sb = inode->i_sb;
488         int sector_bits = sb->s_blocksize_bits - 9;
489         unsigned int blocksize = sb->s_blocksize;
490         struct block_device *bdev = sb->s_bdev;
491         struct osd_bio_private *bio_private = NULL;
492         struct bio *bio = NULL;
493         int bio_start_page_idx;
494         struct page *page;
495         unsigned int page_offset;
496         sector_t sector;
497         int nblocks;
498         int block_idx, block_idx_end;
499         int page_idx, page_idx_start;
500         int i;
501         int rc = 0;
502         bool fault_inject;
503         bool integrity_enabled;
504         struct blk_plug plug;
505         int blocks_left_page;
506
507         ENTRY;
508
509         fault_inject = OBD_FAIL_CHECK(OBD_FAIL_OST_INTEGRITY_FAULT);
510         LASSERT(iobuf->dr_npages == npages);
511
512         integrity_enabled = bdev_integrity_enabled(bdev, iobuf->dr_rw);
513
514         osd_brw_stats_update(osd, iobuf);
515         iobuf->dr_start_time = ktime_get();
516
517         if (!count)
518                 count = npages * blocks_per_page;
519         block_idx_end = start_blocks + count;
520
521         blk_start_plug(&plug);
522
523         page_idx_start = start_blocks / blocks_per_page;
524         for (page_idx = page_idx_start, block_idx = start_blocks;
525              block_idx < block_idx_end; page_idx++,
526              block_idx += blocks_left_page) {
527                 /* For cases where the filesystem's blocksize is not the
528                  * same as PAGE_SIZE (e.g. ARM with PAGE_SIZE=64KB and
529                  * blocksize=4KB), there will be multiple blocks to
530                  * read/write per page. Also, the start and end block may
531                  * not be aligned to the start and end of the page, so the
532                  * first page may skip some blocks at the start ("i != 0",
533                  * "blocks_left_page" is reduced), and the last page may
534                  * skip some blocks at the end (limited by "count").
535                  */
536                 page = pages[page_idx];
537                 LASSERT(page_idx < iobuf->dr_npages);
538
539                 i = block_idx % blocks_per_page;
540                 blocks_left_page = blocks_per_page - i;
541                 if (block_idx + blocks_left_page > block_idx_end)
542                         blocks_left_page = block_idx_end - block_idx;
543                 page_offset = i * blocksize;
544                 for (i = 0; i < blocks_left_page;
545                      i += nblocks, page_offset += blocksize * nblocks) {
546                         nblocks = 1;
547
548                         if (blocks[block_idx + i] == 0) {  /* hole */
549                                 LASSERTF(iobuf->dr_rw == 0,
550                                          "page_idx %u, block_idx %u, i %u, "
551                                          "start_blocks: %llu, count: %llu, npages: %d\n",
552                                          page_idx, block_idx, i,
553                                          (unsigned long long)start_blocks,
554                                          (unsigned long long)count, npages);
555                                 memset(kmap(page) + page_offset, 0, blocksize);
556                                 kunmap(page);
557                                 continue;
558                         }
559
560                         sector = (sector_t)blocks[block_idx + i] << sector_bits;
561
562                         /* Additional contiguous file blocks? */
563                         while (i + nblocks < blocks_left_page &&
564                                (sector + (nblocks << sector_bits)) ==
565                                ((sector_t)blocks[block_idx + i + nblocks] <<
566                                  sector_bits))
567                                 nblocks++;
568
569                         if (bio && can_be_merged(bio, sector) &&
570                             bio_add_page(bio, page, blocksize * nblocks,
571                                          page_offset) != 0)
572                                 continue;       /* added this frag OK */
573
574                         if (bio != NULL) {
575                                 struct request_queue *q = bio_get_queue(bio);
576                                 unsigned int bi_size = bio_sectors(bio) << 9;
577
578                                 /* Dang! I have to fragment this I/O */
579                                 CDEBUG(D_INODE,
580                                        "bio++ sz %d vcnt %d(%d) sectors %d(%d) psg %d(%d)\n",
581                                        bi_size, bio->bi_vcnt, bio->bi_max_vecs,
582                                        bio_sectors(bio),
583                                        queue_max_sectors(q),
584                                        osd_bio_nr_segs(bio),
585                                        queue_max_segments(q));
586                                 rc = osd_bio_integrity_handle(osd, bio,
587                                         iobuf, bio_start_page_idx,
588                                         fault_inject, integrity_enabled);
589                                 if (rc) {
590                                         bio_put(bio);
591                                         goto out;
592                                 }
593
594                                 record_start_io(iobuf, bi_size);
595                                 osd_submit_bio(iobuf->dr_rw, bio);
596                         }
597
598                         bio_start_page_idx = page_idx;
599                         /* allocate new bio */
600                         bio = bio_alloc(GFP_NOIO, min(BIO_MAX_PAGES,
601                                         (block_idx_end - block_idx +
602                                          blocks_left_page - 1)));
603                         if (bio == NULL) {
604                                 CERROR("Can't allocate bio %u pages\n",
605                                        block_idx_end - block_idx +
606                                        blocks_left_page - 1);
607                                 rc = -ENOMEM;
608                                 goto out;
609                         }
610
611                         bio_set_dev(bio, bdev);
612                         bio_set_sector(bio, sector);
613                         bio->bi_opf = iobuf->dr_rw ? WRITE : READ;
614                         rc = osd_bio_init(bio, iobuf, integrity_enabled,
615                                           bio_start_page_idx, &bio_private);
616                         if (rc) {
617                                 bio_put(bio);
618                                 goto out;
619                         }
620
621                         rc = bio_add_page(bio, page,
622                                           blocksize * nblocks, page_offset);
623                         LASSERT(rc != 0);
624                 }
625         }
626
627         if (bio != NULL) {
628                 rc = osd_bio_integrity_handle(osd, bio, iobuf,
629                                               bio_start_page_idx,
630                                               fault_inject,
631                                               integrity_enabled);
632                 if (rc) {
633                         bio_put(bio);
634                         goto out;
635                 }
636
637                 record_start_io(iobuf, bio_sectors(bio) << 9);
638                 osd_submit_bio(iobuf->dr_rw, bio);
639                 rc = 0;
640         }
641
642 out:
643         blk_finish_plug(&plug);
644
645         /* In order to achieve better IO throughput, we don't wait for write
646          * completion here. Instead we proceed with the transaction commit in
647          * parallel and wait for IO completion once the transaction is stopped;
648          * see osd_trans_stop() for more details -bzzz
649          */
650         if (iobuf->dr_rw == 0 || fault_inject) {
651                 wait_event(iobuf->dr_wait,
652                            atomic_read(&iobuf->dr_numreqs) == 0);
653                 osd_fini_iobuf(osd, iobuf);
654         }
655
656         if (rc == 0) {
657                 rc = iobuf->dr_error;
658         } else {
659                 if (bio_private)
660                         OBD_FREE_PTR(bio_private);
661         }
662
663         /* mark pages as done for successful writes only */
664         if (rc == 0 && iobuf->dr_rw)
665                 osd_mark_page_io_done(iobuf, inode,
666                                       start_blocks, count);
667
668         RETURN(rc);
669 }
670
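/*
 * Split the byte range [offset, offset + len) into per-page
 * niobuf_local entries; returns -EOVERFLOW if more than @maxlnb
 * entries would be needed.
 */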
671 static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages,
672                                    struct niobuf_local *lnb, int maxlnb)
673 {
674         int rc = 0;
675         ENTRY;
676
677         *nrpages = 0;
678
679         while (len > 0) {
680                 int poff = offset & (PAGE_SIZE - 1);
681                 int plen = PAGE_SIZE - poff;
682
683                 if (*nrpages >= maxlnb) {
684                         rc = -EOVERFLOW;
685                         break;
686                 }
687
688                 if (plen > len)
689                         plen = len;
690                 lnb->lnb_file_offset = offset;
691                 lnb->lnb_page_offset = poff;
692                 lnb->lnb_len = plen;
693                 /* lnb->lnb_flags = rnb->rnb_flags; */
694                 lnb->lnb_flags = 0;
695                 lnb->lnb_page = NULL;
696                 lnb->lnb_rc = 0;
697                 lnb->lnb_guard_rpc = 0;
698                 lnb->lnb_guard_disk = 0;
699                 lnb->lnb_locked = 0;
700
701                 LASSERTF(plen <= len, "plen %u, len %lld\n", plen,
702                          (long long) len);
703                 offset += plen;
704                 len -= plen;
705                 lnb++;
706                 (*nrpages)++;
707         }
708
709         RETURN(rc);
710 }
711
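/*
 * Return a locked page for @offset: from the inode's pagecache when
 * caching is used, otherwise from the per-thread pool of private DIO
 * pages (allocated on demand and tagged with PagePrivate2).
 */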
712 static struct page *osd_get_page(const struct lu_env *env, struct dt_object *dt,
713                                  loff_t offset, gfp_t gfp_mask, bool cache)
714 {
715         struct osd_thread_info *oti = osd_oti_get(env);
716         struct inode *inode = osd_dt_obj(dt)->oo_inode;
717         struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
718         struct page *page;
719         int cur;
720
721         LASSERT(inode);
722
723         if (cache) {
724                 page = find_or_create_page(inode->i_mapping,
725                                            offset >> PAGE_SHIFT, gfp_mask);
726
727                 if (likely(page)) {
728                         LASSERT(!PagePrivate2(page));
729                         wait_on_page_writeback(page);
730                 } else {
731                         lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1);
732                 }
733
734                 return page;
735         }
736
737         if (inode->i_mapping->nrpages) {
738                 /* consult with pagecache, but do not create new pages */
739                 /* this is normally used once */
740                 page = find_lock_page(inode->i_mapping, offset >> PAGE_SHIFT);
741                 if (page) {
742                         wait_on_page_writeback(page);
743                         return page;
744                 }
745         }
746
747         LASSERT(oti->oti_dio_pages);
748         cur = oti->oti_dio_pages_used;
749         page = oti->oti_dio_pages[cur];
750
751         if (unlikely(!page)) {
752                 LASSERT(cur < PTLRPC_MAX_BRW_PAGES);
753                 page = alloc_page(gfp_mask);
754                 if (!page)
755                         return NULL;
756                 oti->oti_dio_pages[cur] = page;
757                 SetPagePrivate2(page);
758                 lock_page(page);
759         }
760
761         ClearPageUptodate(page);
762         page->index = offset >> PAGE_SHIFT;
763         oti->oti_dio_pages_used++;
764
765         return page;
766 }
767
768 /*
769  * there are following "locks":
770  * journal_start
771  * i_mutex
772  * page lock
773  *
774  * osd write path:
775  *  - lock page(s)
776  *  - journal_start
777  *  - truncate_sem
778  *
779  * ext4 vmtruncate:
780  *  - lock pages, unlock
781  *  - journal_start
782  *  - lock partial page
783  *  - i_data_sem
784  *
785  */
786
787 /**
788  * Unlock and release pages loaded by osd_bufs_get()
789  *
790  * Unlock \a npages pages from \a lnb and drop the refcount on them.
791  *
792  * \param env           thread execution environment
793  * \param dt            dt object undergoing IO (OSD object + methods)
794  * \param lnb           array of pages undergoing IO
795  * \param npages        number of pages in \a lnb
796  *
797  * \retval 0            always
798  */
799 static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
800                         struct niobuf_local *lnb, int npages)
801 {
802         struct osd_thread_info *oti = osd_oti_get(env);
803         struct pagevec pvec;
804         int i;
805
806         ll_pagevec_init(&pvec, 0);
807
808         for (i = 0; i < npages; i++) {
809                 struct page *page = lnb[i].lnb_page;
810
811                 if (page == NULL)
812                         continue;
813
814                 /* if the page isn't cached, then reset uptodate
815                  * to prevent reuse
816                  */
817                 if (PagePrivate2(page)) {
818                         oti->oti_dio_pages_used--;
819                 } else {
820                         if (lnb[i].lnb_locked)
821                                 unlock_page(page);
822                         if (pagevec_add(&pvec, page) == 0)
823                                 pagevec_release(&pvec);
824                 }
825
826                 lnb[i].lnb_page = NULL;
827         }
828
829         LASSERTF(oti->oti_dio_pages_used == 0, "%d\n", oti->oti_dio_pages_used);
830
831         /* Release any partial pagevec */
832         pagevec_release(&pvec);
833
834         RETURN(0);
835 }
836
837 /**
838  * Load and lock pages undergoing IO
839  *
840  * Pages as described in the \a lnb array are fetched (from disk or cache)
841  * and locked for IO by the caller.
842  *
843  * DLM locking protects us from write and truncate competing for the same region,
844  * but partial-page truncate can leave dirty pages in the cache for ldiskfs.
845  * It's possible the writeout on such a page is in progress when we access
846  * it. It's also possible that during this writeout we put new (partial) data
847  * into the page, but won't be able to proceed in filter_commitrw_write().
848  * Therefore, just wait for writeout completion as it should be rare enough.
849  *
850  * \param env           thread execution environment
851  * \param dt            dt object undergoing IO (OSD object + methods)
852  * \param pos           byte offset of IO start
853  * \param len           number of bytes of IO
854  * \param lnb           array of extents undergoing IO
855  * \param rw            read or write operation, and other flags
856  * \param maxlnb        maximum number of entries \a lnb can hold
857  *
858  * \retval pages        (zero or more) loaded successfully
859  * \retval -ENOMEM      on memory/page allocation error
860  */
861 static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
862                         loff_t pos, ssize_t len, struct niobuf_local *lnb,
863                         int maxlnb, enum dt_bufs_type rw)
864 {
865         struct osd_thread_info *oti = osd_oti_get(env);
866         struct osd_object *obj = osd_dt_obj(dt);
867         struct osd_device *osd   = osd_obj2dev(obj);
868         int npages, i, iosize, rc = 0;
869         bool cache, write;
870         loff_t fsize;
871         gfp_t gfp_mask;
872
873         LASSERT(obj->oo_inode);
874
875         if (unlikely(obj->oo_destroyed))
876                 RETURN(-ENOENT);
877
878         rc = osd_map_remote_to_local(pos, len, &npages, lnb, maxlnb);
879         if (rc)
880                 RETURN(rc);
881
882         write = rw & DT_BUFS_TYPE_WRITE;
883
884         fsize = lnb[npages - 1].lnb_file_offset + lnb[npages - 1].lnb_len;
885         iosize = fsize - lnb[0].lnb_file_offset;
886         fsize = max(fsize, i_size_read(obj->oo_inode));
887
888         cache = rw & DT_BUFS_TYPE_READAHEAD;
889         if (cache)
890                 goto bypass_checks;
891
892         cache = osd_use_page_cache(osd);
893         while (cache) {
894                 if (write) {
895                         if (!osd->od_writethrough_cache) {
896                                 cache = false;
897                                 break;
898                         }
899                         if (iosize > osd->od_writethrough_max_iosize) {
900                                 cache = false;
901                                 break;
902                         }
903                 } else {
904                         if (!osd->od_read_cache) {
905                                 cache = false;
906                                 break;
907                         }
908                         if (iosize > osd->od_readcache_max_iosize) {
909                                 cache = false;
910                                 break;
911                         }
912                 }
913                 /* don't use cache on large files */
914                 if (osd->od_readcache_max_filesize &&
915                     fsize > osd->od_readcache_max_filesize)
916                         cache = false;
917                 break;
918         }
919
920 bypass_checks:
921         if (!cache && unlikely(!oti->oti_dio_pages)) {
922                 OBD_ALLOC_PTR_ARRAY_LARGE(oti->oti_dio_pages,
923                                           PTLRPC_MAX_BRW_PAGES);
924                 if (!oti->oti_dio_pages)
925                         return -ENOMEM;
926         }
927
928         /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */
929         gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) :
930                                              GFP_HIGHUSER;
931         for (i = 0; i < npages; i++, lnb++) {
932                 lnb->lnb_page = osd_get_page(env, dt, lnb->lnb_file_offset,
933                                              gfp_mask, cache);
934                 if (lnb->lnb_page == NULL)
935                         GOTO(cleanup, rc = -ENOMEM);
936
937                 lnb->lnb_locked = 1;
938                 if (cache)
939                         mark_page_accessed(lnb->lnb_page);
940         }
941
942 #if 0
943         /* XXX: this version doesn't invalidate cached pages, but use them */
944         if (!cache && write && obj->oo_inode->i_mapping->nrpages) {
945                 /* do not allow data aliasing, invalidate pagecache */
946                 /* XXX: can be quite expensive in mixed case */
947                 invalidate_mapping_pages(obj->oo_inode->i_mapping,
948                                 lnb[0].lnb_file_offset >> PAGE_SHIFT,
949                                 lnb[npages - 1].lnb_file_offset >> PAGE_SHIFT);
950         }
951 #endif
952
953         RETURN(i);
954
955 cleanup:
956         if (i > 0)
957                 osd_bufs_put(env, dt, lnb - i, i);
958         return rc;
959 }
960 /* Borrowed from ext4_chunk_trans_blocks() */
961 static int osd_chunk_trans_blocks(struct inode *inode, int nrblocks)
962 {
963         ldiskfs_group_t groups;
964         int gdpblocks;
965         int idxblocks;
966         int depth;
967         int ret;
968
969         depth = ext_depth(inode);
970         idxblocks = depth * 2;
971
972         /*
973          * Now let's see how many group bitmaps and group descriptors need
974          * to account.
975          */
976         groups = idxblocks + 1;
977         gdpblocks = groups;
978         if (groups > LDISKFS_SB(inode->i_sb)->s_groups_count)
979                 groups = LDISKFS_SB(inode->i_sb)->s_groups_count;
980         if (gdpblocks > LDISKFS_SB(inode->i_sb)->s_gdb_count)
981                 gdpblocks = LDISKFS_SB(inode->i_sb)->s_gdb_count;
982
983         /* bitmaps and block group descriptor blocks */
984         ret = idxblocks + groups + gdpblocks;
985
986         /* Blocks for super block, inode, quota and xattr blocks */
987         ret += LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
988
989         return ret;
990 }
991
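/*
 * Make sure @handle has at least @needed credits, extending the
 * running journal handle or restarting it when extension is not
 * possible.
 */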
992 #ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
993 static int osd_extend_restart_trans(handle_t *handle, int needed,
994                                     struct inode *inode)
995 {
996         int rc;
997
998         rc = ldiskfs_journal_ensure_credits(handle, needed,
999                 ldiskfs_trans_default_revoke_credits(inode->i_sb));
1000         /* a positive return means the journal has been restarted */
1001         if (rc > 0)
1002                 rc = 0;
1003
1004         return rc;
1005 }
1006 #else
1007 static int osd_extend_restart_trans(handle_t *handle, int needed,
1008                                     struct inode *inode)
1009 {
1010         int rc;
1011
1012         if (ldiskfs_handle_has_enough_credits(handle, needed))
1013                 return 0;
1014         rc = ldiskfs_journal_extend(handle,
1015                                 needed - handle->h_buffer_credits);
1016         if (rc <= 0)
1017                 return rc;
1018
1019         return ldiskfs_journal_restart(handle, needed);
1020 }
1021 #endif /* HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */
1022
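/*
 * Grow i_size/i_disksize to cover the newly mapped range (clamped to
 * @user_size when it is set) and submit the I/O for the blocks mapped
 * so far.
 */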
1023 static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf,
1024                                  struct osd_device *osd, sector_t start_blocks,
1025                                  sector_t count, loff_t *disk_size,
1026                                  __u64 user_size)
1027 {
1028         /* if file has grown, take user_size into account */
1029         if (user_size && *disk_size > user_size)
1030                 *disk_size = user_size;
1031
1032         spin_lock(&inode->i_lock);
1033         if (*disk_size > i_size_read(inode)) {
1034                 i_size_write(inode, *disk_size);
1035                 LDISKFS_I(inode)->i_disksize = *disk_size;
1036                 spin_unlock(&inode->i_lock);
1037                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
1038         } else {
1039                 spin_unlock(&inode->i_lock);
1040         }
1041
1042         /*
1043          * We don't do stats here as in read path because
1044          * write is async: we'll do this in osd_put_bufs()
1045          */
1046         return osd_do_bio(osd, inode, iobuf, start_blocks, count);
1047 }
1048
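/*
 * Return the per-CPU estimate of the typical extent size allocated on
 * this device, initializing it on first access.
 */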
1049 static unsigned int osd_extent_bytes(const struct osd_device *o)
1050 {
1051         unsigned int *extent_bytes_ptr =
1052                         raw_cpu_ptr(o->od_extent_bytes_percpu);
1053
1054         if (likely(*extent_bytes_ptr))
1055                 return *extent_bytes_ptr;
1056
1057         /* initialize on first access or CPU hotplug */
1058         if (!ldiskfs_has_feature_extents(osd_sb(o)))
1059                 *extent_bytes_ptr = 1 << osd_sb(o)->s_blocksize_bits;
1060         else
1061                 *extent_bytes_ptr = OSD_DEFAULT_EXTENT_BYTES;
1062
1063         return *extent_bytes_ptr;
1064 }
1065
1066 #define EXTENT_BYTES_DECAY 64
1067 static void osd_decay_extent_bytes(struct osd_device *osd,
1068                                    unsigned int new_bytes)
1069 {
1070         unsigned int old_bytes;
1071
1072         if (!ldiskfs_has_feature_extents(osd_sb(osd)))
1073                 return;
1074
1075         old_bytes = osd_extent_bytes(osd);
1076         *raw_cpu_ptr(osd->od_extent_bytes_percpu) =
1077                 (old_bytes * (EXTENT_BYTES_DECAY - 1) +
1078                  min(new_bytes, OSD_DEFAULT_EXTENT_BYTES) +
1079                  EXTENT_BYTES_DECAY - 1) / EXTENT_BYTES_DECAY;
1080 }
1081
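/*
 * Map (and, when @create is set, allocate) disk blocks for all pages
 * in @iobuf, walking runs of contiguous page indices and recording the
 * resulting block numbers in iobuf->dr_blocks.  When the declared
 * credits run out this may submit the I/O mapped so far and request a
 * transaction restart via @thandle.
 */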
1082 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
1083                                        struct osd_iobuf *iobuf,
1084                                        struct osd_device *osd,
1085                                        int create, __u64 user_size,
1086                                        int check_credits,
1087                                        struct thandle *thandle)
1088 {
1089         int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
1090         int blocksize = 1 << inode->i_blkbits;
1091         int rc = 0, i = 0, mapped_index = 0;
1092         struct page *fp = NULL;
1093         int clen = 0;
1094         pgoff_t max_page_index;
1095         handle_t *handle = NULL;
1096         sector_t start_blocks = 0, count = 0;
1097         loff_t disk_size = 0;
1098         struct page **page = iobuf->dr_pages;
1099         int pages = iobuf->dr_npages;
1100         sector_t *blocks = iobuf->dr_blocks;
1101         struct niobuf_local *lnb1, *lnb2;
1102         loff_t size1, size2;
1103
1104         max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
1105
1106         CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
1107                 inode->i_ino, pages, (*page)->index);
1108
1109         if (create) {
1110                 create = LDISKFS_GET_BLOCKS_CREATE;
1111                 handle = ldiskfs_journal_current_handle();
1112                 LASSERT(handle != NULL);
1113                 rc = osd_attach_jinode(inode);
1114                 if (rc)
1115                         return rc;
1116                 disk_size = i_size_read(inode);
1117                 /* if disk_size is already bigger than specified user_size,
1118                  * ignore user_size
1119                  */
1120                 if (disk_size > user_size)
1121                         user_size = 0;
1122         }
1123         /* pages are already sorted, so we just have to find
1124          * contiguous ranges and process them properly
1125          */
1126         while (i < pages) {
1127                 long blen, total = 0, previous_total = 0;
1128                 struct ldiskfs_map_blocks map = { 0 };
1129
1130                 if (fp == NULL) { /* start new extent */
1131                         fp = *page++;
1132                         clen = 1;
1133                         if (++i != pages)
1134                                 continue;
1135                 } else if (fp->index + clen == (*page)->index) {
1136                         /* continue the extent */
1137                         page++;
1138                         clen++;
1139                         if (++i != pages)
1140                                 continue;
1141                 }
1142                 if (fp->index + clen >= max_page_index)
1143                         GOTO(cleanup, rc = -EFBIG);
1144                 /* process found extent */
1145                 map.m_lblk = fp->index * blocks_per_page;
1146                 map.m_len = blen = clen * blocks_per_page;
1147
1148                 /*
1149                  * For PAGE_SIZE > blocksize block allocation mapping, this
1150                  * first ldiskfs_map_blocks() call only looks up already
1151                  * mapped blocks, records them in iobuf->dr_blocks, and fixes
1152                  * up m_lblk and m_len so the remaining un-allocated blocks
1153                  * get created/mapped by the second ldiskfs_map_blocks().
1154                  *
1155                  * m_lblk should be the first un-allocated block: if m_lblk
1156                  * points at an already allocated block when create = 1,
1157                  * ldiskfs_map_blocks() will just return the already
1158                  * allocated blocks without allocating any of the requested
1159                  * new blocks for the extent. For the PAGE_SIZE = blocksize
1160                  * case, if m_lblk points at an already allocated block it
1161                  * will point at an un-allocated block in the next restarted
1162                  * transaction, because the already mapped block/page will
1163                  * be filtered out of the next restarted transaction via the
1164                  * OBD_BRW_DONE flag in osd_declare_write_commit().
1165                  */
1166                 if (create && PAGE_SIZE > blocksize) {
1167                         /* With flags=0 just for already mapped blocks lookup */
1168                         rc = ldiskfs_map_blocks(handle, inode, &map, 0);
1169                         if (rc > 0 && map.m_flags & LDISKFS_MAP_MAPPED) {
1170                                 for (; total < blen && total < map.m_len;
1171                                                 total++)
1172                                         *(blocks + total) = map.m_pblk + total;
1173
1174                                 /* The extent is already fully mapped */
1175                                 if (total == blen) {
1176                                         rc = 0;
1177                                         goto ext_already_mapped;
1178                                 }
1179                         }
1180                         /*
1181                          * Fixup or reset m_lblk and m_len for un-mapped blocks.
1182                          * The second ldiskfs_map_blocks() will create and map
1183                          * them.
1184                          */
1185                         map.m_lblk = fp->index * blocks_per_page + total;
1186                         map.m_len = blen - total;
1187                 }
1188
1189 cont_map:
1190                 /**
1191                  * We might restart the transaction for block allocations.
1192                  * To preserve data ordered mode, the IO submission, disk
1193                  * size update and block allocations need to be within the
1194                  * same transaction to guarantee consistency.
1195                  */
1196                 if (handle && check_credits) {
1197                         struct osd_thandle *oh;
1198
1199                         LASSERT(thandle != NULL);
1200                         oh = container_of(thandle, struct osd_thandle,
1201                                           ot_super);
1202                         /*
1203                          * only issue IO if a transaction restart is needed;
1204                          * since updating the disk size needs the inode lock,
1205                          * we want to avoid that as much as possible.
1206                          */
1207                         if (oh->oh_declared_ext <= 0) {
1208                                 rc = osd_ldiskfs_map_write(inode,
1209                                         iobuf, osd, start_blocks,
1210                                         count, &disk_size, user_size);
1211                                 if (rc)
1212                                         GOTO(cleanup, rc);
1213                                 thandle->th_restart_tran = 1;
1214                                 GOTO(cleanup, rc = -EAGAIN);
1215                         }
1216
1217                         if (OBD_FAIL_CHECK(OBD_FAIL_OST_RESTART_IO))
1218                                 oh->oh_declared_ext = 0;
1219                         else
1220                                 oh->oh_declared_ext--;
1221                 }
1222                 rc = ldiskfs_map_blocks(handle, inode, &map, create);
1223                 if (rc >= 0) {
1224                         int c = 0;
1225
1226                         for (; total < blen && c < map.m_len; c++, total++) {
1227                                 if (rc == 0) {
1228                                         *(blocks + total) = 0;
1229                                         total++;
1230                                         break;
1231                                 }
1232                                 if ((map.m_flags & LDISKFS_MAP_UNWRITTEN) &&
1233                                     !create) {
1234                                         /* don't try to read allocated, but
1235                                          * unwritten blocks, instead fill the
1236                                          * patches with zeros in osd_do_bio() */
1237                                         *(blocks + total) = 0;
1238                                         continue;
1239                                 }
1240                                 *(blocks + total) = map.m_pblk + c;
1241                                 /* unmap any possible underlying
1242                                  * metadata from the block device
1243                                  * mapping.  b=6998.
1244                                  */
1245                                 if ((map.m_flags & LDISKFS_MAP_NEW) &&
1246                                     create)
1247                                         clean_bdev_aliases(inode->i_sb->s_bdev,
1248                                                            map.m_pblk + c, 1);
1249                         }
1250                         rc = 0;
1251                 }
1252
1253 ext_already_mapped:
1254                 if (rc == 0 && create) {
1255                         count += (total - previous_total);
1256                         mapped_index = (count + blocks_per_page -
1257                                         1) / blocks_per_page - 1;
1258                         lnb1 = iobuf->dr_lnbs[i - clen];
1259                         lnb2 = iobuf->dr_lnbs[mapped_index];
1260                         size1 = lnb1->lnb_file_offset -
1261                                 (lnb1->lnb_file_offset % PAGE_SIZE) +
1262                                 (total << inode->i_blkbits);
1263                         size2 = lnb2->lnb_file_offset + lnb2->lnb_len;
1264
1265                         if (size1 > size2)
1266                                 size1 = size2;
1267                         if (size1 > disk_size)
1268                                 disk_size = size1;
1269                 }
1270
1271                 if (rc == 0 && total < blen) {
1272                         /*
1273                          * decay the extent bytes estimate if we could not
1274                          * allocate the whole extent at once.
1275                          */
1276                         osd_decay_extent_bytes(osd,
1277                                 (total - previous_total) << inode->i_blkbits);
1278                         map.m_lblk = fp->index * blocks_per_page + total;
1279                         map.m_len = blen - total;
1280                         previous_total = total;
1281                         goto cont_map;
1282                 }
1283                 if (rc != 0)
1284                         GOTO(cleanup, rc);
1285                 /*
1286                  * decay the extent bytes estimate if we could allocate
1287                  * a good large extent.
1288                  */
1289                 if (total - previous_total >=
1290                     osd_extent_bytes(osd) >> inode->i_blkbits)
1291                         osd_decay_extent_bytes(osd,
1292                                 (total - previous_total) << inode->i_blkbits);
1293                 /* look for next extent */
1294                 fp = NULL;
1295                 blocks += blocks_per_page * clen;
1296         }
1297 cleanup:
1298         if (rc == 0 && create &&
1299             start_blocks < pages * blocks_per_page) {
1300                 rc = osd_ldiskfs_map_write(inode, iobuf, osd, start_blocks,
1301                                            count, &disk_size, user_size);
1302                 LASSERT(start_blocks + count == pages * blocks_per_page);
1303         }
1304         return rc;
1305 }
1306
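/*
 * Prepare pages for a write: read in partially covered pages so the
 * untouched parts keep their on-disk content, and zero the uncovered
 * parts of pages that lie beyond the current file size.
 */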
1307 static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
1308                           struct niobuf_local *lnb, int npages)
1309 {
1310         struct osd_thread_info *oti   = osd_oti_get(env);
1311         struct osd_iobuf       *iobuf = &oti->oti_iobuf;
1312         struct inode           *inode = osd_dt_obj(dt)->oo_inode;
1313         struct osd_device      *osd   = osd_obj2dev(osd_dt_obj(dt));
1314         ktime_t start, end;
1315         s64 timediff;
1316         ssize_t isize;
1317         __s64  maxidx;
1318         int i, rc = 0;
1319
1320         LASSERT(inode);
1321
1322         rc = osd_init_iobuf(osd, iobuf, 0, npages);
1323         if (unlikely(rc != 0))
1324                 RETURN(rc);
1325
1326         isize = i_size_read(inode);
1327         maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1;
1328
1329         start = ktime_get();
1330         for (i = 0; i < npages; i++) {
1331
1332                 /*
1333                  * till commit, the content of the page is undefined;
1334                  * we'll set it uptodate once the bulk is done. Otherwise
1335                  * subsequent reads can access non-stable data.
1336                  */
1337                 ClearPageUptodate(lnb[i].lnb_page);
1338
1339                 if (lnb[i].lnb_len == PAGE_SIZE)
1340                         continue;
1341
1342                 if (maxidx >= lnb[i].lnb_page->index) {
1343                         osd_iobuf_add_page(iobuf, &lnb[i]);
1344                 } else {
1345                         long off;
1346                         char *p = kmap(lnb[i].lnb_page);
1347
1348                         off = lnb[i].lnb_page_offset;
1349                         if (off)
1350                                 memset(p, 0, off);
1351                         off = (lnb[i].lnb_page_offset + lnb[i].lnb_len) &
1352                               ~PAGE_MASK;
1353                         if (off)
1354                                 memset(p + off, 0, PAGE_SIZE - off);
1355                         kunmap(lnb[i].lnb_page);
1356                 }
1357         }
1358         end = ktime_get();
1359         timediff = ktime_us_delta(end, start);
1360         lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff);
1361
1362         if (iobuf->dr_npages) {
1363                 rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
1364                                                  0, 0, NULL);
1365                 if (likely(rc == 0)) {
1366                         rc = osd_do_bio(osd, inode, iobuf, 0, 0);
1367                         /* do IO stats for preparation reads */
1368                         osd_fini_iobuf(osd, iobuf);
1369                 }
1370         }
1371         RETURN(rc);
1372 }
1373
1374 struct osd_fextent {
1375         sector_t        start;
1376         sector_t        end;
1377         __u32           flags;
1378         unsigned int    mapped:1;
1379 };
1380
1381 static int osd_is_mapped(struct dt_object *dt, __u64 offset,
1382                          struct osd_fextent *cached_extent)
1383 {
1384         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1385         sector_t block = offset >> inode->i_blkbits;
1386         sector_t start;
1387         struct fiemap_extent_info fei = { 0 };
1388         struct fiemap_extent fe = { 0 };
1389         int rc;
1390
1391         if (block >= cached_extent->start && block < cached_extent->end)
1392                 return cached_extent->mapped;
1393
1394         if (i_size_read(inode) == 0)
1395                 return 0;
1396
1397         /* Beyond EOF, must not be mapped */
1398         if (((i_size_read(inode) - 1) >> inode->i_blkbits) < block)
1399                 return 0;
1400
1401         fei.fi_extents_max = 1;
1402         fei.fi_extents_start = &fe;
1403
1404         rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
1405         if (rc != 0)
1406                 return 0;
1407
1408         start = fe.fe_logical >> inode->i_blkbits;
1409         cached_extent->flags = fe.fe_flags;
1410         if (fei.fi_extents_mapped == 0) {
1411                 /* a special case - no extent found at this offset and forward.
1412                  * we can consider this as a hole to EOF. it's safe to cache
1413                  * as other threads can not allocate/punch blocks this thread
1414                  * is working on (LDLM). */
1415                 cached_extent->start = block;
1416                 cached_extent->end = i_size_read(inode) >> inode->i_blkbits;
1417                 cached_extent->mapped = 0;
1418                 return 0;
1419         }
1420
1421         if (start > block) {
1422                 cached_extent->start = block;
1423                 cached_extent->end = start;
1424                 cached_extent->mapped = 0;
1425         } else {
1426                 cached_extent->start = start;
1427                 cached_extent->end = (fe.fe_logical + fe.fe_length) >>
1428                                       inode->i_blkbits;
1429                 cached_extent->mapped = 1;
1430         }
1431
1432         return cached_extent->mapped;
1433 }
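/*
 * A rough example of the caching above, assuming 4KB blocks (i_blkbits = 12):
 * a query at offset 40960 (block 10) that finds an extent at fe_logical =
 * 40960 with fe_length = 8192 caches [start = 10, end = 12, mapped = 1], so
 * the following call for block 11 is answered without another fiemap call.
 */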
1434
1435 #define MAX_EXTENTS_PER_WRITE 100
1436 static int osd_declare_write_commit(const struct lu_env *env,
1437                                     struct dt_object *dt,
1438                                     struct niobuf_local *lnb, int npages,
1439                                     struct thandle *handle)
1440 {
1441         const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
1442         struct inode            *inode = osd_dt_obj(dt)->oo_inode;
1443         struct osd_thandle      *oh;
1444         int                     extents = 0, new_meta = 0;
1445         int                     depth, new_blocks = 0;
1446         int                     i;
1447         int                     dirty_groups = 0;
1448         int                     rc = 0;
1449         int                     credits = 0;
1450         long long               quota_space = 0;
1451         struct osd_fextent      mapped = { 0 }, extent = { 0 };
1452         enum osd_quota_local_flags local_flags = 0;
1453         enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
1454         unsigned int            extent_bytes;
1455         ENTRY;
1456
1457         LASSERT(handle != NULL);
1458         oh = container_of(handle, struct osd_thandle, ot_super);
1459         LASSERT(oh->ot_handle == NULL);
1460
1461         /*
1462          * We track a decaying average of extent size per filesystem.
1463          * Most of the time it will be 1M; as the filesystem becomes
1464          * heavily fragmented, it can drop to 4K in the worst case.
1465          */
1466         extent_bytes = osd_extent_bytes(osd);
1467         LASSERT(extent_bytes >= osd_sb(osd)->s_blocksize);
1468
1469         /* calculate number of extents (probably better to pass nb) */
1470         for (i = 0; i < npages; i++) {
1471                 /* ignore quota for the whole request if any page is from
1472                  * client cache or written by root.
1473                  *
1474                  * XXX we could handle this on per-lnb basis as done by
1475                  * grant.
1476                  */
1477                 if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
1478                     (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
1479                     !(lnb[i].lnb_flags & OBD_BRW_SYNC))
1480                         declare_flags |= OSD_QID_FORCE;
1481
1482                 /*
1483                  * Converting an unwritten extent might require splitting
1484                  * extents, so we cannot skip it.
1485                  */
1486                 if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
1487                     !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
1488                         lnb[i].lnb_flags |= OBD_BRW_MAPPED;
1489                         continue;
1490                 }
1491
1492                 if (lnb[i].lnb_flags & OBD_BRW_DONE) {
1493                         lnb[i].lnb_flags |= OBD_BRW_MAPPED;
1494                         continue;
1495                 }
1496
1497                 /* count only unmapped changes */
1498                 new_blocks++;
1499                 if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
1500                         if (extent.end != 0)
1501                                 extents += (extent.end - extent.start +
1502                                             extent_bytes - 1) / extent_bytes;
1503                         extent.start = lnb[i].lnb_file_offset;
1504                         extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
1505                 } else {
1506                         extent.end += lnb[i].lnb_len;
1507                 }
1508
1509                 quota_space += PAGE_SIZE;
1510         }
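        /*
         * An illustration of the grouping above: three unmapped 4KB pages at
         * file offsets 0, 4096 and 1MB form two logical extents, [0, 8192)
         * and [1MB, 1MB + 4096); each is charged in units of the decayed
         * extent_bytes when the credit estimate is computed below.
         */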
1511
1512         credits++; /* inode */
1513         /*
1514          * overwrite case: no need to modify the tree or
1515          * allocate blocks.
1516          */
1517         if (!extent.end)
1518                 goto out_declare;
1519
1520         extents += (extent.end - extent.start +
1521                     extent_bytes - 1) / extent_bytes;
1522         /**
1523          * As space usage grows, mballoc stops trying as hard to scan
1524          * block groups for the best-aligned free extent, so the decaying
1525          * average extent size can shrink to a very small value and make
1526          * us reserve too many credits. We can be more optimistic in the
1527          * credit reservation: even when the filesystem is nearly full,
1528          * it is extremely unlikely that the worst case would ever be
1529          * hit.
1530          */
1531         if (extents > MAX_EXTENTS_PER_WRITE)
1532                 extents = MAX_EXTENTS_PER_WRITE;
1533
1534         /**
1535          * If we add a single extent, then in the worst case each tree
1536          * level index/leaf needs to be changed if the tree splits.
1537          * If more extents are inserted, they could cause the whole tree
1538          * to split more than once, but this is really rare.
1539          */
1540         if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
1541                 /*
1542                  * many concurrent threads may grow the tree by the time
1543                  * our transaction starts, so consider 2 as the minimum depth.
1544                  */
1545                 depth = ext_depth(inode);
1546                 depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
1547                 if (extents <= 1) {
1548                         credits += depth * 2 * extents;
1549                         new_meta = depth;
1550                 } else {
1551                         credits += depth * 3 * extents;
1552                         new_meta = depth * 2 * extents;
1553                 }
1554         } else {
1555                 /*
1556                  * With N contiguous data blocks, we need at most
1557                  * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
1558                  * 2 dindirect blocks, and 1 tindirect block
1559                  */
1560                 new_meta = DIV_ROUND_UP(new_blocks,
1561                                 LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
1562                 credits += new_meta;
1563         }
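        /*
         * A rough illustration of the extent case above: with ext_depth() == 1
         * (clamped to depth = 2) and 5 declared extents, the reservation is
         * credits += 2 * 3 * 5 = 30 and new_meta = 2 * 2 * 5 = 20; the exact
         * numbers depend on the tree depth at declaration time.
         */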
1564         dirty_groups += (extents + new_meta);
1565
1566         oh->oh_declared_ext = extents;
1567
1568         /* quota space for metadata blocks */
1569         quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
1570
1571         /* quota space should be reported in 1K blocks */
1572         quota_space = toqb(quota_space);
1573
1574         /* each new block can go in a different group (bitmap + gd) */
1575
1576         /* we can't dirty more bitmap blocks than exist */
1577         if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
1578                 credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
1579         else
1580                 credits += dirty_groups;
1581
1582         /* we can't dirty more gd blocks than exist */
1583         if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
1584                 credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
1585         else
1586                 credits += dirty_groups;
1587
1588         CDEBUG(D_INODE,
1589                "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
1590                osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
1591                credits);
1592
1593 out_declare:
1594         osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
1595
1596         /* make sure the over quota flags were not set */
1597         lnb[0].lnb_flags &= ~OBD_BRW_OVER_ALLQUOTA;
1598
1599         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
1600                                    i_projid_read(inode), quota_space, oh,
1601                                    osd_dt_obj(dt), &local_flags, declare_flags);
1602
1603         /* we only need to store the overquota flags in the first lnb for
1604          * now; once we support multiple-object BRW, this code needs to be
1605          * revised.
1606          */
1607         if (local_flags & QUOTA_FL_OVER_USRQUOTA)
1608                 lnb[0].lnb_flags |= OBD_BRW_OVER_USRQUOTA;
1609         if (local_flags & QUOTA_FL_OVER_GRPQUOTA)
1610                 lnb[0].lnb_flags |= OBD_BRW_OVER_GRPQUOTA;
1611         if (local_flags & QUOTA_FL_OVER_PRJQUOTA)
1612                 lnb[0].lnb_flags |= OBD_BRW_OVER_PRJQUOTA;
1613
1614         if (rc == 0)
1615                 rc = osd_trunc_lock(osd_dt_obj(dt), oh, true);
1616
1617         RETURN(rc);
1618 }
1619
1620 /* Write the prepared pages to the object within the current transaction */
1621 static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
1622                             struct niobuf_local *lnb, int npages,
1623                             struct thandle *thandle, __u64 user_size)
1624 {
1625         struct osd_thread_info *oti = osd_oti_get(env);
1626         struct osd_iobuf *iobuf = &oti->oti_iobuf;
1627         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1628         struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
1629         int rc = 0, i, check_credits = 0;
1630
1631         LASSERT(inode);
1632
1633         rc = osd_init_iobuf(osd, iobuf, 1, npages);
1634         if (unlikely(rc != 0))
1635                 RETURN(rc);
1636
1637         dquot_initialize(inode);
1638
1639         for (i = 0; i < npages; i++) {
1640                 if (lnb[i].lnb_rc == -ENOSPC &&
1641                     (lnb[i].lnb_flags & OBD_BRW_MAPPED)) {
1642                         /* Allow the write to proceed if overwriting an
1643                          * existing block
1644                          */
1645                         lnb[i].lnb_rc = 0;
1646                 }
1647
1648                 if (lnb[i].lnb_rc) { /* ENOSPC, network RPC error, etc. */
1649                         CDEBUG(D_INODE, "Skipping [%d] == %d\n", i,
1650                                lnb[i].lnb_rc);
1651                         LASSERT(lnb[i].lnb_page);
1652                         generic_error_remove_page(inode->i_mapping,
1653                                                   lnb[i].lnb_page);
1654                         continue;
1655                 }
1656
1657                 if (lnb[i].lnb_flags & OBD_BRW_DONE)
1658                         continue;
1659
1660                 if (!(lnb[i].lnb_flags & OBD_BRW_MAPPED))
1661                         check_credits = 1;
1662
1663                 LASSERT(PageLocked(lnb[i].lnb_page));
1664                 LASSERT(!PageWriteback(lnb[i].lnb_page));
1665
1666                 /*
1667                  * Since write and truncate are serialized by oo_sem, even
1668                  * partial-page truncate should not leave dirty pages in the
1669                  * page cache.
1670                  */
1671                 LASSERT(!PageDirty(lnb[i].lnb_page));
1672
1673                 SetPageUptodate(lnb[i].lnb_page);
1674
1675                 osd_iobuf_add_page(iobuf, &lnb[i]);
1676         }
1677
1678         osd_trans_exec_op(env, thandle, OSD_OT_WRITE);
1679
1680         if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) {
1681                 rc = -ENOSPC;
1682         } else if (iobuf->dr_npages > 0) {
1683                 rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd,
1684                                                  1, user_size,
1685                                                  check_credits,
1686                                                  thandle);
1687         } else {
1688                 /* no pages to write, no transno is needed */
1689                 thandle->th_local = 1;
1690         }
1691
1692         if (rc != 0 && !thandle->th_restart_tran)
1693                 osd_fini_iobuf(osd, iobuf);
1694
1695         osd_trans_exec_check(env, thandle, OSD_OT_WRITE);
1696
1697         if (unlikely(rc != 0 && !thandle->th_restart_tran)) {
1698                 /* if write fails, we should drop pages from the cache */
1699                 for (i = 0; i < npages; i++) {
1700                         if (lnb[i].lnb_page == NULL)
1701                                 continue;
1702                         if (!PagePrivate2(lnb[i].lnb_page)) {
1703                                 LASSERT(PageLocked(lnb[i].lnb_page));
1704                                 generic_error_remove_page(inode->i_mapping,
1705                                                           lnb[i].lnb_page);
1706                         }
1707                 }
1708         }
1709
1710         RETURN(rc);
1711 }
1712
1713 static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
1714                          struct niobuf_local *lnb, int npages)
1715 {
1716         struct osd_thread_info *oti = osd_oti_get(env);
1717         struct osd_iobuf *iobuf = &oti->oti_iobuf;
1718         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1719         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
1720         int rc = 0, i, cache_hits = 0, cache_misses = 0;
1721         ktime_t start, end;
1722         s64 timediff;
1723         loff_t isize;
1724
1725         LASSERT(inode);
1726
1727         rc = osd_init_iobuf(osd, iobuf, 0, npages);
1728         if (unlikely(rc != 0))
1729                 RETURN(rc);
1730
1731         isize = i_size_read(inode);
1732
1733         start = ktime_get();
1734         for (i = 0; i < npages; i++) {
1735
1736                 if (isize <= lnb[i].lnb_file_offset)
1737                         /* If there's no more data, abort early.
1738                          * lnb->lnb_rc == 0, so it's easy to detect later.
1739                          */
1740                         break;
1741
1742                 /* instead of checking whether we go beyond isize, send
1743                  * complete pages all the time
1744                  */
1745                 lnb[i].lnb_rc = lnb[i].lnb_len;
1746
1747                 /* Bypass disk read if fail_loc is set properly */
1748                 if (OBD_FAIL_CHECK_QUIET(OBD_FAIL_OST_FAKE_RW))
1749                         SetPageUptodate(lnb[i].lnb_page);
1750
1751                 if (PageUptodate(lnb[i].lnb_page)) {
1752                         cache_hits++;
1753                         unlock_page(lnb[i].lnb_page);
1754                 } else {
1755                         cache_misses++;
1756                         osd_iobuf_add_page(iobuf, &lnb[i]);
1757                 }
1758                 /* no need to unlock in osd_bufs_put(): the sooner a page is
1759                  * unlocked, the earlier another client can access it.
1760                  * note the real unlock_page() can be called a few lines
1761                  * below, after osd_do_bio(). lnb is per-thread, so it's
1762                  * fine to have PG_locked and lnb_locked inconsistent here
1763                  */
1764                 lnb[i].lnb_locked = 0;
1765         }
1766         end = ktime_get();
1767         timediff = ktime_us_delta(end, start);
1768         lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff);
1769
1770         if (cache_hits != 0)
1771                 lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_HIT,
1772                                     cache_hits);
1773         if (cache_misses != 0)
1774                 lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_MISS,
1775                                     cache_misses);
1776         if (cache_hits + cache_misses != 0)
1777                 lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_ACCESS,
1778                                     cache_hits + cache_misses);
1779
1780         if (iobuf->dr_npages) {
1781                 rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
1782                                                  0, 0, NULL);
1783                 if (!rc)
1784                         rc = osd_do_bio(osd, inode, iobuf, 0, 0);
1785
1786                 /* IO stats will be done in osd_bufs_put() */
1787
1788                 /* early release to let others read data during the bulk */
1789                 for (i = 0; i < iobuf->dr_npages; i++) {
1790                         LASSERT(PageLocked(iobuf->dr_pages[i]));
1791                         if (!PagePrivate2(iobuf->dr_pages[i]))
1792                                 unlock_page(iobuf->dr_pages[i]);
1793                 }
1794         }
1795
1796         RETURN(rc);
1797 }
1798
1799 /*
1800  * XXX: Another layering violation for now.
1801  *
1802  * We don't want to use the generic ->f_op methods, because generic file write
1803  *
1804  *         - serializes on ->i_sem, and
1805  *
1806  *         - does a lot of extra work like balance_dirty_pages(),
1807  *
1808  * which doesn't work for globally shared files like /last_rcvd.
1809  */
1810 static int osd_ldiskfs_readlink(struct inode *inode, char *buffer, int buflen)
1811 {
1812         struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
1813
1814         memcpy(buffer, (char *)ei->i_data, buflen);
1815
1816         return  buflen;
1817 }
1818
1819 int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs)
1820 {
1821         struct buffer_head *bh;
1822         unsigned long block;
1823         int osize;
1824         int blocksize;
1825         int csize;
1826         int boffs;
1827
1828         /* prevent reading after eof */
1829         spin_lock(&inode->i_lock);
1830         if (i_size_read(inode) < *offs + size) {
1831                 loff_t diff = i_size_read(inode) - *offs;
1832
1833                 spin_unlock(&inode->i_lock);
1834                 if (diff < 0) {
1835                         CDEBUG(D_OTHER,
1836                                "size %llu is too short to read @%llu\n",
1837                                i_size_read(inode), *offs);
1838                         return -EBADR;
1839                 } else if (diff == 0) {
1840                         return 0;
1841                 } else {
1842                         size = diff;
1843                 }
1844         } else {
1845                 spin_unlock(&inode->i_lock);
1846         }
1847
1848         blocksize = 1 << inode->i_blkbits;
1849         osize = size;
1850         while (size > 0) {
1851                 block = *offs >> inode->i_blkbits;
1852                 boffs = *offs & (blocksize - 1);
1853                 csize = min(blocksize - boffs, size);
1854                 bh = __ldiskfs_bread(NULL, inode, block, 0);
1855                 if (IS_ERR(bh)) {
1856                         CERROR("%s: can't read %u@%llu on ino %lu: rc = %ld\n",
1857                                osd_ino2name(inode), csize, *offs, inode->i_ino,
1858                                PTR_ERR(bh));
1859                         return PTR_ERR(bh);
1860                 }
1861
1862                 if (bh != NULL) {
1863                         memcpy(buf, bh->b_data + boffs, csize);
1864                         brelse(bh);
1865                 } else {
1866                         memset(buf, 0, csize);
1867                 }
1868
1869                 *offs += csize;
1870                 buf += csize;
1871                 size -= csize;
1872         }
1873         return osize;
1874 }
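/*
 * For example, with a 4KB block size and a file that extends past offset
 * 4190, a read of 100 bytes at *offs = 4090 takes two loop iterations:
 * 6 bytes from the end of block 0 and the remaining 94 bytes from the start
 * of block 1, returning osize = 100.
 */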
1875
1876 static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
1877                         struct lu_buf *buf, loff_t *pos)
1878 {
1879         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1880         int rc;
1881
1882         /* Read small symlink from inode body as we need to maintain correct
1883          * on-disk symlinks for ldiskfs.
1884          */
1885         if (S_ISLNK(dt->do_lu.lo_header->loh_attr)) {
1886                 loff_t size = i_size_read(inode);
1887
1888                 if (buf->lb_len < size)
1889                         return -EOVERFLOW;
1890
1891                 if (size < sizeof(LDISKFS_I(inode)->i_data))
1892                         rc = osd_ldiskfs_readlink(inode, buf->lb_buf, size);
1893                 else
1894                         rc = osd_ldiskfs_read(inode, buf->lb_buf, size, pos);
1895         } else {
1896                 rc = osd_ldiskfs_read(inode, buf->lb_buf, buf->lb_len, pos);
1897         }
1898
1899         return rc;
1900 }
1901
1902 static inline int osd_extents_enabled(struct super_block *sb,
1903                                       struct inode *inode)
1904 {
1905         if (inode != NULL) {
1906                 if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL)
1907                         return 1;
1908         } else if (ldiskfs_has_feature_extents(sb)) {
1909                 return 1;
1910         }
1911         return 0;
1912 }
1913
1914 int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode,
1915                            const loff_t size, const loff_t pos,
1916                            const int blocks)
1917 {
1918         int credits, bits, bs, i;
1919
1920         bits = sb->s_blocksize_bits;
1921         bs = 1 << bits;
1922
1923         /* legacy blockmap: 3 levels * 3 (bitmap,gd,itself)
1924          * we do not expect blockmaps on large files,
1925          * so let's shrink it to 2 levels (4GB files)
1926          */
1927
1928         /* this is default reservation: 2 levels */
1929         credits = (blocks + 2) * 3;
1930
1931         /* actual offset is unknown, hard to optimize */
1932         if (pos == -1)
1933                 return credits;
1934
1935         /* now check a few specific cases to optimize */
1936         if (pos + size <= LDISKFS_NDIR_BLOCKS * bs) {
1937                 /* no indirects */
1938                 credits = blocks;
1939                 /* allocate if not allocated */
1940                 if (inode == NULL) {
1941                         credits += blocks * 2;
1942                         return credits;
1943                 }
1944                 for (i = (pos >> bits); i < (pos >> bits) + blocks; i++) {
1945                         LASSERT(i < LDISKFS_NDIR_BLOCKS);
1946                         if (LDISKFS_I(inode)->i_data[i] == 0)
1947                                 credits += 2;
1948                 }
1949         } else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) {
1950                 /* single indirect */
1951                 credits = blocks * 3;
1952                 if (inode == NULL ||
1953                     LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK] == 0)
1954                         credits += 3;
1955                 else
1956                         /* The indirect block may be modified. */
1957                         credits += 1;
1958         }
1959
1960         return credits;
1961 }
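/*
 * For instance, writing 4 blocks at an unknown offset (pos == -1) reserves
 * the default (4 + 2) * 3 = 18 credits, while 4 blocks that all fall within
 * the first LDISKFS_NDIR_BLOCKS of an existing, fully-allocated inode make
 * this helper return only blocks = 4 credits, since no indirect blocks are
 * involved.
 */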
1962
1963 static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
1964                                  const struct lu_buf *buf, loff_t _pos,
1965                                  struct thandle *handle)
1966 {
1967         struct osd_object  *obj  = osd_dt_obj(dt);
1968         struct inode       *inode = obj->oo_inode;
1969         struct super_block *sb = osd_sb(osd_obj2dev(obj));
1970         struct osd_thandle *oh;
1971         int                 rc = 0, est = 0, credits, blocks, allocated = 0;
1972         int                 bits, bs;
1973         int                 depth, size;
1974         loff_t              pos;
1975         ENTRY;
1976
1977         LASSERT(buf != NULL);
1978         LASSERT(handle != NULL);
1979
1980         oh = container_of(handle, struct osd_thandle, ot_super);
1981         LASSERT(oh->ot_handle == NULL);
1982
1983         size = buf->lb_len;
1984         bits = sb->s_blocksize_bits;
1985         bs = 1 << bits;
1986
1987         if (_pos == -1) {
1988                 /* if this is an append, then we
1989                  * should expect a cross-block record
1990                  */
1991                 pos = 0;
1992         } else {
1993                 pos = _pos;
1994         }
1995
1996         /* blocks to modify */
1997         blocks = ((pos + size + bs - 1) >> bits) - (pos >> bits);
1998         LASSERT(blocks > 0);
1999
2000         if (inode != NULL && _pos != -1) {
2001                 /* object size in blocks */
2002                 est = (i_size_read(inode) + bs - 1) >> bits;
2003                 allocated = inode->i_blocks >> (bits - 9);
2004                 if (pos + size <= i_size_read(inode) && est <= allocated) {
2005                         /* looks like an overwrite, no need to modify tree */
2006                         credits = blocks;
2007                         /* no need to modify i_size */
2008                         goto out;
2009                 }
2010         }
2011
2012         if (osd_extents_enabled(sb, inode)) {
2013                 /*
2014                  * many concurrent threads may grow the tree by the time
2015                  * our transaction starts, so consider 2 as the minimum depth.
2016                  * for every level we may need to allocate a new block
2017                  * and take some entries from the old one: 3 blocks
2018                  * to allocate (bitmap, gd, itself) + the old block, i.e.
2019                  * 4 per level.
2020                  */
2021                 depth = inode != NULL ? ext_depth(inode) : 0;
2022                 depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH);
2023                 credits = depth;
2024                 /* if this is not an append, a split may need to modify
2025                  * existing blocks, moving entries into the new ones
2026                  */
2027                 if (_pos != -1)
2028                         credits += depth;
2029                 /* blocks to store data: bitmap,gd,itself */
2030                 credits += blocks * 3;
2031         } else {
2032                 credits = osd_calc_bkmap_credits(sb, inode, size, _pos, blocks);
2033         }
2034         /* if inode is created as part of the transaction,
2035          * then it's counted already by the creation method
2036          */
2037         if (inode != NULL)
2038                 credits++;
2039
2040 out:
2041
2042         osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
2043
2044         /* dt_declare_write() is usually called for system objects, such
2045          * as llog or last_rcvd files. We needn't enforce quota on those
2046          * objects, so always set the lqi_space as 0.
2047          */
2048         if (inode != NULL)
2049                 rc = osd_declare_inode_qid(env, i_uid_read(inode),
2050                                            i_gid_read(inode),
2051                                            i_projid_read(inode), 0,
2052                                            oh, obj, NULL, OSD_QID_BLK);
2053
2054         if (rc == 0)
2055                 rc = osd_trunc_lock(obj, oh, true);
2056
2057         RETURN(rc);
2058 }
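/*
 * A rough worked example of the declaration above: appending one 4KB block
 * (_pos == -1) to an extent-based file with ext_depth() == 1 gives
 * depth = min(1 + 3, LDISKFS_MAX_EXTENT_DEPTH) = 4, so
 * credits = 4 + 1 * 3 + 1 (inode) = 8; a plain overwrite within i_size of a
 * fully-allocated file short-circuits to credits = blocks = 1.
 */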
2059
2060 static int osd_ldiskfs_writelink(struct inode *inode, char *buffer, int buflen)
2061 {
2062         /* LU-2634: clear the extent format for fast symlink */
2063         ldiskfs_clear_inode_flag(inode, LDISKFS_INODE_EXTENTS);
2064
2065         memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen);
2066         spin_lock(&inode->i_lock);
2067         LDISKFS_I(inode)->i_disksize = buflen;
2068         i_size_write(inode, buflen);
2069         spin_unlock(&inode->i_lock);
2070         osd_dirty_inode(inode, I_DIRTY_DATASYNC);
2071
2072         return 0;
2073 }
2074
2075 static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
2076                                     int bufsize, int write_NUL, loff_t *offs,
2077                                     handle_t *handle)
2078 {
2079         struct inode *inode = osd_dt_obj(dt)->oo_inode;
2080         struct buffer_head *bh        = NULL;
2081         loff_t              offset    = *offs;
2082         loff_t              new_size  = i_size_read(inode);
2083         unsigned long       block;
2084         int                 blocksize = 1 << inode->i_blkbits;
2085         struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
2086         int                 err = 0;
2087         int                 size;
2088         int                 boffs;
2089         int                 dirty_inode = 0;
2090         bool create, sparse, sync = false;
2091
2092         if (write_NUL) {
2093                 /*
2094                  * long symlink write does not count the NUL terminator in
2095                  * bufsize, we write it, and the inode's file size does not
2096                  * count the NUL terminator as well.
2097                  */
2098                 ((char *)buf)[bufsize] = '\0';
2099                 ++bufsize;
2100         }
2101
2102         /* only the first flag-set matters */
2103         dirty_inode = !test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA,
2104                                        &ei->i_flags);
2105
2106         /* sparse checking is racy, but sparse files are rare, so leave as is */
2107         sparse = (new_size > 0 && (inode->i_blocks >> (inode->i_blkbits - 9)) <
2108                   ((new_size - 1) >> inode->i_blkbits) + 1);
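        /*
         * E.g. with 4KB blocks, an 8192-byte file needs 2 blocks; if i_blocks
         * reports only 8 512-byte sectors (1 block allocated), 1 < 2 and the
         * file is treated as sparse, forcing the i_append_sem-protected path
         * in the loop below.
         */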
2109
2110         while (bufsize > 0) {
2111                 int credits = handle->h_buffer_credits;
2112                 unsigned long last_block = (new_size == 0) ? 0 :
2113                                            (new_size - 1) >> inode->i_blkbits;
2114
2115                 if (bh)
2116                         brelse(bh);
2117
2118                 block = offset >> inode->i_blkbits;
2119                 boffs = offset & (blocksize - 1);
2120                 size = min(blocksize - boffs, bufsize);
2121                 sync = (block > last_block || new_size == 0 || sparse);
2122
2123                 if (sync)
2124                         down(&ei->i_append_sem);
2125
2126                 bh = __ldiskfs_bread(handle, inode, block, 0);
2127
2128                 if (unlikely(IS_ERR_OR_NULL(bh) && !sync))
2129                         CWARN(
2130                               "%s: adding bh without locking off %llu (block %lu, size %d, offs %llu)\n",
2131                               osd_ino2name(inode),
2132                               offset, block, bufsize, *offs);
2133
2134                 if (IS_ERR_OR_NULL(bh)) {
2135                         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
2136                         int flags = LDISKFS_GET_BLOCKS_CREATE;
2137
2138                         /* while the file system is being mounted, avoid
2139                          * preallocation, otherwise mount can take a long
2140                          * time as the mballoc cache is cold.
2141                          * XXX: this is a workaround until we have a proper
2142                          *      fix in mballoc
2143                          * XXX: works with extent-based files only */
2144                         if (!osd->od_cl_seq)
2145                                 flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE;
2146                         bh = __ldiskfs_bread(handle, inode, block, flags);
2147                         create = true;
2148                 } else {
2149                         if (sync) {
2150                                 up(&ei->i_append_sem);
2151                                 sync = false;
2152                         }
2153                         create = false;
2154                 }
2155                 if (IS_ERR_OR_NULL(bh)) {
2156                         if (bh == NULL) {
2157                                 err = -EIO;
2158                         } else {
2159                                 err = PTR_ERR(bh);
2160                                 bh = NULL;
2161                         }
2162
2163                         CERROR(
2164                                "%s: error reading offset %llu (block %lu, size %d, offs %llu), credits %d/%d: rc = %d\n",
2165                                osd_ino2name(inode), offset, block, bufsize,
2166                                *offs, credits, handle->h_buffer_credits, err);
2167                         break;
2168                 }
2169
2170                 err = ldiskfs_journal_get_write_access(handle, bh);
2171                 if (err) {
2172                         CERROR("journal_get_write_access() returned error %d\n",
2173                                err);
2174                         break;
2175                 }
2176                 LASSERTF(boffs + size <= bh->b_size,
2177                          "boffs %d size %d bh->b_size %lu\n",
2178                          boffs, size, (unsigned long)bh->b_size);
2179                 if (create) {
2180                         memset(bh->b_data, 0, bh->b_size);
2181                         if (sync) {
2182                                 up(&ei->i_append_sem);
2183                                 sync = false;
2184                         }
2185                 }
2186                 memcpy(bh->b_data + boffs, buf, size);
2187                 err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
2188                 if (err)
2189                         break;
2190
2191                 if (offset + size > new_size)
2192                         new_size = offset + size;
2193                 offset += size;
2194                 bufsize -= size;
2195                 buf += size;
2196         }
2197         if (sync)
2198                 up(&ei->i_append_sem);
2199
2200         if (bh)
2201                 brelse(bh);
2202
2203         if (write_NUL)
2204                 --new_size;
2205         /* correct in-core and on-disk sizes */
2206         if (new_size > i_size_read(inode)) {
2207                 spin_lock(&inode->i_lock);
2208                 if (new_size > i_size_read(inode))
2209                         i_size_write(inode, new_size);
2210                 if (i_size_read(inode) > ei->i_disksize) {
2211                         ei->i_disksize = i_size_read(inode);
2212                         dirty_inode = 1;
2213                 }
2214                 spin_unlock(&inode->i_lock);
2215         }
2216         if (dirty_inode)
2217                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
2218
2219         if (err == 0)
2220                 *offs = offset;
2221         return err;
2222 }
2223
2224 static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
2225                          const struct lu_buf *buf, loff_t *pos,
2226                          struct thandle *handle)
2227 {
2228         struct inode            *inode = osd_dt_obj(dt)->oo_inode;
2229         struct osd_thandle      *oh;
2230         ssize_t                 result;
2231         int                     is_link;
2232
2233         LASSERT(dt_object_exists(dt));
2234
2235         LASSERT(handle != NULL);
2236         LASSERT(inode != NULL);
2237         dquot_initialize(inode);
2238
2239         /* XXX: don't check: one declared chunk can be used many times */
2240         /* osd_trans_exec_op(env, handle, OSD_OT_WRITE); */
2241
2242         oh = container_of(handle, struct osd_thandle, ot_super);
2243         LASSERT(oh->ot_handle->h_transaction != NULL);
2244         osd_trans_exec_op(env, handle, OSD_OT_WRITE);
2245
2246         /* Write small symlink to inode body as we need to maintain correct
2247          * on-disk symlinks for ldiskfs.
2248          * Note: the buf->lb_buf contains a NUL terminator while buf->lb_len
2249          * does not count it in.
2250          */
2251         is_link = S_ISLNK(dt->do_lu.lo_header->loh_attr);
2252         if (is_link && (buf->lb_len < sizeof(LDISKFS_I(inode)->i_data)))
2253                 result = osd_ldiskfs_writelink(inode, buf->lb_buf, buf->lb_len);
2254         else
2255                 result = osd_ldiskfs_write_record(dt, buf->lb_buf, buf->lb_len,
2256                                                   is_link, pos, oh->ot_handle);
2257         if (result == 0)
2258                 result = buf->lb_len;
2259
2260         osd_trans_exec_check(env, handle, OSD_OT_WRITE);
2261
2262         return result;
2263 }
2264
2265 static int osd_declare_fallocate(const struct lu_env *env,
2266                                  struct dt_object *dt, __u64 start, __u64 end,
2267                                  int mode, struct thandle *th)
2268 {
2269         struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
2270         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
2271         struct inode *inode = osd_dt_obj(dt)->oo_inode;
2272         long long quota_space = 0;
2273         /* 5 is max tree depth. (inode + 4 index blocks) */
2274         int depth = 5;
2275         int rc;
2276
2277         ENTRY;
2278
2279         /*
2280          * mode == 0 (which is standard prealloc) and PUNCH are supported;
2281          * the rest of the mode options are not supported yet.
2282          */
2283         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2284                 RETURN(-EOPNOTSUPP);
2285
2286         /* disable fallocate completely */
2287         if (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks < 0)
2288                 RETURN(-EOPNOTSUPP);
2289
2290         LASSERT(th);
2291         LASSERT(inode);
2292
2293         if (mode & FALLOC_FL_PUNCH_HOLE) {
2294                 rc = osd_declare_inode_qid(env, i_uid_read(inode),
2295                                            i_gid_read(inode),
2296                                            i_projid_read(inode), 0, oh,
2297                                            osd_dt_obj(dt), NULL, OSD_QID_BLK);
2298                 if (rc == 0)
2299                         rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
2300                 RETURN(rc);
2301         }
2302
2303         /* quota space for metadata blocks;
2304          * an approximate metadata estimate should be good enough.
2305          */
2306         quota_space += PAGE_SIZE;
2307         quota_space += depth * LDISKFS_BLOCK_SIZE(osd_sb(osd));
2308
2309         /* quota space should be reported in 1K blocks */
2310         quota_space = toqb(quota_space) + toqb(end - start) +
2311                       LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
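        /*
         * For example, assuming 4KB blocks and pages, preallocating 1MB
         * charges roughly toqb(4096 + 5 * 4096) + toqb(1MB) = 24 + 1024
         * quota blocks (1KB units) plus LDISKFS_META_TRANS_BLOCKS() for the
         * metadata transaction overhead.
         */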
2312
2313         /* We don't need to reserve credits for the whole fallocate here.
2314          * We reserve space only for metadata; fallocate credits are
2315          * extended as required
2316          */
2317         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
2318                                    i_projid_read(inode), quota_space, oh,
2319                                    osd_dt_obj(dt), NULL, OSD_QID_BLK);
2320         RETURN(rc);
2321 }
2322
2323 static int osd_fallocate_preallocate(const struct lu_env *env,
2324                                      struct dt_object *dt,
2325                                      __u64 start, __u64 end, int mode,
2326                                      struct thandle *th)
2327 {
2328         struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
2329         handle_t *handle = ldiskfs_journal_current_handle();
2330         unsigned int save_credits = oh->ot_credits;
2331         struct osd_object *obj = osd_dt_obj(dt);
2332         struct inode *inode = obj->oo_inode;
2333         struct ldiskfs_map_blocks map;
2334         unsigned int credits;
2335         ldiskfs_lblk_t blen;
2336         ldiskfs_lblk_t boff;
2337         loff_t new_size = 0;
2338         int depth = 0;
2339         int flags;
2340         int rc = 0;
2341
2342         ENTRY;
2343
2344         LASSERT(dt_object_exists(dt));
2345         LASSERT(osd_invariant(obj));
2346         LASSERT(inode != NULL);
2347
2348         CDEBUG(D_INODE, "fallocate: inode #%lu: start %llu end %llu mode %d\n",
2349                inode->i_ino, start, end, mode);
2350
2351         dquot_initialize(inode);
2352
2353         LASSERT(th);
2354
2355         boff = start >> inode->i_blkbits;
2356         blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
2357
2358         /* Create and mark new extents as either zero or unwritten */
2359         flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
2360                  !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
2361                 LDISKFS_GET_BLOCKS_CREATE_ZERO :
2362                 LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
2363 #ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
2364         if (mode & FALLOC_FL_KEEP_SIZE)
2365                 flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
2366 #endif
2367         inode_lock(inode);
2368
2369         if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) ||
2370             end > LDISKFS_I(inode)->i_disksize)) {
2371                 new_size = end;
2372                 rc = inode_newsize_ok(inode, new_size);
2373                 if (rc)
2374                         GOTO(out, rc);
2375         }
2376
2377         inode_dio_wait(inode);
2378
2379         map.m_lblk = boff;
2380         map.m_len = blen;
2381
2382         /* Don't normalize the request if it can fit in one extent so
2383          * that it doesn't get unnecessarily split into multiple extents.
2384          */
2385         if (blen <= EXT_UNWRITTEN_MAX_LEN)
2386                 flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE;
2387
2388         /*
2389          * credits to insert 1 extent into extent tree.
2390          */
2391         credits = osd_chunk_trans_blocks(inode, blen);
2392         depth = ext_depth(inode);
2393
2394         while (rc >= 0 && blen) {
2395                 loff_t epos;
2396
2397                 /*
2398                  * Recalculate credits when extent tree depth changes.
2399                  */
2400                 if (depth != ext_depth(inode)) {
2401                         credits = osd_chunk_trans_blocks(inode, blen);
2402                         depth = ext_depth(inode);
2403                 }
2404
2405                 /* TODO: quota check */
2406                 rc = osd_extend_restart_trans(handle, credits, inode);
2407                 if (rc)
2408                         break;
2409
2410                 rc = ldiskfs_map_blocks(handle, inode, &map, flags);
2411                 if (rc <= 0) {
2412                         CDEBUG(D_INODE,
2413                                "inode #%lu: block %u: len %u: ldiskfs_map_blocks returned %d\n",
2414                                inode->i_ino, map.m_lblk, map.m_len, rc);
2415                         ldiskfs_mark_inode_dirty(handle, inode);
2416                         break;
2417                 }
2418
2419                 map.m_lblk += rc;
2420                 map.m_len = blen = blen - rc;
2421                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
2422                 inode->i_ctime = current_time(inode);
2423                 if (new_size) {
2424                         if (epos > end)
2425                                 epos = end;
2426                         if (ldiskfs_update_inode_size(inode, epos) & 0x1)
2427                                 inode->i_mtime = inode->i_ctime;
2428 #ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
2429                 } else {
2430                         if (epos > inode->i_size)
2431                                 ldiskfs_set_inode_flag(inode,
2432                                                        LDISKFS_INODE_EOFBLOCKS);
2433 #endif
2434                 }
2435
2436                 ldiskfs_mark_inode_dirty(handle, inode);
2437         }
2438
2439 out:
2440         /* extend credits if needed for operations such as attribute set */
2441         if (rc >= 0)
2442                 rc = osd_extend_restart_trans(handle, save_credits, inode);
2443
2444         inode_unlock(inode);
2445
2446         RETURN(rc);
2447 }
2448
2449 static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
2450                                __u64 start, __u64 end, int mode,
2451                                struct thandle *th)
2452 {
2453         struct osd_object *obj = osd_dt_obj(dt);
2454         struct inode *inode = obj->oo_inode;
2455         struct osd_access_lock *al;
2456         struct osd_thandle *oh;
2457         int rc = 0, found = 0;
2458
2459         ENTRY;
2460
2461         LASSERT(dt_object_exists(dt));
2462         LASSERT(osd_invariant(obj));
2463         LASSERT(inode != NULL);
2464
2465         dquot_initialize(inode);
2466
2467         LASSERT(th);
2468         oh = container_of(th, struct osd_thandle, ot_super);
2469         LASSERT(oh->ot_handle->h_transaction != NULL);
2470
2471         list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
2472                 if (obj != al->tl_obj)
2473                         continue;
2474                 LASSERT(al->tl_shared == 0);
2475                 found = 1;
2476                 /* do actual punch in osd_trans_stop() */
2477                 al->tl_start = start;
2478                 al->tl_end = end;
2479                 al->tl_mode = mode;
2480                 al->tl_punch = true;
2481                 break;
2482         }
2483
2484         RETURN(rc);
2485 }
2486
2487 static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
2488                          __u64 start, __u64 end, int mode, struct thandle *th)
2489 {
2490         int rc;
2491
2492         ENTRY;
2493
2494         if (mode & FALLOC_FL_PUNCH_HOLE) {
2495                 /* punch */
2496                 rc = osd_fallocate_punch(env, dt, start, end, mode, th);
2497         } else {
2498                 /* standard preallocate */
2499                 rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
2500         }
2501         RETURN(rc);
2502 }
2503
2504 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
2505                              __u64 start, __u64 end, struct thandle *th)
2506 {
2507         struct osd_thandle *oh;
2508         struct inode       *inode;
2509         int                 rc;
2510         ENTRY;
2511
2512         LASSERT(th);
2513         oh = container_of(th, struct osd_thandle, ot_super);
2514
2515         /*
2516          * we don't need to reserve credits for the whole truncate:
2517          * that's not possible anyway, as truncate may need to free too
2518          * many blocks and that won't fit in a single transaction.
2519          * instead we reserve credits to change i_size and put the
2520          * inode onto the orphan list. if needed, truncate will extend
2521          * or restart the transaction
2522          */
2523         osd_trans_declare_op(env, oh, OSD_OT_PUNCH,
2524                              osd_dto_credits_noquota[DTO_ATTR_SET_BASE] + 3);
2525
2526         inode = osd_dt_obj(dt)->oo_inode;
2527         LASSERT(inode);
2528
2529         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
2530                                    i_projid_read(inode), 0, oh, osd_dt_obj(dt),
2531                                    NULL, OSD_QID_BLK);
2532
2533         if (rc == 0)
2534                 rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
2535
2536         RETURN(rc);
2537 }
2538
2539 static int osd_punch(const struct lu_env *env, struct dt_object *dt,
2540                      __u64 start, __u64 end, struct thandle *th)
2541 {
2542         struct osd_object *obj = osd_dt_obj(dt);
2543         struct osd_device *osd = osd_obj2dev(obj);
2544         struct inode *inode = obj->oo_inode;
2545         struct osd_access_lock *al;
2546         struct osd_thandle *oh;
2547         int rc = 0, found = 0;
2548         bool grow = false;
2549         ENTRY;
2550
2551         LASSERT(dt_object_exists(dt));
2552         LASSERT(osd_invariant(obj));
2553         LASSERT(inode != NULL);
2554         dquot_initialize(inode);
2555
2556         LASSERT(th);
2557         oh = container_of(th, struct osd_thandle, ot_super);
2558         LASSERT(oh->ot_handle->h_transaction != NULL);
2559
2560         /* we used to skip truncate to the current size to
2561          * optimize truncates on OST. with DoM we can
2562          * get attr_set to set a specific size (MDS_REINT)
2563          * and then get a truncate RPC which would essentially
2564          * be skipped. this is bad, so disable
2565          * this optimization on MDS until the clients stop
2566          * sending MDS_REINT (LU-11033) -bzzz
2567          */
2568         if (osd->od_is_ost && i_size_read(inode) == start)
2569                 RETURN(0);
2570
2571         osd_trans_exec_op(env, th, OSD_OT_PUNCH);
2572
2573         spin_lock(&inode->i_lock);
2574         if (i_size_read(inode) < start)
2575                 grow = true;
2576         i_size_write(inode, start);
2577         spin_unlock(&inode->i_lock);
2578         /* if object holds encrypted content, we need to make sure we truncate
2579          * on an encryption unit boundary, or subsequent reads will get
2580          * corrupted content
2581          */
2582         if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL &&
2583             start & ~LUSTRE_ENCRYPTION_MASK)
2584                 start = (start & LUSTRE_ENCRYPTION_MASK) +
2585                         LUSTRE_ENCRYPTION_UNIT_SIZE;
2586         ll_truncate_pagecache(inode, start);
2587
2588         /* optimize grow case */
2589         if (grow) {
2590                 osd_execute_truncate(obj);
2591                 GOTO(out, rc);
2592         }
2593
2594         inode_lock(inode);
2595         /* add to the orphan list to ensure truncate completes
2596          * if this transaction succeeds. ldiskfs_truncate()
2597          * will take the inode out of the list
2598          */
2599         rc = ldiskfs_orphan_add(oh->ot_handle, inode);
2600         inode_unlock(inode);
2601         if (rc != 0)
2602                 GOTO(out, rc);
2603
2604         list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
2605                 if (obj != al->tl_obj)
2606                         continue;
2607                 LASSERT(al->tl_shared == 0);
2608                 found = 1;
2609                 /* do actual truncate in osd_trans_stop() */
2610                 al->tl_truncate = 1;
2611                 break;
2612         }
2613         LASSERT(found);
2614
2615 out:
2616         RETURN(rc);
2617 }
2618
2619 static int fiemap_check_ranges(struct inode *inode,
2620                                u64 start, u64 len, u64 *new_len)
2621 {
2622         loff_t maxbytes;
2623
2624         *new_len = len;
2625
2626         if (len == 0)
2627                 return -EINVAL;
2628
2629         if (ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS))
2630                 maxbytes = inode->i_sb->s_maxbytes;
2631         else
2632                 maxbytes = LDISKFS_SB(inode->i_sb)->s_bitmap_maxbytes;
2633
2634         if (start > maxbytes)
2635                 return -EFBIG;
2636
2637         /*
2638          * Shrink request scope to what the fs can actually handle.
2639          */
2640         if (len > maxbytes || (maxbytes - len) < start)
2641                 *new_len = maxbytes - start;
2642
2643         return 0;
2644 }
2645
2646 /* So that the fiemap access checks can't overflow on 32 bit machines. */
2647 #define FIEMAP_MAX_EXTENTS     (UINT_MAX / sizeof(struct fiemap_extent))
2648
2649 static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
2650                           struct fiemap *fm)
2651 {
2652         struct fiemap_extent_info fieinfo = {0, };
2653         struct inode *inode = osd_dt_obj(dt)->oo_inode;
2654         u64 len;
2655         int rc;
2656
2657         LASSERT(inode);
2658         if (inode->i_op->fiemap == NULL)
2659                 return -EOPNOTSUPP;
2660
2661         if (fm->fm_extent_count > FIEMAP_MAX_EXTENTS)
2662                 return -EINVAL;
2663
2664         rc = fiemap_check_ranges(inode, fm->fm_start, fm->fm_length, &len);
2665         if (rc)
2666                 return rc;
2667
2668         fieinfo.fi_flags = fm->fm_flags;
2669         fieinfo.fi_extents_max = fm->fm_extent_count;
2670         fieinfo.fi_extents_start = fm->fm_extents;
2671
2672         if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
2673                 filemap_write_and_wait(inode->i_mapping);
2674
2675         rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
2676         fm->fm_flags = fieinfo.fi_flags;
2677         fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
2678
2679         return rc;
2680 }
2681
2682 static int osd_ladvise(const struct lu_env *env, struct dt_object *dt,
2683                        __u64 start, __u64 end, enum lu_ladvise_type advice)
2684 {
2685         struct osd_object *obj = osd_dt_obj(dt);
2686         int rc = 0;
2687         ENTRY;
2688
2689         switch (advice) {
2690         case LU_LADVISE_DONTNEED:
2691                 if (end)
2692                         invalidate_mapping_pages(obj->oo_inode->i_mapping,
2693                                                  start >> PAGE_SHIFT,
2694                                                  (end - 1) >> PAGE_SHIFT);
2695                 break;
2696         default:
2697                 rc = -ENOTSUPP;
2698                 break;
2699         }
2700
2701         RETURN(rc);
2702 }
2703
2704 static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt,
2705                         loff_t offset, int whence)
2706 {
2707         struct osd_object *obj = osd_dt_obj(dt);
2708         struct osd_device *dev = osd_obj2dev(obj);
2709         struct inode *inode = obj->oo_inode;
2710         struct file *file;
2711         loff_t result;
2712
2713         ENTRY;
2714         LASSERT(dt_object_exists(dt));
2715         LASSERT(osd_invariant(obj));
2716         LASSERT(inode);
2717         LASSERT(offset >= 0);
2718
2719         file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
2720                                  inode->i_fop);
2721         if (IS_ERR(file))
2722                 RETURN(PTR_ERR(file));
2723
2724         file->f_mode |= FMODE_64BITHASH;
2725         result = file->f_op->llseek(file, offset, whence);
2726         ihold(inode);
2727         fput(file);
2728         /*
2729          * If 'offset' is beyond the end of the object file, treat it not as
2730          * an error but as a valid case for SEEK_HOLE and return 'offset' as
2731          * the result. LOV will decide if it is beyond the real end of file.
2732          */
2733         if (whence == SEEK_HOLE && result == -ENXIO)
2734                 result = offset;
2735
2736         CDEBUG(D_INFO, "seek %s from %lld: %lld\n", whence == SEEK_HOLE ?
2737                        "hole" : "data", offset, result);
2738         RETURN(result);
2739 }
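/*
 * A worked example of the SEEK_HOLE convention above (sizes are illustrative
 * assumptions): if the object's local size is 4096 and the caller asks for
 * osd_lseek(env, dt, 1048576, SEEK_HOLE), the underlying ->llseek() returns
 * -ENXIO because 1048576 is past the local end of file; osd_lseek() converts
 * that into the offset itself (1048576) and lets LOV decide whether the
 * striped file really ends there.  SEEK_DATA past the end of file still
 * returns -ENXIO to the caller.
 */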
2740
2741 /*
2742  * In some cases we may need to declare methods for objects that are still
2743  * being created, e.g., when we create a symlink.
2744  */
2745 const struct dt_body_operations osd_body_ops_new = {
2746         .dbo_declare_write = osd_declare_write,
2747 };
2748
2749 const struct dt_body_operations osd_body_ops = {
2750         .dbo_read                       = osd_read,
2751         .dbo_declare_write              = osd_declare_write,
2752         .dbo_write                      = osd_write,
2753         .dbo_bufs_get                   = osd_bufs_get,
2754         .dbo_bufs_put                   = osd_bufs_put,
2755         .dbo_write_prep                 = osd_write_prep,
2756         .dbo_declare_write_commit       = osd_declare_write_commit,
2757         .dbo_write_commit               = osd_write_commit,
2758         .dbo_read_prep                  = osd_read_prep,
2759         .dbo_declare_punch              = osd_declare_punch,
2760         .dbo_punch                      = osd_punch,
2761         .dbo_fiemap_get                 = osd_fiemap_get,
2762         .dbo_ladvise                    = osd_ladvise,
2763         .dbo_declare_fallocate          = osd_declare_fallocate,
2764         .dbo_fallocate                  = osd_fallocate,
2765         .dbo_lseek                      = osd_lseek,
2766 };
2767
2768 /**
2769  * Get a truncate lock
2770  *
2771  * In order to take a multi-transaction truncate out of the main transaction,
2772  * we let the caller grab a lock on the object passed in. The lock can be
2773  * shared (for writes) or exclusive (for truncate). It is not allowed to mix
2774  * truncate and write in the same transaction handle (not to be confused with
2775  * a big ldiskfs transaction containing lots of handles).
2776  * The lock must be taken at declaration time.
2777  *
2778  * \param obj           object to lock
2779  * \param oh            transaction handle
2780  * \param shared        shared (write) or exclusive (truncate) lock
2781  *
2782  * \retval 0            lock is granted
2783  * \retval -ENOMEM      no memory to allocate the lock
2784  */
2785 int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, bool shared)
2786 {
2787         struct osd_access_lock *al, *tmp;
2788
2789         LASSERT(obj);
2790         LASSERT(oh);
2791
2792         list_for_each_entry(tmp, &oh->ot_trunc_locks, tl_list) {
2793                 if (tmp->tl_obj != obj)
2794                         continue;
2795                 LASSERT(tmp->tl_shared == shared);
2796                 /* found same lock */
2797                 return 0;
2798         }
2799
2800         OBD_ALLOC_PTR(al);
2801         if (unlikely(al == NULL))
2802                 return -ENOMEM;
2803         al->tl_obj = obj;
2804         al->tl_truncate = false;
2805         if (shared)
2806                 down_read(&obj->oo_ext_idx_sem);
2807         else
2808                 down_write(&obj->oo_ext_idx_sem);
2809         al->tl_shared = shared;
2810         lu_object_get(&obj->oo_dt.do_lu);
2811
2812         list_add(&al->tl_list, &oh->ot_trunc_locks);
2813
2814         return 0;
2815 }
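/*
 * A minimal usage sketch for the truncate lock (illustrative only; the
 * declare-time caller and its arguments are assumptions, not code from this
 * file):
 *
 *	declare phase, same thread that will later run the transaction:
 *		rc = osd_trunc_lock(obj, oh, false);	exclusive, for truncate
 *	or
 *		rc = osd_trunc_lock(obj, oh, true);	shared, for writes
 *
 * The call takes the object's oo_ext_idx_sem (down_read() for shared,
 * down_write() for exclusive), grabs a reference on the object and queues
 * the record on oh->ot_trunc_locks; it is released later by
 * osd_trunc_unlock_all() on that same list.  Calling it twice for the same
 * object and transaction is a no-op, but mixing shared and exclusive
 * requests for one object in one handle trips the LASSERT above.
 */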
2816
2817 void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
2818 {
2819         struct osd_access_lock *al, *tmp;
2820
2821         list_for_each_entry_safe(al, tmp, list, tl_list) {
2822                 if (al->tl_shared)
2823                         up_read(&al->tl_obj->oo_ext_idx_sem);
2824                 else
2825                         up_write(&al->tl_obj->oo_ext_idx_sem);
2826                 osd_object_put(env, al->tl_obj);
2827                 list_del(&al->tl_list);
2828                 OBD_FREE_PTR(al);
2829         }
2830 }
2831
2832 /* For a partial-page punch, flush the punch range to disk immediately */
2833 static void osd_partial_page_flush_punch(struct osd_device *d,
2834                                          struct inode *inode, loff_t start,
2835                                          loff_t end)
2836 {
2837         if (osd_use_page_cache(d)) {
2838                 filemap_fdatawrite_range(inode->i_mapping, start, end);
2839         } else {
2840                 /* Note we use the "wait" version to ensure the I/O is complete */
2841                 filemap_write_and_wait_range(inode->i_mapping, start,
2842                                              end);
2843                 invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
2844                                          end >> PAGE_SHIFT);
2845         }
2846 }
2847
2848 /*
2849  * For a partial-page truncate, flush the page to disk immediately to
2850  * avoid data corruption during a later direct disk write.  b=17397
2851  */
2852 static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
2853                                    loff_t offset)
2854 {
2855         if (!(offset & ~PAGE_MASK))
2856                 return;
2857
2858         if (osd_use_page_cache(d)) {
2859                 filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
2860         } else {
2861                 /* Note we use the "wait" version to ensure the I/O is complete */
2862                 filemap_write_and_wait_range(inode->i_mapping, offset,
2863                                              offset + 1);
2864                 invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
2865                                          offset >> PAGE_SHIFT);
2866         }
2867 }
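/*
 * A worked example of the partial-page check above, assuming a 4096-byte
 * PAGE_SIZE (illustrative value): a truncate to size 8192 is page-aligned,
 * so (offset & ~PAGE_MASK) == 0 and nothing needs flushing; a truncate to
 * size 6000 leaves 1904 bytes of the last page in use, so that page is
 * written out (and, without page cache, also invalidated) before any later
 * direct disk write can land on it.
 */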
2868
2869 void osd_execute_truncate(struct osd_object *obj)
2870 {
2871         struct osd_device *d = osd_obj2dev(obj);
2872         struct inode *inode = obj->oo_inode;
2873         __u64 size;
2874
2875         /* simulate a crash before (or in the middle of) the delayed truncate */
2876         if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FAIL_AT_TRUNCATE)) {
2877                 struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
2878                 struct ldiskfs_sb_info *sbi = LDISKFS_SB(inode->i_sb);
2879
2880                 mutex_lock(&sbi->s_orphan_lock);
2881                 list_del_init(&ei->i_orphan);
2882                 mutex_unlock(&sbi->s_orphan_lock);
2883                 return;
2884         }
2885
2886         size = i_size_read(inode);
2887         inode_lock(inode);
2888         /* if the object holds encrypted content, make sure we truncate on an
2889          * encryption unit boundary, or block content will get corrupted
2890          */
2891         if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL &&
2892             size & ~LUSTRE_ENCRYPTION_MASK)
2893                 inode->i_size = (size & LUSTRE_ENCRYPTION_MASK) +
2894                         LUSTRE_ENCRYPTION_UNIT_SIZE;
2895         ldiskfs_truncate(inode);
2896         inode_unlock(inode);
2897         if (inode->i_size != size) {
2898                 spin_lock(&inode->i_lock);
2899                 i_size_write(inode, size);
2900                 LDISKFS_I(inode)->i_disksize = size;
2901                 spin_unlock(&inode->i_lock);
2902                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
2903         }
2904         osd_partial_page_flush(d, inode, size);
2905 }
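/*
 * A worked example of the encryption-boundary handling above, assuming a
 * 4096-byte LUSTRE_ENCRYPTION_UNIT_SIZE (illustrative value): for an
 * encrypted object being truncated to size 5000, 5000 is not a multiple of
 * 4096, so i_size is temporarily raised to 8192 and ldiskfs_truncate() cuts
 * the blocks at that encryption unit boundary; afterwards i_size and
 * i_disksize are set back to 5000 and the inode is marked dirty, so the
 * visible size is unchanged while no encryption unit is left half-truncated.
 */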
2906
2907 static int osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
2908                              loff_t start, loff_t end, int mode)
2909 {
2910         struct osd_device *d = osd_obj2dev(obj);
2911         struct inode *inode = obj->oo_inode;
2912         struct file *file;
2913         int rc;
2914
2915         file = alloc_file_pseudo(inode, d->od_mnt, "/", O_NOATIME,
2916                                  inode->i_fop);
2917         if (IS_ERR(file))
2918                 RETURN(PTR_ERR(file));
2919
2920         file->f_mode |= FMODE_64BITHASH;
2921         rc = file->f_op->fallocate(file, mode, start, end - start);
2922         ihold(inode);
2923         fput(file);
2924         if (rc == 0)
2925                 osd_partial_page_flush_punch(d, inode, start, end - 1);
2926         return rc;
2927 }
2928
2929 int osd_process_truncates(const struct lu_env *env, struct list_head *list)
2930 {
2931         struct osd_access_lock *al;
2932         int rc = 0;
2933
2934         LASSERT(!journal_current_handle());
2935
2936         list_for_each_entry(al, list, tl_list) {
2937                 if (al->tl_shared)
2938                         continue;
2939                 if (al->tl_truncate)
2940                         osd_execute_truncate(al->tl_obj);
2941                 else if (al->tl_punch)
2942                         rc = osd_execute_punch(env, al->tl_obj, al->tl_start,
2943                                                al->tl_end, al->tl_mode);
2944         }
2945
2946         return rc;
2947 }
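/*
 * A sketch of the overall deferred-truncate flow (the exact call sites are
 * assumptions based on the assertion above that no journal handle is held;
 * they are not shown in this file):
 *
 *	declare:	osd_trunc_lock(obj, oh, false);
 *			(the tl_truncate/tl_punch flags are set by the
 *			 truncate/punch paths, not shown here)
 *	execute:	the transaction handle runs and is stopped
 *	after stop:	osd_process_truncates(env, &oh->ot_trunc_locks);
 *			osd_trunc_unlock_all(env, &oh->ot_trunc_locks);
 *
 * osd_process_truncates() only acts on exclusive entries, running either the
 * deferred ldiskfs truncate or the fallocate-based punch outside the journal
 * handle; osd_trunc_unlock_all() then drops the semaphores and the object
 * references taken by osd_trunc_lock().
 */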