LU-15117 ofd: don't take lock for dt_bufs_get()
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_io.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * lustre/osd/osd_io.c
32  *
33  * body operations
34  *
35  * Author: Nikita Danilov <nikita@clusterfs.com>
36  * Author: Alex Zhuravlev <bzzz@whamcloud.com>
37  *
38  */
39
40 #define DEBUG_SUBSYSTEM S_OSD
41
42 /* prerequisite for linux/xattr.h */
43 #include <linux/types.h>
44 /* prerequisite for linux/xattr.h */
45 #include <linux/fs.h>
46 #include <linux/mm.h>
47 #include <linux/swap.h>
48 #include <linux/pagevec.h>
49
50 /*
51  * struct OBD_{ALLOC,FREE}*()
52  * OBD_FAIL_CHECK
53  */
54 #include <obd_support.h>
55
56 #include "osd_internal.h"
57
58 /* ext_depth() */
59 #include <ldiskfs/ldiskfs_extents.h>
60 #include <ldiskfs/ldiskfs.h>
61
62 static inline bool osd_use_page_cache(struct osd_device *d)
63 {
64         /* do not use pagecache if write and read caching are disabled */
65         if (d->od_writethrough_cache + d->od_read_cache == 0)
66                 return false;
67         /* use pagecache by default */
68         return true;
69 }
70
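/*
 * Reset @iobuf for a new request of @pages pages; if the preallocated
 * page/block/lnb arrays are too small, grow them (doubling from 256
 * pages) so they can hold the whole request.
 */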
71 static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf,
72                             int rw, int line, int pages)
73 {
74         int blocks, i;
75
76         LASSERTF(iobuf->dr_elapsed_valid == 0,
77                  "iobuf %p, reqs %d, rw %d, line %d\n", iobuf,
78                  atomic_read(&iobuf->dr_numreqs), iobuf->dr_rw,
79                  iobuf->dr_init_at);
80         LASSERT(pages <= PTLRPC_MAX_BRW_PAGES);
81
82         init_waitqueue_head(&iobuf->dr_wait);
83         atomic_set(&iobuf->dr_numreqs, 0);
84         iobuf->dr_npages = 0;
85         iobuf->dr_error = 0;
86         iobuf->dr_dev = d;
87         iobuf->dr_frags = 0;
88         iobuf->dr_elapsed = ktime_set(0, 0);
89         /* must be counted before, so assert */
90         iobuf->dr_rw = rw;
91         iobuf->dr_init_at = line;
92
93         blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits);
94         if (iobuf->dr_bl_buf.lb_len >= blocks * sizeof(iobuf->dr_blocks[0])) {
95                 LASSERT(iobuf->dr_pg_buf.lb_len >=
96                         pages * sizeof(iobuf->dr_pages[0]));
97                 return 0;
98         }
99
100         /* start with 1MB for 4K blocks */
101         i = 256;
102         while (i <= PTLRPC_MAX_BRW_PAGES && i < pages)
103                 i <<= 1;
104
105         CDEBUG(D_OTHER, "realloc %u for %u (%u) pages\n",
106                (unsigned int)(pages * sizeof(iobuf->dr_pages[0])), i, pages);
107         pages = i;
108         blocks = pages * (PAGE_SIZE >> osd_sb(d)->s_blocksize_bits);
109         iobuf->dr_max_pages = 0;
110         CDEBUG(D_OTHER, "realloc %u for %u blocks\n",
111                (unsigned int)(blocks * sizeof(iobuf->dr_blocks[0])), blocks);
112
113         lu_buf_realloc(&iobuf->dr_bl_buf, blocks * sizeof(iobuf->dr_blocks[0]));
114         iobuf->dr_blocks = iobuf->dr_bl_buf.lb_buf;
115         if (unlikely(iobuf->dr_blocks == NULL))
116                 return -ENOMEM;
117
118         lu_buf_realloc(&iobuf->dr_pg_buf, pages * sizeof(iobuf->dr_pages[0]));
119         iobuf->dr_pages = iobuf->dr_pg_buf.lb_buf;
120         if (unlikely(iobuf->dr_pages == NULL))
121                 return -ENOMEM;
122
123         lu_buf_realloc(&iobuf->dr_lnb_buf,
124                        pages * sizeof(iobuf->dr_lnbs[0]));
125         iobuf->dr_lnbs = iobuf->dr_lnb_buf.lb_buf;
126         if (unlikely(iobuf->dr_lnbs == NULL))
127                 return -ENOMEM;
128
129         iobuf->dr_max_pages = pages;
130
131         return 0;
132 }
133 #define osd_init_iobuf(dev, iobuf, rw, pages) \
134         __osd_init_iobuf(dev, iobuf, rw, __LINE__, pages)
135
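/* append one page and its niobuf_local descriptor to the iobuf arrays */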
136 static void osd_iobuf_add_page(struct osd_iobuf *iobuf,
137                                struct niobuf_local *lnb)
138 {
139         LASSERT(iobuf->dr_npages < iobuf->dr_max_pages);
140         iobuf->dr_pages[iobuf->dr_npages] = lnb->lnb_page;
141         iobuf->dr_lnbs[iobuf->dr_npages] = lnb;
142         iobuf->dr_npages++;
143 }
144
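/*
 * Record per-device brw stats (number of DIO fragments and total I/O
 * time) once all bios of this iobuf have completed.
 */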
145 void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
146 {
147         int rw = iobuf->dr_rw;
148
149         if (iobuf->dr_elapsed_valid) {
150                 struct brw_stats *h = &d->od_brw_stats;
151
152                 iobuf->dr_elapsed_valid = 0;
153                 LASSERT(iobuf->dr_dev == d);
154                 LASSERT(iobuf->dr_frags > 0);
155                 lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_DIO_FRAGS+rw],
156                                       iobuf->dr_frags);
157                 lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_IO_TIME+rw],
158                                            ktime_to_ms(iobuf->dr_elapsed));
159         }
160 }
161
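/*
 * Per-bio completion callback: mark read pages uptodate, latch the
 * first error in the iobuf, and wake up the waiter once the last
 * in-flight bio for this iobuf has completed.
 */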
162 #ifdef HAVE_BIO_ENDIO_USES_ONE_ARG
163 static void dio_complete_routine(struct bio *bio)
164 {
165         int error = blk_status_to_errno(bio->bi_status);
166 #else
167 static void dio_complete_routine(struct bio *bio, int error)
168 {
169 #endif
170         struct osd_iobuf *iobuf = bio->bi_private;
171         struct bio_vec *bvl;
172
173         /* CAVEAT EMPTOR: possibly in IRQ context
174          * DO NOT record procfs stats here!!!
175          */
176
177         if (unlikely(iobuf == NULL)) {
178                 CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/>; you will probably have to reboot this node.\n");
179                 CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf)
180                        ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
181                        bio->bi_next, (unsigned long)bio->bi_flags,
182                        (unsigned int)bio->bi_opf, bio->bi_vcnt, bio_idx(bio),
183                        bio_sectors(bio) << 9, bio->bi_end_io,
184                        atomic_read(&bio->__bi_cnt),
185                        bio->bi_private);
186                 return;
187         }
188
189         /* the check is outside of the loop for performance reasons -bzzz */
190         if (!bio_data_dir(bio)) {
191                 DECLARE_BVEC_ITER_ALL(iter_all);
192
193                 bio_for_each_segment_all(bvl, bio, iter_all) {
194                         if (likely(error == 0))
195                                 SetPageUptodate(bvl_to_page(bvl));
196                         LASSERT(PageLocked(bvl_to_page(bvl)));
197                 }
198                 atomic_dec(&iobuf->dr_dev->od_r_in_flight);
199         } else {
200                 atomic_dec(&iobuf->dr_dev->od_w_in_flight);
201         }
202
203         /* any real error is good enough -bzzz */
204         if (error != 0 && iobuf->dr_error == 0)
205                 iobuf->dr_error = error;
206
207         /*
208          * set dr_elapsed before dr_numreqs turns to 0, otherwise
209          * it's possible that service thread will see dr_numreqs
210          * is zero, but dr_elapsed is not set yet, leading to lost
211          * data in this processing and an assertion in a subsequent
212          * call to OSD.
213          */
214         if (atomic_read(&iobuf->dr_numreqs) == 1) {
215                 ktime_t now = ktime_get();
216
217                 iobuf->dr_elapsed = ktime_sub(now, iobuf->dr_start_time);
218                 iobuf->dr_elapsed_valid = 1;
219         }
220         if (atomic_dec_and_test(&iobuf->dr_numreqs))
221                 wake_up(&iobuf->dr_wait);
222
223         /* Completed bios used to be chained off iobuf->dr_bios and freed in
224          * filter_clear_dreq().  It was then possible to exhaust the biovec-256
225          * mempool when serious on-disk fragmentation was encountered,
226          * deadlocking the OST.  The bios are now released as soon as complete
227          * so the pool cannot be exhausted while IOs are competing. b=10076
228          */
229         bio_put(bio);
230 }
231
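/*
 * Account one more in-flight bio for this iobuf and update the
 * per-device in-flight and disk I/O size histograms.
 */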
232 static void record_start_io(struct osd_iobuf *iobuf, int size)
233 {
234         struct osd_device *osd = iobuf->dr_dev;
235         struct brw_stats *h = &osd->od_brw_stats;
236
237         iobuf->dr_frags++;
238         atomic_inc(&iobuf->dr_numreqs);
239
240         if (iobuf->dr_rw == 0) {
241                 atomic_inc(&osd->od_r_in_flight);
242                 lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_RPC_HIST],
243                                  atomic_read(&osd->od_r_in_flight));
244                 lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_DISK_IOSIZE],
245                                            size);
246         } else if (iobuf->dr_rw == 1) {
247                 atomic_inc(&osd->od_w_in_flight);
248                 lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_RPC_HIST],
249                                  atomic_read(&osd->od_w_in_flight));
250                 lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_W_DISK_IOSIZE],
251                                            size);
252         } else {
253                 LBUG();
254         }
255 }
256
257 static void osd_submit_bio(int rw, struct bio *bio)
258 {
259         LASSERTF(rw == 0 || rw == 1, "%x\n", rw);
260 #ifdef HAVE_SUBMIT_BIO_2ARGS
261         submit_bio(rw ? WRITE : READ, bio);
262 #else
263         bio->bi_opf |= rw;
264         submit_bio(bio);
265 #endif
266 }
267
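/* return 1 if @sector immediately follows the last sector of @bio */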
268 static int can_be_merged(struct bio *bio, sector_t sector)
269 {
270         if (bio == NULL)
271                 return 0;
272
273         return bio_end_sector(bio) == sector ? 1 : 0;
274 }
275
276 #if IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)
277 /*
278  * This function will change the data written, thus it should only be
279  * used when checking the data integrity feature
280  */
281 static void bio_integrity_fault_inject(struct bio *bio)
282 {
283         struct bio_vec *bvec;
284         DECLARE_BVEC_ITER_ALL(iter_all);
285         void *kaddr;
286         char *addr;
287
288         bio_for_each_segment_all(bvec, bio, iter_all) {
289                 struct page *page = bvec->bv_page;
290
291                 kaddr = kmap(page);
292                 addr = kaddr;
293                 *addr = ~(*addr);
294                 kunmap(page);
295                 break;
296         }
297 }
298
299 static int bio_dif_compare(__u16 *expected_guard_buf, void *bio_prot_buf,
300                            unsigned int sectors, int tuple_size)
301 {
302         __u16 *expected_guard;
303         __u16 *bio_guard;
304         int i;
305
306         expected_guard = expected_guard_buf;
307         for (i = 0; i < sectors; i++) {
308                 bio_guard = (__u16 *)bio_prot_buf;
309                 if (*bio_guard != *expected_guard) {
310                         CERROR(
311                                "unexpected guard tags on sector %d: expected guard %u, bio guard %u, sectors %u, tuple size %d\n",
312                                i, *expected_guard, *bio_guard, sectors,
313                                tuple_size);
314                         return -EIO;
315                 }
316                 expected_guard++;
317                 bio_prot_buf += tuple_size;
318         }
319         return 0;
320 }
321
322 static int osd_bio_integrity_compare(struct bio *bio, struct block_device *bdev,
323                                      struct osd_iobuf *iobuf, int index)
324 {
325         struct blk_integrity *bi = bdev_get_integrity(bdev);
326         struct bio_integrity_payload *bip = bio->bi_integrity;
327         struct niobuf_local *lnb = NULL;
328         unsigned short sector_size = blk_integrity_interval(bi);
329         void *bio_prot_buf = page_address(bip->bip_vec->bv_page) +
330                 bip->bip_vec->bv_offset;
331         struct bio_vec *bv;
332         sector_t sector = bio_start_sector(bio);
333         unsigned int i, sectors, total;
334         DECLARE_BVEC_ITER_ALL(iter_all);
335         __u16 *expected_guard;
336         int rc;
337
338         total = 0;
339         bio_for_each_segment_all(bv, bio, iter_all) {
340                 for (i = index; i < iobuf->dr_npages; i++) {
341                         if (iobuf->dr_pages[i] == bv->bv_page) {
342                                 lnb = iobuf->dr_lnbs[i];
343                                 break;
344                         }
345                 }
346                 if (!lnb)
347                         continue;
348                 expected_guard = lnb->lnb_guards;
349                 sectors = bv->bv_len / sector_size;
350                 if (lnb->lnb_guard_rpc) {
351                         rc = bio_dif_compare(expected_guard, bio_prot_buf,
352                                              sectors, bi->tuple_size);
353                         if (rc)
354                                 return rc;
355                 }
356
357                 sector += sectors;
358                 bio_prot_buf += sectors * bi->tuple_size;
359                 total += sectors * bi->tuple_size;
360                 LASSERT(total <= bip_size(bio->bi_integrity));
361                 index++;
362                 lnb = NULL;
363         }
364         return 0;
365 }
366
367 static int osd_bio_integrity_handle(struct osd_device *osd, struct bio *bio,
368                                     struct osd_iobuf *iobuf,
369                                     int start_page_idx, bool fault_inject,
370                                     bool integrity_enabled)
371 {
372         struct super_block *sb = osd_sb(osd);
373         integrity_gen_fn *generate_fn = NULL;
374         integrity_vrfy_fn *verify_fn = NULL;
375         int rc;
376
377         ENTRY;
378
379         if (!integrity_enabled)
380                 RETURN(0);
381
382         rc = osd_get_integrity_profile(osd, &generate_fn, &verify_fn);
383         if (rc)
384                 RETURN(rc);
385
386         rc = bio_integrity_prep_fn(bio, generate_fn, verify_fn);
387         if (rc)
388                 RETURN(rc);
389
390         /* Verify and inject fault only when writing */
391         if (iobuf->dr_rw == 1) {
392                 if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_OST_INTEGRITY_CMP))) {
393                         rc = osd_bio_integrity_compare(bio, sb->s_bdev, iobuf,
394                                                        start_page_idx);
395                         if (rc)
396                                 RETURN(rc);
397                 }
398
399                 if (unlikely(fault_inject))
400                         bio_integrity_fault_inject(bio);
401         }
402
403         RETURN(0);
404 }
405
406 #ifdef HAVE_BIO_INTEGRITY_PREP_FN
407 #  ifdef HAVE_BIO_ENDIO_USES_ONE_ARG
408 static void dio_integrity_complete_routine(struct bio *bio)
409 #  else
410 static void dio_integrity_complete_routine(struct bio *bio, int error)
411 #  endif
412 {
413         struct osd_bio_private *bio_private = bio->bi_private;
414
415         bio->bi_private = bio_private->obp_iobuf;
416         osd_dio_complete_routine(bio, error);
417
418         OBD_FREE_PTR(bio_private);
419 }
420 #endif /* HAVE_BIO_INTEGRITY_PREP_FN */
421 #else  /* !CONFIG_BLK_DEV_INTEGRITY */
422 #define osd_bio_integrity_handle(osd, bio, iobuf, start_page_idx, \
423                                  fault_inject, integrity_enabled) 0
424 #endif /* CONFIG_BLK_DEV_INTEGRITY */
425
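/*
 * Set up the completion callback and private data of @bio; when
 * integrity is enabled an osd_bio_private is allocated so the callback
 * can recover the iobuf and the starting page index.
 */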
426 static int osd_bio_init(struct bio *bio, struct osd_iobuf *iobuf,
427                         bool integrity_enabled, int start_page_idx,
428                         struct osd_bio_private **pprivate)
429 {
430         ENTRY;
431
432         *pprivate = NULL;
433
434 #ifdef HAVE_BIO_INTEGRITY_PREP_FN
435         if (integrity_enabled) {
436                 struct osd_bio_private *bio_private = NULL;
437
438                 OBD_ALLOC_GFP(bio_private, sizeof(*bio_private), GFP_NOIO);
439                 if (bio_private == NULL)
440                         RETURN(-ENOMEM);
441                 bio->bi_end_io = dio_integrity_complete_routine;
442                 bio->bi_private = bio_private;
443                 bio_private->obp_start_page_idx = start_page_idx;
444                 bio_private->obp_iobuf = iobuf;
445                 *pprivate = bio_private;
446         } else
447 #endif
448         {
449                 bio->bi_end_io = dio_complete_routine;
450                 bio->bi_private = iobuf;
451         }
452
453         RETURN(0);
454 }
455
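/*
 * Mark the lnbs whose pages were fully covered by this I/O with
 * OBD_BRW_DONE so they are filtered out if the transaction is
 * restarted (see osd_declare_write_commit()).
 */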
456 static void osd_mark_page_io_done(struct osd_iobuf *iobuf,
457                                   struct inode *inode,
458                                   sector_t start_blocks,
459                                   sector_t count)
460 {
461         struct niobuf_local *lnb;
462         int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
463         pgoff_t pg_start, pg_end;
464
465         pg_start = start_blocks / blocks_per_page;
466         if (start_blocks % blocks_per_page)
467                 pg_start++;
468         if (count >= blocks_per_page)
469                 pg_end = (start_blocks + count -
470                           blocks_per_page) / blocks_per_page;
471         else
472                 return; /* nothing to mark */
473         for ( ; pg_start <= pg_end; pg_start++) {
474                 lnb = iobuf->dr_lnbs[pg_start];
475                 lnb->lnb_flags |= OBD_BRW_DONE;
476         }
477 }
478
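/*
 * Build and submit bios for the blocks already mapped in @iobuf,
 * merging physically contiguous blocks and zero-filling holes on read.
 * Only reads (and fault-injected writes) are waited for here; write
 * completion is handled later in osd_trans_stop().
 */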
479 static int osd_do_bio(struct osd_device *osd, struct inode *inode,
480                       struct osd_iobuf *iobuf, sector_t start_blocks,
481                       sector_t count)
482 {
483         int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
484         struct page **pages = iobuf->dr_pages;
485         int npages = iobuf->dr_npages;
486         sector_t *blocks = iobuf->dr_blocks;
487         struct super_block *sb = inode->i_sb;
488         int sector_bits = sb->s_blocksize_bits - 9;
489         unsigned int blocksize = sb->s_blocksize;
490         struct block_device *bdev = sb->s_bdev;
491         struct osd_bio_private *bio_private = NULL;
492         struct bio *bio = NULL;
493         int bio_start_page_idx;
494         struct page *page;
495         unsigned int page_offset;
496         sector_t sector;
497         int nblocks;
498         int block_idx, block_idx_end;
499         int page_idx, page_idx_start;
500         int i;
501         int rc = 0;
502         bool fault_inject;
503         bool integrity_enabled;
504         struct blk_plug plug;
505         int blocks_left_page;
506
507         ENTRY;
508
509         fault_inject = OBD_FAIL_CHECK(OBD_FAIL_OST_INTEGRITY_FAULT);
510         LASSERT(iobuf->dr_npages == npages);
511
512         integrity_enabled = bdev_integrity_enabled(bdev, iobuf->dr_rw);
513
514         osd_brw_stats_update(osd, iobuf);
515         iobuf->dr_start_time = ktime_get();
516
517         if (!count)
518                 count = npages * blocks_per_page;
519         block_idx_end = start_blocks + count;
520
521         blk_start_plug(&plug);
522
523         page_idx_start = start_blocks / blocks_per_page;
524         for (page_idx = page_idx_start, block_idx = start_blocks;
525              block_idx < block_idx_end; page_idx++,
526              block_idx += blocks_left_page) {
527                 /* For cases where the filesystem's blocksize is not the
528                  * same as PAGE_SIZE (e.g. ARM with PAGE_SIZE=64KB and
529                  * blocksize=4KB), there will be multiple blocks to
530                  * read/write per page. Also, the start and end block may
531                  * not be aligned to the start and end of the page, so the
532                  * first page may skip some blocks at the start ("i != 0",
533                  * "blocks_left_page" is reduced), and the last page may
534                  * skip some blocks at the end (limited by "count").
535                  */
536                 page = pages[page_idx];
537                 LASSERT(page_idx < iobuf->dr_npages);
538
539                 i = block_idx % blocks_per_page;
540                 blocks_left_page = blocks_per_page - i;
541                 if (block_idx + blocks_left_page > block_idx_end)
542                         blocks_left_page = block_idx_end - block_idx;
543                 page_offset = i * blocksize;
544                 for (i = 0; i < blocks_left_page;
545                      i += nblocks, page_offset += blocksize * nblocks) {
546                         nblocks = 1;
547
548                         if (blocks[block_idx + i] == 0) {  /* hole */
549                                 LASSERTF(iobuf->dr_rw == 0,
550                                          "page_idx %u, block_idx %u, i %u, "
551                                          "start_blocks: %llu, count: %llu, npages: %d\n",
552                                          page_idx, block_idx, i,
553                                          (unsigned long long)start_blocks,
554                                          (unsigned long long)count, npages);
555                                 memset(kmap(page) + page_offset, 0, blocksize);
556                                 kunmap(page);
557                                 continue;
558                         }
559
560                         sector = (sector_t)blocks[block_idx + i] << sector_bits;
561
562                         /* Additional contiguous file blocks? */
563                         while (i + nblocks < blocks_left_page &&
564                                (sector + (nblocks << sector_bits)) ==
565                                ((sector_t)blocks[block_idx + i + nblocks] <<
566                                  sector_bits))
567                                 nblocks++;
568
569                         if (bio && can_be_merged(bio, sector) &&
570                             bio_add_page(bio, page, blocksize * nblocks,
571                                          page_offset) != 0)
572                                 continue;       /* added this frag OK */
573
574                         if (bio != NULL) {
575                                 struct request_queue *q = bio_get_queue(bio);
576                                 unsigned int bi_size = bio_sectors(bio) << 9;
577
578                                 /* Dang! I have to fragment this I/O */
579                                 CDEBUG(D_INODE,
580                                        "bio++ sz %d vcnt %d(%d) sectors %d(%d) psg %d(%d)\n",
581                                        bi_size, bio->bi_vcnt, bio->bi_max_vecs,
582                                        bio_sectors(bio),
583                                        queue_max_sectors(q),
584                                        osd_bio_nr_segs(bio),
585                                        queue_max_segments(q));
586                                 rc = osd_bio_integrity_handle(osd, bio,
587                                         iobuf, bio_start_page_idx,
588                                         fault_inject, integrity_enabled);
589                                 if (rc) {
590                                         bio_put(bio);
591                                         goto out;
592                                 }
593
594                                 record_start_io(iobuf, bi_size);
595                                 osd_submit_bio(iobuf->dr_rw, bio);
596                         }
597
598                         bio_start_page_idx = page_idx;
599                         /* allocate new bio */
600                         bio = bio_alloc(GFP_NOIO, min(BIO_MAX_PAGES,
601                                         (block_idx_end - block_idx +
602                                          blocks_left_page - 1)));
603                         if (bio == NULL) {
604                                 CERROR("Can't allocate bio %u pages\n",
605                                        block_idx_end - block_idx +
606                                        blocks_left_page - 1);
607                                 rc = -ENOMEM;
608                                 goto out;
609                         }
610
611                         bio_set_dev(bio, bdev);
612                         bio_set_sector(bio, sector);
613                         bio->bi_opf = iobuf->dr_rw ? WRITE : READ;
614                         rc = osd_bio_init(bio, iobuf, integrity_enabled,
615                                           bio_start_page_idx, &bio_private);
616                         if (rc) {
617                                 bio_put(bio);
618                                 goto out;
619                         }
620
621                         rc = bio_add_page(bio, page,
622                                           blocksize * nblocks, page_offset);
623                         LASSERT(rc != 0);
624                 }
625         }
626
627         if (bio != NULL) {
628                 rc = osd_bio_integrity_handle(osd, bio, iobuf,
629                                               bio_start_page_idx,
630                                               fault_inject,
631                                               integrity_enabled);
632                 if (rc) {
633                         bio_put(bio);
634                         goto out;
635                 }
636
637                 record_start_io(iobuf, bio_sectors(bio) << 9);
638                 osd_submit_bio(iobuf->dr_rw, bio);
639                 rc = 0;
640         }
641
642 out:
643         blk_finish_plug(&plug);
644
645         /* In order to achieve better IO throughput, we don't wait for write
646          * completion here. Instead we proceed with the transaction commit in
647          * parallel and wait for IO completion once the transaction is stopped;
648          * see osd_trans_stop() for more details -bzzz
649          */
650         if (iobuf->dr_rw == 0 || fault_inject) {
651                 wait_event(iobuf->dr_wait,
652                            atomic_read(&iobuf->dr_numreqs) == 0);
653                 osd_fini_iobuf(osd, iobuf);
654         }
655
656         if (rc == 0) {
657                 rc = iobuf->dr_error;
658         } else {
659                 if (bio_private)
660                         OBD_FREE_PTR(bio_private);
661         }
662
663         /* mark pages as done for successful writes only */
664         if (rc == 0 && iobuf->dr_rw)
665                 osd_mark_page_io_done(iobuf, inode,
666                                       start_blocks, count);
667
668         RETURN(rc);
669 }
670
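/*
 * Split the byte range [offset, offset + len) into per-page
 * niobuf_local entries; returns -EOVERFLOW if more than @maxlnb
 * entries would be needed.
 */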
671 static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages,
672                                    struct niobuf_local *lnb, int maxlnb)
673 {
674         int rc = 0;
675         ENTRY;
676
677         *nrpages = 0;
678
679         while (len > 0) {
680                 int poff = offset & (PAGE_SIZE - 1);
681                 int plen = PAGE_SIZE - poff;
682
683                 if (*nrpages >= maxlnb) {
684                         rc = -EOVERFLOW;
685                         break;
686                 }
687
688                 if (plen > len)
689                         plen = len;
690                 lnb->lnb_file_offset = offset;
691                 lnb->lnb_page_offset = poff;
692                 lnb->lnb_len = plen;
693                 /* lnb->lnb_flags = rnb->rnb_flags; */
694                 lnb->lnb_flags = 0;
695                 lnb->lnb_page = NULL;
696                 lnb->lnb_rc = 0;
697                 lnb->lnb_guard_rpc = 0;
698                 lnb->lnb_guard_disk = 0;
699                 lnb->lnb_locked = 0;
700
701                 LASSERTF(plen <= len, "plen %u, len %lld\n", plen,
702                          (long long) len);
703                 offset += plen;
704                 len -= plen;
705                 lnb++;
706                 (*nrpages)++;
707         }
708
709         RETURN(rc);
710 }
711
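/*
 * Return a locked page for @offset: from the inode's pagecache when
 * caching is used, otherwise from the per-thread pool of private DIO
 * pages (allocated on demand and tagged with PagePrivate2).
 */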
712 static struct page *osd_get_page(const struct lu_env *env, struct dt_object *dt,
713                                  loff_t offset, gfp_t gfp_mask, bool cache)
714 {
715         struct osd_thread_info *oti = osd_oti_get(env);
716         struct inode *inode = osd_dt_obj(dt)->oo_inode;
717         struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
718         struct page *page;
719         int cur;
720
721         LASSERT(inode);
722
723         if (cache) {
724                 page = find_or_create_page(inode->i_mapping,
725                                            offset >> PAGE_SHIFT, gfp_mask);
726
727                 if (likely(page)) {
728                         LASSERT(!PagePrivate2(page));
729                         wait_on_page_writeback(page);
730                 } else {
731                         lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1);
732                 }
733
734                 return page;
735         }
736
737         if (inode->i_mapping->nrpages) {
738                 /* consult with pagecache, but do not create new pages */
739                 /* this is normally used once */
740                 page = find_lock_page(inode->i_mapping, offset >> PAGE_SHIFT);
741                 if (page) {
742                         wait_on_page_writeback(page);
743                         return page;
744                 }
745         }
746
747         LASSERT(oti->oti_dio_pages);
748         cur = oti->oti_dio_pages_used;
749         page = oti->oti_dio_pages[cur];
750
751         if (unlikely(!page)) {
752                 LASSERT(cur < PTLRPC_MAX_BRW_PAGES);
753                 page = alloc_page(gfp_mask);
754                 if (!page)
755                         return NULL;
756                 oti->oti_dio_pages[cur] = page;
757                 SetPagePrivate2(page);
758                 lock_page(page);
759         }
760
761         ClearPageUptodate(page);
762         page->index = offset >> PAGE_SHIFT;
763         oti->oti_dio_pages_used++;
764
765         return page;
766 }
767
768 /*
769  * there are following "locks":
770  * journal_start
771  * i_mutex
772  * page lock
773  *
774  * osd write path:
775  *  - lock page(s)
776  *  - journal_start
777  *  - truncate_sem
778  *
779  * ext4 vmtruncate:
780  *  - lock pages, unlock
781  *  - journal_start
782  *  - lock partial page
783  *  - i_data_sem
784  *
785  */
786
787 /**
788  * Unlock and release pages loaded by osd_bufs_get()
789  *
790  * Unlock \a npages pages from \a lnb and drop the refcount on them.
791  *
792  * \param env           thread execution environment
793  * \param dt            dt object undergoing IO (OSD object + methods)
794  * \param lnb           array of pages undergoing IO
795  * \param npages        number of pages in \a lnb
796  *
797  * \retval 0            always
798  */
799 static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
800                         struct niobuf_local *lnb, int npages)
801 {
802         struct osd_thread_info *oti = osd_oti_get(env);
803         struct pagevec pvec;
804         int i;
805
806         ll_pagevec_init(&pvec, 0);
807
808         for (i = 0; i < npages; i++) {
809                 struct page *page = lnb[i].lnb_page;
810
811                 if (page == NULL)
812                         continue;
813
814                 /* if the page isn't cached, then reset uptodate
815                  * to prevent reuse
816                  */
817                 if (PagePrivate2(page)) {
818                         oti->oti_dio_pages_used--;
819                 } else {
820                         if (lnb[i].lnb_locked)
821                                 unlock_page(page);
822                         if (pagevec_add(&pvec, page) == 0)
823                                 pagevec_release(&pvec);
824                 }
825
826                 lnb[i].lnb_page = NULL;
827         }
828
829         LASSERTF(oti->oti_dio_pages_used == 0, "%d\n", oti->oti_dio_pages_used);
830
831         /* Release any partial pagevec */
832         pagevec_release(&pvec);
833
834         RETURN(0);
835 }
836
837 /**
838  * Load and lock pages undergoing IO
839  *
840  * Pages as described in the \a lnb array are fetched (from disk or cache)
841  * and locked for IO by the caller.
842  *
843  * DLM locking protects us from write and truncate competing for the same region,
844  * but partial-page truncate can leave dirty pages in the cache for ldiskfs.
845  * It's possible the writeout on such a page is in progress when we access
846  * it. It's also possible that during this writeout we put new (partial) data
847  * into the page, but won't be able to proceed in filter_commitrw_write().
848  * Therefore, just wait for writeout completion as it should be rare enough.
849  *
850  * \param env           thread execution environment
851  * \param dt            dt object undergoing IO (OSD object + methods)
852  * \param pos           byte offset of IO start
853  * \param len           number of bytes of IO
854  * \param lnb           array of extents undergoing IO
855  * \param rw            read or write operation, and other flags
856  * \param maxlnb        maximum number of entries \a lnb can hold
857  *
858  * \retval pages        (zero or more) loaded successfully
859  * \retval -ENOMEM      on memory/page allocation error
860  */
861 static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
862                         loff_t pos, ssize_t len, struct niobuf_local *lnb,
863                         int maxlnb, enum dt_bufs_type rw)
864 {
865         struct osd_thread_info *oti = osd_oti_get(env);
866         struct osd_object *obj = osd_dt_obj(dt);
867         struct osd_device *osd   = osd_obj2dev(obj);
868         int npages, i, iosize, rc = 0;
869         bool cache, write;
870         loff_t fsize;
871         gfp_t gfp_mask;
872
873         LASSERT(obj->oo_inode);
874
875         if (unlikely(obj->oo_destroyed))
876                 RETURN(-ENOENT);
877
878         rc = osd_map_remote_to_local(pos, len, &npages, lnb, maxlnb);
879         if (rc)
880                 RETURN(rc);
881
882         write = rw & DT_BUFS_TYPE_WRITE;
883
884         fsize = lnb[npages - 1].lnb_file_offset + lnb[npages - 1].lnb_len;
885         iosize = fsize - lnb[0].lnb_file_offset;
886         fsize = max(fsize, i_size_read(obj->oo_inode));
887
888         cache = rw & DT_BUFS_TYPE_READAHEAD;
889         if (cache)
890                 goto bypass_checks;
891
892         cache = osd_use_page_cache(osd);
893         while (cache) {
894                 if (write) {
895                         if (!osd->od_writethrough_cache) {
896                                 cache = false;
897                                 break;
898                         }
899                         if (iosize > osd->od_writethrough_max_iosize) {
900                                 cache = false;
901                                 break;
902                         }
903                 } else {
904                         if (!osd->od_read_cache) {
905                                 cache = false;
906                                 break;
907                         }
908                         if (iosize > osd->od_readcache_max_iosize) {
909                                 cache = false;
910                                 break;
911                         }
912                 }
913                 /* don't use cache on large files */
914                 if (osd->od_readcache_max_filesize &&
915                     fsize > osd->od_readcache_max_filesize)
916                         cache = false;
917                 break;
918         }
919
920 bypass_checks:
921         if (!cache && unlikely(!oti->oti_dio_pages)) {
922                 OBD_ALLOC_PTR_ARRAY_LARGE(oti->oti_dio_pages,
923                                           PTLRPC_MAX_BRW_PAGES);
924                 if (!oti->oti_dio_pages)
925                         return -ENOMEM;
926         }
927
928         /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */
929         gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) :
930                                              GFP_HIGHUSER;
931         for (i = 0; i < npages; i++, lnb++) {
932                 lnb->lnb_page = osd_get_page(env, dt, lnb->lnb_file_offset,
933                                              gfp_mask, cache);
934                 if (lnb->lnb_page == NULL)
935                         GOTO(cleanup, rc = -ENOMEM);
936
937                 lnb->lnb_locked = 1;
938                 if (cache)
939                         mark_page_accessed(lnb->lnb_page);
940         }
941
942 #if 0
943         /* XXX: this version doesn't invalidate cached pages, but use them */
944         if (!cache && write && obj->oo_inode->i_mapping->nrpages) {
945                 /* do not allow data aliasing, invalidate pagecache */
946                 /* XXX: can be quite expensive in mixed case */
947                 invalidate_mapping_pages(obj->oo_inode->i_mapping,
948                                 lnb[0].lnb_file_offset >> PAGE_SHIFT,
949                                 lnb[npages - 1].lnb_file_offset >> PAGE_SHIFT);
950         }
951 #endif
952
953         RETURN(i);
954
955 cleanup:
956         if (i > 0)
957                 osd_bufs_put(env, dt, lnb - i, i);
958         return rc;
959 }
960 /* Borrowed from ext4_chunk_trans_blocks() */
961 static int osd_chunk_trans_blocks(struct inode *inode, int nrblocks)
962 {
963         ldiskfs_group_t groups;
964         int gdpblocks;
965         int idxblocks;
966         int depth;
967         int ret;
968
969         depth = ext_depth(inode);
970         idxblocks = depth * 2;
971
972         /*
973          * Now let's see how many group bitmaps and group descriptors need
974          * to account.
975          */
976         groups = idxblocks + 1;
977         gdpblocks = groups;
978         if (groups > LDISKFS_SB(inode->i_sb)->s_groups_count)
979                 groups = LDISKFS_SB(inode->i_sb)->s_groups_count;
980         if (gdpblocks > LDISKFS_SB(inode->i_sb)->s_gdb_count)
981                 gdpblocks = LDISKFS_SB(inode->i_sb)->s_gdb_count;
982
983         /* bitmaps and block group descriptor blocks */
984         ret = idxblocks + groups + gdpblocks;
985
986         /* Blocks for super block, inode, quota and xattr blocks */
987         ret += LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
988
989         return ret;
990 }
991
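/*
 * Make sure @handle has at least @needed credits, extending the
 * running journal handle or restarting it when extension is not
 * possible.
 */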
992 #ifdef HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS
993 static int osd_extend_restart_trans(handle_t *handle, int needed,
994                                     struct inode *inode)
995 {
996         int rc;
997
998         rc = ldiskfs_journal_ensure_credits(handle, needed,
999                 ldiskfs_trans_default_revoke_credits(inode->i_sb));
1000         /* a positive return means the journal has been restarted */
1001         if (rc > 0)
1002                 rc = 0;
1003
1004         return rc;
1005 }
1006 #else
1007 static int osd_extend_restart_trans(handle_t *handle, int needed,
1008                                     struct inode *inode)
1009 {
1010         int rc;
1011
1012         if (ldiskfs_handle_has_enough_credits(handle, needed))
1013                 return 0;
1014         rc = ldiskfs_journal_extend(handle,
1015                                 needed - handle->h_buffer_credits);
1016         if (rc <= 0)
1017                 return rc;
1018
1019         return ldiskfs_journal_restart(handle, needed);
1020 }
1021 #endif /* HAVE_LDISKFS_JOURNAL_ENSURE_CREDITS */
1022
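/*
 * Grow i_size/i_disksize to cover the newly mapped range (clamped to
 * @user_size when it is set) and submit the I/O for the blocks mapped
 * so far.
 */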
1023 static int osd_ldiskfs_map_write(struct inode *inode, struct osd_iobuf *iobuf,
1024                                  struct osd_device *osd, sector_t start_blocks,
1025                                  sector_t count, loff_t *disk_size,
1026                                  __u64 user_size)
1027 {
1028         /* if file has grown, take user_size into account */
1029         if (user_size && *disk_size > user_size)
1030                 *disk_size = user_size;
1031
1032         spin_lock(&inode->i_lock);
1033         if (*disk_size > i_size_read(inode)) {
1034                 i_size_write(inode, *disk_size);
1035                 LDISKFS_I(inode)->i_disksize = *disk_size;
1036                 spin_unlock(&inode->i_lock);
1037                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
1038         } else {
1039                 spin_unlock(&inode->i_lock);
1040         }
1041
1042         /*
1043          * We don't do stats here as in read path because
1044          * write is async: we'll do this in osd_put_bufs()
1045          */
1046         return osd_do_bio(osd, inode, iobuf, start_blocks, count);
1047 }
1048
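/*
 * Return the per-CPU estimate of the typical extent size allocated on
 * this device, initializing it on first access.
 */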
1049 static unsigned int osd_extent_bytes(const struct osd_device *o)
1050 {
1051         unsigned int *extent_bytes_ptr =
1052                         raw_cpu_ptr(o->od_extent_bytes_percpu);
1053
1054         if (likely(*extent_bytes_ptr))
1055                 return *extent_bytes_ptr;
1056
1057         /* initialize on first access or CPU hotplug */
1058         if (!ldiskfs_has_feature_extents(osd_sb(o)))
1059                 *extent_bytes_ptr = 1 << osd_sb(o)->s_blocksize_bits;
1060         else
1061                 *extent_bytes_ptr = OSD_DEFAULT_EXTENT_BYTES;
1062
1063         return *extent_bytes_ptr;
1064 }
1065
1066 #define EXTENT_BYTES_DECAY 64
1067 static void osd_decay_extent_bytes(struct osd_device *osd,
1068                                    unsigned int new_bytes)
1069 {
1070         unsigned int old_bytes;
1071
1072         if (!ldiskfs_has_feature_extents(osd_sb(osd)))
1073                 return;
1074
1075         old_bytes = osd_extent_bytes(osd);
1076         *raw_cpu_ptr(osd->od_extent_bytes_percpu) =
1077                 (old_bytes * (EXTENT_BYTES_DECAY - 1) +
1078                  min(new_bytes, OSD_DEFAULT_EXTENT_BYTES) +
1079                  EXTENT_BYTES_DECAY - 1) / EXTENT_BYTES_DECAY;
1080 }
1081
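/*
 * Map (and, when @create is set, allocate) disk blocks for all pages
 * in @iobuf, walking runs of contiguous page indices and recording the
 * resulting block numbers in iobuf->dr_blocks.  When the declared
 * credits run out this may submit the I/O mapped so far and request a
 * transaction restart via @thandle.
 */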
1082 static int osd_ldiskfs_map_inode_pages(struct inode *inode,
1083                                        struct osd_iobuf *iobuf,
1084                                        struct osd_device *osd,
1085                                        int create, __u64 user_size,
1086                                        int check_credits,
1087                                        struct thandle *thandle)
1088 {
1089         int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
1090         int blocksize = 1 << inode->i_blkbits;
1091         int rc = 0, i = 0, mapped_index = 0;
1092         struct page *fp = NULL;
1093         int clen = 0;
1094         pgoff_t max_page_index;
1095         handle_t *handle = NULL;
1096         sector_t start_blocks = 0, count = 0;
1097         loff_t disk_size = 0;
1098         struct page **page = iobuf->dr_pages;
1099         int pages = iobuf->dr_npages;
1100         sector_t *blocks = iobuf->dr_blocks;
1101         struct niobuf_local *lnb1, *lnb2;
1102         loff_t size1, size2;
1103
1104         max_page_index = inode->i_sb->s_maxbytes >> PAGE_SHIFT;
1105
1106         CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
1107                 inode->i_ino, pages, (*page)->index);
1108
1109         if (create) {
1110                 create = LDISKFS_GET_BLOCKS_CREATE;
1111                 handle = ldiskfs_journal_current_handle();
1112                 LASSERT(handle != NULL);
1113                 rc = osd_attach_jinode(inode);
1114                 if (rc)
1115                         return rc;
1116                 disk_size = i_size_read(inode);
1117                 /* if disk_size is already bigger than specified user_size,
1118                  * ignore user_size
1119                  */
1120                 if (disk_size > user_size)
1121                         user_size = 0;
1122         }
1123         /* pages are already sorted, so we just have to find
1124          * contiguous ranges and process them properly
1125          */
1126         while (i < pages) {
1127                 long blen, total = 0, previous_total = 0;
1128                 struct ldiskfs_map_blocks map = { 0 };
1129
1130                 if (fp == NULL) { /* start new extent */
1131                         fp = *page++;
1132                         clen = 1;
1133                         if (++i != pages)
1134                                 continue;
1135                 } else if (fp->index + clen == (*page)->index) {
1136                         /* continue the extent */
1137                         page++;
1138                         clen++;
1139                         if (++i != pages)
1140                                 continue;
1141                 }
1142                 if (fp->index + clen >= max_page_index)
1143                         GOTO(cleanup, rc = -EFBIG);
1144                 /* process found extent */
1145                 map.m_lblk = fp->index * blocks_per_page;
1146                 map.m_len = blen = clen * blocks_per_page;
1147
1148                 /*
1149                  * For PAGE_SIZE > blocksize block allocation mapping, this
1150                  * first ldiskfs_map_blocks() call only looks up already
1151                  * mapped blocks, records them in iobuf->dr_blocks, and fixes
1152                  * up m_lblk and m_len so the remaining un-allocated blocks
1153                  * get created/mapped by the second ldiskfs_map_blocks().
1154                  *
1155                  * m_lblk should be the first un-allocated block: if m_lblk
1156                  * points at an already allocated block when create = 1,
1157                  * ldiskfs_map_blocks() will just return the already
1158                  * allocated blocks without allocating any of the requested
1159                  * new blocks for the extent. For the PAGE_SIZE = blocksize
1160                  * case, if m_lblk points at an already allocated block it
1161                  * will point at an un-allocated block in the next restarted
1162                  * transaction, because the already mapped block/page will
1163                  * be filtered out of the next restarted transaction via the
1164                  * OBD_BRW_DONE flag in osd_declare_write_commit().
1165                  */
1166                 if (create && PAGE_SIZE > blocksize) {
1167                         /* With flags=0 just for already mapped blocks lookup */
1168                         rc = ldiskfs_map_blocks(handle, inode, &map, 0);
1169                         if (rc > 0 && map.m_flags & LDISKFS_MAP_MAPPED) {
1170                                 for (; total < blen && total < map.m_len;
1171                                                 total++)
1172                                         *(blocks + total) = map.m_pblk + total;
1173
1174                                 /* The extent is already fully mapped */
1175                                 if (total == blen) {
1176                                         rc = 0;
1177                                         goto ext_already_mapped;
1178                                 }
1179                         }
1180                         /*
1181                          * Fixup or reset m_lblk and m_len for un-mapped blocks.
1182                          * The second ldiskfs_map_blocks() will create and map
1183                          * them.
1184                          */
1185                         map.m_lblk = fp->index * blocks_per_page + total;
1186                         map.m_len = blen - total;
1187                 }
1188
1189 cont_map:
1190                 /**
1191                  * We might restart the transaction for block allocations.
1192                  * To preserve data ordered mode, the IO submission, disk
1193                  * size update and block allocations need to be within the
1194                  * same transaction to guarantee consistency.
1195                  */
1196                 if (handle && check_credits) {
1197                         struct osd_thandle *oh;
1198
1199                         LASSERT(thandle != NULL);
1200                         oh = container_of(thandle, struct osd_thandle,
1201                                           ot_super);
1202                         /*
1203                          * only issue IO if a transaction restart is needed;
1204                          * since updating the disk size needs the inode lock,
1205                          * we want to avoid that as much as possible.
1206                          */
1207                         if (oh->oh_declared_ext <= 0) {
1208                                 rc = osd_ldiskfs_map_write(inode,
1209                                         iobuf, osd, start_blocks,
1210                                         count, &disk_size, user_size);
1211                                 if (rc)
1212                                         GOTO(cleanup, rc);
1213                                 thandle->th_restart_tran = 1;
1214                                 GOTO(cleanup, rc = -EAGAIN);
1215                         }
1216
1217                         if (OBD_FAIL_CHECK(OBD_FAIL_OST_RESTART_IO))
1218                                 oh->oh_declared_ext = 0;
1219                         else
1220                                 oh->oh_declared_ext--;
1221                 }
1222                 rc = ldiskfs_map_blocks(handle, inode, &map, create);
1223                 if (rc >= 0) {
1224                         int c = 0;
1225
1226                         for (; total < blen && c < map.m_len; c++, total++) {
1227                                 if (rc == 0) {
1228                                         *(blocks + total) = 0;
1229                                         total++;
1230                                         break;
1231                                 }
1232                                 if ((map.m_flags & LDISKFS_MAP_UNWRITTEN) &&
1233                                     !create) {
1234                                         /* don't try to read allocated, but
1235                                          * unwritten blocks, instead fill the
1236                                          * patches with zeros in osd_do_bio() */
1237                                         *(blocks + total) = 0;
1238                                         continue;
1239                                 }
1240                                 *(blocks + total) = map.m_pblk + c;
1241                                 /* unmap any possible underlying
1242                                  * metadata from the block device
1243                                  * mapping.  b=6998.
1244                                  */
1245                                 if ((map.m_flags & LDISKFS_MAP_NEW) &&
1246                                     create)
1247                                         clean_bdev_aliases(inode->i_sb->s_bdev,
1248                                                            map.m_pblk + c, 1);
1249                         }
1250                         rc = 0;
1251                 }
1252
1253 ext_already_mapped:
1254                 if (rc == 0 && create) {
1255                         count += (total - previous_total);
1256                         mapped_index = (count + blocks_per_page -
1257                                         1) / blocks_per_page - 1;
1258                         lnb1 = iobuf->dr_lnbs[i - clen];
1259                         lnb2 = iobuf->dr_lnbs[mapped_index];
1260                         size1 = lnb1->lnb_file_offset -
1261                                 (lnb1->lnb_file_offset % PAGE_SIZE) +
1262                                 (total << inode->i_blkbits);
1263                         size2 = lnb2->lnb_file_offset + lnb2->lnb_len;
1264
1265                         if (size1 > size2)
1266                                 size1 = size2;
1267                         if (size1 > disk_size)
1268                                 disk_size = size1;
1269                 }
1270
1271                 if (rc == 0 && total < blen) {
1272                         /*
1273                          * decay the extent bytes estimate if we could not
1274                          * allocate the whole extent at once.
1275                          */
1276                         osd_decay_extent_bytes(osd,
1277                                 (total - previous_total) << inode->i_blkbits);
1278                         map.m_lblk = fp->index * blocks_per_page + total;
1279                         map.m_len = blen - total;
1280                         previous_total = total;
1281                         goto cont_map;
1282                 }
1283                 if (rc != 0)
1284                         GOTO(cleanup, rc);
1285                 /*
1286                  * decay the extent bytes estimate if we could allocate
1287                  * a good large extent.
1288                  */
1289                 if (total - previous_total >=
1290                     osd_extent_bytes(osd) >> inode->i_blkbits)
1291                         osd_decay_extent_bytes(osd,
1292                                 (total - previous_total) << inode->i_blkbits);
1293                 /* look for next extent */
1294                 fp = NULL;
1295                 blocks += blocks_per_page * clen;
1296         }
1297 cleanup:
1298         if (rc == 0 && create &&
1299             start_blocks < pages * blocks_per_page) {
1300                 rc = osd_ldiskfs_map_write(inode, iobuf, osd, start_blocks,
1301                                            count, &disk_size, user_size);
1302                 LASSERT(start_blocks + count == pages * blocks_per_page);
1303         }
1304         return rc;
1305 }
1306
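/*
 * Prepare pages for a write: read in partially covered pages so the
 * untouched parts keep their on-disk content, and zero the uncovered
 * parts of pages that lie beyond the current file size.
 */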
1307 static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
1308                           struct niobuf_local *lnb, int npages)
1309 {
1310         struct osd_thread_info *oti   = osd_oti_get(env);
1311         struct osd_iobuf       *iobuf = &oti->oti_iobuf;
1312         struct inode           *inode = osd_dt_obj(dt)->oo_inode;
1313         struct osd_device      *osd   = osd_obj2dev(osd_dt_obj(dt));
1314         ktime_t start, end;
1315         s64 timediff;
1316         ssize_t isize;
1317         __s64  maxidx;
1318         int i, rc = 0;
1319
1320         LASSERT(inode);
1321
1322         rc = osd_init_iobuf(osd, iobuf, 0, npages);
1323         if (unlikely(rc != 0))
1324                 RETURN(rc);
1325
1326         isize = i_size_read(inode);
1327         maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1;
1328
1329         start = ktime_get();
1330         for (i = 0; i < npages; i++) {
1331
1332                 /*
1333                  * till commit, the content of the page is undefined;
1334                  * we'll set it uptodate once the bulk is done. Otherwise
1335                  * subsequent reads can access non-stable data.
1336                  */
1337                 ClearPageUptodate(lnb[i].lnb_page);
1338
1339                 if (lnb[i].lnb_len == PAGE_SIZE)
1340                         continue;
1341
1342                 if (maxidx >= lnb[i].lnb_page->index) {
1343                         osd_iobuf_add_page(iobuf, &lnb[i]);
1344                 } else {
1345                         long off;
1346                         char *p = kmap(lnb[i].lnb_page);
1347
1348                         off = lnb[i].lnb_page_offset;
1349                         if (off)
1350                                 memset(p, 0, off);
1351                         off = (lnb[i].lnb_page_offset + lnb[i].lnb_len) &
1352                               ~PAGE_MASK;
1353                         if (off)
1354                                 memset(p + off, 0, PAGE_SIZE - off);
1355                         kunmap(lnb[i].lnb_page);
1356                 }
1357         }
1358         end = ktime_get();
1359         timediff = ktime_us_delta(end, start);
1360         lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff);
1361
1362         if (iobuf->dr_npages) {
1363                 rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
1364                                                  0, 0, NULL);
1365                 if (likely(rc == 0)) {
1366                         rc = osd_do_bio(osd, inode, iobuf, 0, 0);
1367                         /* do IO stats for preparation reads */
1368                         osd_fini_iobuf(osd, iobuf);
1369                 }
1370         }
1371         RETURN(rc);
1372 }
1373
1374 struct osd_fextent {
1375         sector_t        start;
1376         sector_t        end;
1377         __u32           flags;
1378         unsigned int    mapped:1;
1379 };
1380
1381 static int osd_is_mapped(struct dt_object *dt, __u64 offset,
1382                          struct osd_fextent *cached_extent)
1383 {
1384         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1385         sector_t block = offset >> inode->i_blkbits;
1386         sector_t start;
1387         struct fiemap_extent_info fei = { 0 };
1388         struct fiemap_extent fe = { 0 };
1389         int rc;
1390
1391         if (block >= cached_extent->start && block < cached_extent->end)
1392                 return cached_extent->mapped;
1393
1394         if (i_size_read(inode) == 0)
1395                 return 0;
1396
1397         /* Beyond EOF, must not be mapped */
1398         if (((i_size_read(inode) - 1) >> inode->i_blkbits) < block)
1399                 return 0;
1400
1401         fei.fi_extents_max = 1;
1402         fei.fi_extents_start = &fe;
1403
1404         rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
1405         if (rc != 0)
1406                 return 0;
1407
1408         start = fe.fe_logical >> inode->i_blkbits;
1409         cached_extent->flags = fe.fe_flags;
1410         if (fei.fi_extents_mapped == 0) {
1411                 /* a special case - no extent found at this offset and forward.
1412                  * we can consider this as a hole to EOF. it's safe to cache
1413                  * as other threads can not allocate/punch blocks this thread
1414                  * is working on (LDLM). */
1415                 cached_extent->start = block;
1416                 cached_extent->end = i_size_read(inode) >> inode->i_blkbits;
1417                 cached_extent->mapped = 0;
1418                 return 0;
1419         }
1420
1421         if (start > block) {
1422                 cached_extent->start = block;
1423                 cached_extent->end = start;
1424                 cached_extent->mapped = 0;
1425         } else {
1426                 cached_extent->start = start;
1427                 cached_extent->end = (fe.fe_logical + fe.fe_length) >>
1428                                       inode->i_blkbits;
1429                 cached_extent->mapped = 1;
1430         }
1431
1432         return cached_extent->mapped;
1433 }
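/*
 * A rough example of the caching above, assuming 4KB blocks (i_blkbits = 12):
 * a query at offset 40960 (block 10) that finds an extent at fe_logical =
 * 40960 with fe_length = 8192 caches [start = 10, end = 12, mapped = 1], so
 * the following call for block 11 is answered without another fiemap call.
 */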
1434
1435 #define MAX_EXTENTS_PER_WRITE 100
1436 static int osd_declare_write_commit(const struct lu_env *env,
1437                                     struct dt_object *dt,
1438                                     struct niobuf_local *lnb, int npages,
1439                                     struct thandle *handle)
1440 {
1441         const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
1442         struct inode            *inode = osd_dt_obj(dt)->oo_inode;
1443         struct osd_thandle      *oh;
1444         int                     extents = 0, new_meta = 0;
1445         int                     depth, new_blocks = 0;
1446         int                     i;
1447         int                     dirty_groups = 0;
1448         int                     rc = 0;
1449         int                     credits = 0;
1450         long long               quota_space = 0;
1451         struct osd_fextent      mapped = { 0 }, extent = { 0 };
1452         enum osd_quota_local_flags local_flags = 0;
1453         enum osd_qid_declare_flags declare_flags = OSD_QID_BLK;
1454         unsigned int            extent_bytes;
1455         ENTRY;
1456
1457         LASSERT(handle != NULL);
1458         oh = container_of(handle, struct osd_thandle, ot_super);
1459         LASSERT(oh->ot_handle == NULL);
1460
1461         /*
1462          * We track a decaying average of extent size per filesystem.
1463          * Most of the time it will be 1M; as the filesystem becomes
1464          * heavily fragmented, it can drop to 4K in the worst case.
1465          */
1466         extent_bytes = osd_extent_bytes(osd);
1467         LASSERT(extent_bytes >= osd_sb(osd)->s_blocksize);
1468
1469         /* calculate number of extents (probably better to pass nb) */
1470         for (i = 0; i < npages; i++) {
1471                 /* ignore quota for the whole request if any page is from
1472                  * client cache or written by root.
1473                  *
1474                  * XXX we could handle this on per-lnb basis as done by
1475                  * grant.
1476                  */
1477                 if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
1478                     (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
1479                     !(lnb[i].lnb_flags & OBD_BRW_SYNC))
1480                         declare_flags |= OSD_QID_FORCE;
1481
1482                 /*
1483                  * Converting an unwritten extent might require splitting
1484                  * extents, so we cannot skip it.
1485                  */
1486                 if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
1487                     !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
1488                         lnb[i].lnb_flags |= OBD_BRW_MAPPED;
1489                         continue;
1490                 }
1491
1492                 if (lnb[i].lnb_flags & OBD_BRW_DONE) {
1493                         lnb[i].lnb_flags |= OBD_BRW_MAPPED;
1494                         continue;
1495                 }
1496
1497                 /* count only unmapped changes */
1498                 new_blocks++;
1499                 if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
1500                         if (extent.end != 0)
1501                                 extents += (extent.end - extent.start +
1502                                             extent_bytes - 1) / extent_bytes;
1503                         extent.start = lnb[i].lnb_file_offset;
1504                         extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
1505                 } else {
1506                         extent.end += lnb[i].lnb_len;
1507                 }
1508
1509                 quota_space += PAGE_SIZE;
1510         }
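        /*
         * An illustration of the grouping above: three unmapped 4KB pages at
         * file offsets 0, 4096 and 1MB form two logical extents, [0, 8192)
         * and [1MB, 1MB + 4096); each is charged in units of the decayed
         * extent_bytes when the credit estimate is computed below.
         */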
1511
1512         credits++; /* inode */
1513         /*
1514          * overwrite case: no need to modify the tree or
1515          * allocate blocks.
1516          */
1517         if (!extent.end)
1518                 goto out_declare;
1519
1520         extents += (extent.end - extent.start +
1521                     extent_bytes - 1) / extent_bytes;
1522         /**
1523          * As space usage grows, mballoc stops trying as hard to scan
1524          * block groups for the best-aligned free extent, so the decaying
1525          * average extent size can shrink to a very small value and make
1526          * us reserve too many credits. We can be more optimistic in the
1527          * credit reservation: even when the filesystem is nearly full,
1528          * it is extremely unlikely that the worst case would ever be
1529          * hit.
1530          */
1531         if (extents > MAX_EXTENTS_PER_WRITE)
1532                 extents = MAX_EXTENTS_PER_WRITE;
1533
1534         /**
1535          * If we add a single extent, then in the worst case each tree
1536          * level index/leaf needs to be changed if the tree splits.
1537          * If more extents are inserted, they could cause the whole tree
1538          * to split more than once, but this is really rare.
1539          */
1540         if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
1541                 /*
1542                  * many concurrent threads may grow the tree by the time
1543                  * our transaction starts, so consider 2 as the minimum depth.
1544                  */
1545                 depth = ext_depth(inode);
1546                 depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
1547                 if (extents <= 1) {
1548                         credits += depth * 2 * extents;
1549                         new_meta = depth;
1550                 } else {
1551                         credits += depth * 3 * extents;
1552                         new_meta = depth * 2 * extents;
1553                 }
1554         } else {
1555                 /*
1556                  * With N contiguous data blocks, we need at most
1557                  * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
1558                  * 2 dindirect blocks, and 1 tindirect block
1559                  */
1560                 new_meta = DIV_ROUND_UP(new_blocks,
1561                                 LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
1562                 credits += new_meta;
1563         }
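        /*
         * A rough illustration of the extent case above: with ext_depth() == 1
         * (clamped to depth = 2) and 5 declared extents, the reservation is
         * credits += 2 * 3 * 5 = 30 and new_meta = 2 * 2 * 5 = 20; the exact
         * numbers depend on the tree depth at declaration time.
         */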
1564         dirty_groups += (extents + new_meta);
1565
1566         oh->oh_declared_ext = extents;
1567
1568         /* quota space for metadata blocks */
1569         quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
1570
1571         /* quota space should be reported in 1K blocks */
1572         quota_space = toqb(quota_space);
1573
1574         /* each new block can go in a different group (bitmap + gd) */
1575
1576         /* we can't dirty more bitmap blocks than exist */
1577         if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
1578                 credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
1579         else
1580                 credits += dirty_groups;
1581
1582         /* we can't dirty more gd blocks than exist */
1583         if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
1584                 credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
1585         else
1586                 credits += dirty_groups;
1587
1588         CDEBUG(D_INODE,
1589                "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
1590                osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
1591                credits);
1592
1593 out_declare:
1594         osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
1595
1596         /* make sure the over quota flags were not set */
1597         lnb[0].lnb_flags &= ~OBD_BRW_OVER_ALLQUOTA;
1598
1599         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
1600                                    i_projid_read(inode), quota_space, oh,
1601                                    osd_dt_obj(dt), &local_flags, declare_flags);
1602
1603         /* we only need to store the overquota flags in the first lnb for
1604          * now; once we support multiple-object BRW, this code needs to be
1605          * revised.
1606          */
1607         if (local_flags & QUOTA_FL_OVER_USRQUOTA)
1608                 lnb[0].lnb_flags |= OBD_BRW_OVER_USRQUOTA;
1609         if (local_flags & QUOTA_FL_OVER_GRPQUOTA)
1610                 lnb[0].lnb_flags |= OBD_BRW_OVER_GRPQUOTA;
1611         if (local_flags & QUOTA_FL_OVER_PRJQUOTA)
1612                 lnb[0].lnb_flags |= OBD_BRW_OVER_PRJQUOTA;
1613
1614         if (rc == 0)
1615                 rc = osd_trunc_lock(osd_dt_obj(dt), oh, true);
1616
1617         RETURN(rc);
1618 }
1619
1620 /* Write the prepared pages to the object within the current transaction */
1621 static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
1622                             struct niobuf_local *lnb, int npages,
1623                             struct thandle *thandle, __u64 user_size)
1624 {
1625         struct osd_thread_info *oti = osd_oti_get(env);
1626         struct osd_iobuf *iobuf = &oti->oti_iobuf;
1627         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1628         struct osd_device  *osd = osd_obj2dev(osd_dt_obj(dt));
1629         int rc = 0, i, check_credits = 0;
1630
1631         LASSERT(inode);
1632
1633         rc = osd_init_iobuf(osd, iobuf, 1, npages);
1634         if (unlikely(rc != 0))
1635                 RETURN(rc);
1636
1637         dquot_initialize(inode);
1638
1639         for (i = 0; i < npages; i++) {
1640                 if (lnb[i].lnb_rc == -ENOSPC &&
1641                     (lnb[i].lnb_flags & OBD_BRW_MAPPED)) {
1642                         /* Allow the write to proceed if overwriting an
1643                          * existing block
1644                          */
1645                         lnb[i].lnb_rc = 0;
1646                 }
1647
1648                 if (lnb[i].lnb_rc) { /* ENOSPC, network RPC error, etc. */
1649                         CDEBUG(D_INODE, "Skipping [%d] == %d\n", i,
1650                                lnb[i].lnb_rc);
1651                         LASSERT(lnb[i].lnb_page);
1652                         generic_error_remove_page(inode->i_mapping,
1653                                                   lnb[i].lnb_page);
1654                         continue;
1655                 }
1656
1657                 if (lnb[i].lnb_flags & OBD_BRW_DONE)
1658                         continue;
1659
1660                 if (!(lnb[i].lnb_flags & OBD_BRW_MAPPED))
1661                         check_credits = 1;
1662
1663                 LASSERT(PageLocked(lnb[i].lnb_page));
1664                 LASSERT(!PageWriteback(lnb[i].lnb_page));
1665
1666                 /*
1667                  * Since write and truncate are serialized by oo_sem, even
1668                  * partial-page truncate should not leave dirty pages in the
1669                  * page cache.
1670                  */
1671                 LASSERT(!PageDirty(lnb[i].lnb_page));
1672
1673                 SetPageUptodate(lnb[i].lnb_page);
1674
1675                 osd_iobuf_add_page(iobuf, &lnb[i]);
1676         }
1677
1678         osd_trans_exec_op(env, thandle, OSD_OT_WRITE);
1679
1680         if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) {
1681                 rc = -ENOSPC;
1682         } else if (iobuf->dr_npages > 0) {
1683                 rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd,
1684                                                  1, user_size,
1685                                                  check_credits,
1686                                                  thandle);
1687         } else {
1688                 /* no pages to write, no transno is needed */
1689                 thandle->th_local = 1;
1690         }
1691
1692         if (rc != 0 && !thandle->th_restart_tran)
1693                 osd_fini_iobuf(osd, iobuf);
1694
1695         osd_trans_exec_check(env, thandle, OSD_OT_WRITE);
1696
1697         if (unlikely(rc != 0 && !thandle->th_restart_tran)) {
1698                 /* if write fails, we should drop pages from the cache */
1699                 for (i = 0; i < npages; i++) {
1700                         if (lnb[i].lnb_page == NULL)
1701                                 continue;
1702                         if (!PagePrivate2(lnb[i].lnb_page)) {
1703                                 LASSERT(PageLocked(lnb[i].lnb_page));
1704                                 generic_error_remove_page(inode->i_mapping,
1705                                                           lnb[i].lnb_page);
1706                         }
1707                 }
1708         }
1709
1710         RETURN(rc);
1711 }
1712
1713 static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
1714                          struct niobuf_local *lnb, int npages)
1715 {
1716         struct osd_thread_info *oti = osd_oti_get(env);
1717         struct osd_iobuf *iobuf = &oti->oti_iobuf;
1718         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1719         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
1720         int rc = 0, i, cache_hits = 0, cache_misses = 0;
1721         ktime_t start, end;
1722         s64 timediff;
1723         loff_t isize;
1724
1725         LASSERT(inode);
1726
1727         rc = osd_init_iobuf(osd, iobuf, 0, npages);
1728         if (unlikely(rc != 0))
1729                 RETURN(rc);
1730
1731         isize = i_size_read(inode);
1732
1733         start = ktime_get();
1734         for (i = 0; i < npages; i++) {
1735
1736                 if (isize <= lnb[i].lnb_file_offset)
1737                         /* If there's no more data, abort early.
1738                          * lnb->lnb_rc == 0, so it's easy to detect later.
1739                          */
1740                         break;
1741
1742                 /* instead of checking whether we go beyond isize, send
1743                  * complete pages all the time
1744                  */
1745                 lnb[i].lnb_rc = lnb[i].lnb_len;
1746
1747                 /* Bypass disk read if fail_loc is set properly */
1748                 if (OBD_FAIL_CHECK_QUIET(OBD_FAIL_OST_FAKE_RW))
1749                         SetPageUptodate(lnb[i].lnb_page);
1750
1751                 if (PageUptodate(lnb[i].lnb_page)) {
1752                         cache_hits++;
1753                         unlock_page(lnb[i].lnb_page);
1754                 } else {
1755                         cache_misses++;
1756                         osd_iobuf_add_page(iobuf, &lnb[i]);
1757                 }
1758                 /* no need to unlock in osd_bufs_put(): the sooner a page is
1759                  * unlocked, the earlier another client can access it.
1760                  * note the real unlock_page() can be called a few lines
1761                  * below, after osd_do_bio(). lnb is per-thread, so it's
1762                  * fine to have PG_locked and lnb_locked inconsistent here
1763                  */
1764                 lnb[i].lnb_locked = 0;
1765         }
1766         end = ktime_get();
1767         timediff = ktime_us_delta(end, start);
1768         lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff);
1769
1770         if (cache_hits != 0)
1771                 lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_HIT,
1772                                     cache_hits);
1773         if (cache_misses != 0)
1774                 lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_MISS,
1775                                     cache_misses);
1776         if (cache_hits + cache_misses != 0)
1777                 lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_ACCESS,
1778                                     cache_hits + cache_misses);
1779
1780         if (iobuf->dr_npages) {
1781                 rc = osd_ldiskfs_map_inode_pages(inode, iobuf, osd, 0,
1782                                                  0, 0, NULL);
1783                 if (!rc)
1784                         rc = osd_do_bio(osd, inode, iobuf, 0, 0);
1785
1786                 /* IO stats will be done in osd_bufs_put() */
1787
1788                 /* early release to let others read data during the bulk */
1789                 for (i = 0; i < iobuf->dr_npages; i++) {
1790                         LASSERT(PageLocked(iobuf->dr_pages[i]));
1791                         if (!PagePrivate2(iobuf->dr_pages[i]))
1792                                 unlock_page(iobuf->dr_pages[i]);
1793                 }
1794         }
1795
1796         RETURN(rc);
1797 }
1798
1799 /*
1800  * XXX: Another layering violation for now.
1801  *
1802  * We don't want to use the generic ->f_op methods, because generic file write
1803  *
1804  *         - serializes on ->i_sem, and
1805  *
1806  *         - does a lot of extra work like balance_dirty_pages(),
1807  *
1808  * which doesn't work for globally shared files like /last_rcvd.
1809  */
1810 static int osd_ldiskfs_readlink(struct inode *inode, char *buffer, int buflen)
1811 {
1812         struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
1813
1814         memcpy(buffer, (char *)ei->i_data, buflen);
1815
1816         return  buflen;
1817 }
1818
1819 int osd_ldiskfs_read(struct inode *inode, void *buf, int size, loff_t *offs)
1820 {
1821         struct buffer_head *bh;
1822         unsigned long block;
1823         int osize;
1824         int blocksize;
1825         int csize;
1826         int boffs;
1827
1828         /* prevent reading after eof */
1829         spin_lock(&inode->i_lock);
1830         if (i_size_read(inode) < *offs + size) {
1831                 loff_t diff = i_size_read(inode) - *offs;
1832
1833                 spin_unlock(&inode->i_lock);
1834                 if (diff < 0) {
1835                         CDEBUG(D_OTHER,
1836                                "size %llu is too short to read @%llu\n",
1837                                i_size_read(inode), *offs);
1838                         return -EBADR;
1839                 } else if (diff == 0) {
1840                         return 0;
1841                 } else {
1842                         size = diff;
1843                 }
1844         } else {
1845                 spin_unlock(&inode->i_lock);
1846         }
1847
1848         blocksize = 1 << inode->i_blkbits;
1849         osize = size;
1850         while (size > 0) {
1851                 block = *offs >> inode->i_blkbits;
1852                 boffs = *offs & (blocksize - 1);
1853                 csize = min(blocksize - boffs, size);
1854                 bh = __ldiskfs_bread(NULL, inode, block, 0);
1855                 if (IS_ERR(bh)) {
1856                         CERROR("%s: can't read %u@%llu on ino %lu: rc = %ld\n",
1857                                osd_ino2name(inode), csize, *offs, inode->i_ino,
1858                                PTR_ERR(bh));
1859                         return PTR_ERR(bh);
1860                 }
1861
1862                 if (bh != NULL) {
1863                         memcpy(buf, bh->b_data + boffs, csize);
1864                         brelse(bh);
1865                 } else {
1866                         memset(buf, 0, csize);
1867                 }
1868
1869                 *offs += csize;
1870                 buf += csize;
1871                 size -= csize;
1872         }
1873         return osize;
1874 }
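/*
 * For example, with a 4KB block size and a file that extends past offset
 * 4190, a read of 100 bytes at *offs = 4090 takes two loop iterations:
 * 6 bytes from the end of block 0 and the remaining 94 bytes from the start
 * of block 1, returning osize = 100.
 */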
1875
1876 static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
1877                         struct lu_buf *buf, loff_t *pos)
1878 {
1879         struct inode *inode = osd_dt_obj(dt)->oo_inode;
1880         int rc;
1881
1882         /* Read small symlink from inode body as we need to maintain correct
1883          * on-disk symlinks for ldiskfs.
1884          */
1885         if (S_ISLNK(dt->do_lu.lo_header->loh_attr)) {
1886                 loff_t size = i_size_read(inode);
1887
1888                 if (buf->lb_len < size)
1889                         return -EOVERFLOW;
1890
1891                 if (size < sizeof(LDISKFS_I(inode)->i_data))
1892                         rc = osd_ldiskfs_readlink(inode, buf->lb_buf, size);
1893                 else
1894                         rc = osd_ldiskfs_read(inode, buf->lb_buf, size, pos);
1895         } else {
1896                 rc = osd_ldiskfs_read(inode, buf->lb_buf, buf->lb_len, pos);
1897         }
1898
1899         return rc;
1900 }
1901
1902 static inline int osd_extents_enabled(struct super_block *sb,
1903                                       struct inode *inode)
1904 {
1905         if (inode != NULL) {
1906                 if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL)
1907                         return 1;
1908         } else if (ldiskfs_has_feature_extents(sb)) {
1909                 return 1;
1910         }
1911         return 0;
1912 }
1913
1914 int osd_calc_bkmap_credits(struct super_block *sb, struct inode *inode,
1915                            const loff_t size, const loff_t pos,
1916                            const int blocks)
1917 {
1918         int credits, bits, bs, i;
1919
1920         bits = sb->s_blocksize_bits;
1921         bs = 1 << bits;
1922
1923         /* legacy blockmap: 3 levels * 3 (bitmap,gd,itself)
1924          * we do not expect blockmaps on large files,
1925          * so let's shrink it to 2 levels (4GB files)
1926          */
1927
1928         /* this is default reservation: 2 levels */
1929         credits = (blocks + 2) * 3;
1930
1931         /* actual offset is unknown, hard to optimize */
1932         if (pos == -1)
1933                 return credits;
1934
1935         /* now check a few specific cases to optimize */
1936         if (pos + size <= LDISKFS_NDIR_BLOCKS * bs) {
1937                 /* no indirects */
1938                 credits = blocks;
1939                 /* allocate if not allocated */
1940                 if (inode == NULL) {
1941                         credits += blocks * 2;
1942                         return credits;
1943                 }
1944                 for (i = (pos >> bits); i < (pos >> bits) + blocks; i++) {
1945                         LASSERT(i < LDISKFS_NDIR_BLOCKS);
1946                         if (LDISKFS_I(inode)->i_data[i] == 0)
1947                                 credits += 2;
1948                 }
1949         } else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) {
1950                 /* single indirect */
1951                 credits = blocks * 3;
1952                 if (inode == NULL ||
1953                     LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK] == 0)
1954                         credits += 3;
1955                 else
1956                         /* The indirect block may be modified. */
1957                         credits += 1;
1958         }
1959
1960         return credits;
1961 }
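/*
 * For instance, writing 4 blocks at an unknown offset (pos == -1) reserves
 * the default (4 + 2) * 3 = 18 credits, while 4 blocks that all fall within
 * the first LDISKFS_NDIR_BLOCKS of an existing, fully-allocated inode make
 * this helper return only blocks = 4 credits, since no indirect blocks are
 * involved.
 */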
1962
1963 static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
1964                                  const struct lu_buf *buf, loff_t _pos,
1965                                  struct thandle *handle)
1966 {
1967         struct osd_object  *obj  = osd_dt_obj(dt);
1968         struct inode       *inode = obj->oo_inode;
1969         struct super_block *sb = osd_sb(osd_obj2dev(obj));
1970         struct osd_thandle *oh;
1971         int                 rc = 0, est = 0, credits, blocks, allocated = 0;
1972         int                 bits, bs;
1973         int                 depth, size;
1974         loff_t              pos;
1975         ENTRY;
1976
1977         LASSERT(buf != NULL);
1978         LASSERT(handle != NULL);
1979
1980         oh = container_of(handle, struct osd_thandle, ot_super);
1981         LASSERT(oh->ot_handle == NULL);
1982
1983         size = buf->lb_len;
1984         bits = sb->s_blocksize_bits;
1985         bs = 1 << bits;
1986
1987         if (_pos == -1) {
1988                 /* if this is an append, then we
1989                  * should expect a cross-block record
1990                  */
1991                 pos = 0;
1992         } else {
1993                 pos = _pos;
1994         }
1995
1996         /* blocks to modify */
1997         blocks = ((pos + size + bs - 1) >> bits) - (pos >> bits);
1998         LASSERT(blocks > 0);
1999
2000         if (inode != NULL && _pos != -1) {
2001                 /* object size in blocks */
2002                 est = (i_size_read(inode) + bs - 1) >> bits;
2003                 allocated = inode->i_blocks >> (bits - 9);
2004                 if (pos + size <= i_size_read(inode) && est <= allocated) {
2005                         /* looks like an overwrite, no need to modify tree */
2006                         credits = blocks;
2007                         /* no need to modify i_size */
2008                         goto out;
2009                 }
2010         }
2011
2012         if (osd_extents_enabled(sb, inode)) {
2013                 /*
2014                  * many concurrent threads may grow the tree by the time
2015                  * our transaction starts, so consider 2 as the minimum depth.
2016                  * for every level we may need to allocate a new block
2017                  * and take some entries from the old one: 3 blocks
2018                  * to allocate (bitmap, gd, itself) + the old block, i.e.
2019                  * 4 per level.
2020                  */
2021                 depth = inode != NULL ? ext_depth(inode) : 0;
2022                 depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH);
2023                 credits = depth;
2024                 /* if this is not an append, a split may need to modify
2025                  * existing blocks, moving entries into the new ones
2026                  */
2027                 if (_pos != -1)
2028                         credits += depth;
2029                 /* blocks to store data: bitmap,gd,itself */
2030                 credits += blocks * 3;
2031         } else {
2032                 credits = osd_calc_bkmap_credits(sb, inode, size, _pos, blocks);
2033         }
2034         /* if inode is created as part of the transaction,
2035          * then it's counted already by the creation method
2036          */
2037         if (inode != NULL)
2038                 credits++;
2039
2040 out:
2041
2042         osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
2043
2044         /* dt_declare_write() is usually called for system objects, such
2045          * as llog or last_rcvd files. We needn't enforce quota on those
2046          * objects, so always set the lqi_space as 0.
2047          */
2048         if (inode != NULL)
2049                 rc = osd_declare_inode_qid(env, i_uid_read(inode),
2050                                            i_gid_read(inode),
2051                                            i_projid_read(inode), 0,
2052                                            oh, obj, NULL, OSD_QID_BLK);
2053
2054         if (rc == 0)
2055                 rc = osd_trunc_lock(obj, oh, true);
2056
2057         RETURN(rc);
2058 }
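/*
 * A rough worked example of the declaration above: appending one 4KB block
 * (_pos == -1) to an extent-based file with ext_depth() == 1 gives
 * depth = min(1 + 3, LDISKFS_MAX_EXTENT_DEPTH) = 4, so
 * credits = 4 + 1 * 3 + 1 (inode) = 8; a plain overwrite within i_size of a
 * fully-allocated file short-circuits to credits = blocks = 1.
 */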
2059
2060 static int osd_ldiskfs_writelink(struct inode *inode, char *buffer, int buflen)
2061 {
2062         /* LU-2634: clear the extent format for fast symlink */
2063         ldiskfs_clear_inode_flag(inode, LDISKFS_INODE_EXTENTS);
2064
2065         memcpy((char *)&LDISKFS_I(inode)->i_data, (char *)buffer, buflen);
2066         spin_lock(&inode->i_lock);
2067         LDISKFS_I(inode)->i_disksize = buflen;
2068         i_size_write(inode, buflen);
2069         spin_unlock(&inode->i_lock);
2070         osd_dirty_inode(inode, I_DIRTY_DATASYNC);
2071
2072         return 0;
2073 }
2074
2075 static int osd_ldiskfs_write_record(struct dt_object *dt, void *buf,
2076                                     int bufsize, int write_NUL, loff_t *offs,
2077                                     handle_t *handle)
2078 {
2079         struct inode *inode = osd_dt_obj(dt)->oo_inode;
2080         struct buffer_head *bh        = NULL;
2081         loff_t              offset    = *offs;
2082         loff_t              new_size  = i_size_read(inode);
2083         unsigned long       block;
2084         int                 blocksize = 1 << inode->i_blkbits;
2085         struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
2086         int                 err = 0;
2087         int                 size;
2088         int                 boffs;
2089         int                 dirty_inode = 0;
2090         bool create, sparse, sync = false;
2091
2092         if (write_NUL) {
2093                 /*
2094                  * long symlink write does not count the NUL terminator in
2095                  * bufsize, we write it, and the inode's file size does not
2096                  * count the NUL terminator as well.
2097                  */
2098                 ((char *)buf)[bufsize] = '\0';
2099                 ++bufsize;
2100         }
2101
2102         /* only the first flag-set matters */
2103         dirty_inode = !test_and_set_bit(LDISKFS_INODE_JOURNAL_DATA,
2104                                        &ei->i_flags);
2105
2106         /* sparse checking is racy, but sparse files are rare, so leave as is */
2107         sparse = (new_size > 0 && (inode->i_blocks >> (inode->i_blkbits - 9)) <
2108                   ((new_size - 1) >> inode->i_blkbits) + 1);
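        /*
         * E.g. with 4KB blocks, an 8192-byte file needs 2 blocks; if i_blocks
         * reports only 8 512-byte sectors (1 block allocated), 1 < 2 and the
         * file is treated as sparse, forcing the i_append_sem-protected path
         * in the loop below.
         */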
2109
2110         while (bufsize > 0) {
2111                 int credits = handle->h_buffer_credits;
2112                 unsigned long last_block = (new_size == 0) ? 0 :
2113                                            (new_size - 1) >> inode->i_blkbits;
2114
2115                 if (bh)
2116                         brelse(bh);
2117
2118                 block = offset >> inode->i_blkbits;
2119                 boffs = offset & (blocksize - 1);
2120                 size = min(blocksize - boffs, bufsize);
2121                 sync = (block > last_block || new_size == 0 || sparse);
2122
2123                 if (sync)
2124                         down(&ei->i_append_sem);
2125
2126                 bh = __ldiskfs_bread(handle, inode, block, 0);
2127
2128                 if (unlikely(IS_ERR_OR_NULL(bh) && !sync))
2129                         CWARN(
2130                               "%s: adding bh without locking off %llu (block %lu, size %d, offs %llu)\n",
2131                               osd_ino2name(inode),
2132                               offset, block, bufsize, *offs);
2133
2134                 if (IS_ERR_OR_NULL(bh)) {
2135                         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
2136                         int flags = LDISKFS_GET_BLOCKS_CREATE;
2137
2138                         /* while the file system is being mounted, avoid
2139                          * preallocation, otherwise mount can take a long
2140                          * time as the mballoc cache is cold.
2141                          * XXX: this is a workaround until we have a proper
2142                          *      fix in mballoc
2143                          * XXX: works with extent-based files only */
2144                         if (!osd->od_cl_seq)
2145                                 flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE;
2146                         bh = __ldiskfs_bread(handle, inode, block, flags);
2147                         create = true;
2148                 } else {
2149                         if (sync) {
2150                                 up(&ei->i_append_sem);
2151                                 sync = false;
2152                         }
2153                         create = false;
2154                 }
2155                 if (IS_ERR_OR_NULL(bh)) {
2156                         if (bh == NULL) {
2157                                 err = -EIO;
2158                         } else {
2159                                 err = PTR_ERR(bh);
2160                                 bh = NULL;
2161                         }
2162
2163                         CERROR(
2164                                "%s: error reading offset %llu (block %lu, size %d, offs %llu), credits %d/%d: rc = %d\n",
2165                                osd_ino2name(inode), offset, block, bufsize,
2166                                *offs, credits, handle->h_buffer_credits, err);
2167                         break;
2168                 }
2169
2170                 err = ldiskfs_journal_get_write_access(handle, bh);
2171                 if (err) {
2172                         CERROR("journal_get_write_access() returned error %d\n",
2173                                err);
2174                         break;
2175                 }
2176                 LASSERTF(boffs + size <= bh->b_size,
2177                          "boffs %d size %d bh->b_size %lu\n",
2178                          boffs, size, (unsigned long)bh->b_size);
2179                 if (create) {
2180                         memset(bh->b_data, 0, bh->b_size);
2181                         if (sync) {
2182                                 up(&ei->i_append_sem);
2183                                 sync = false;
2184                         }
2185                 }
2186                 memcpy(bh->b_data + boffs, buf, size);
2187                 err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
2188                 if (err)
2189                         break;
2190
2191                 if (offset + size > new_size)
2192                         new_size = offset + size;
2193                 offset += size;
2194                 bufsize -= size;
2195                 buf += size;
2196         }
2197         if (sync)
2198                 up(&ei->i_append_sem);
2199
2200         if (bh)
2201                 brelse(bh);
2202
2203         if (write_NUL)
2204                 --new_size;
2205         /* correct in-core and on-disk sizes */
2206         if (new_size > i_size_read(inode)) {
2207                 spin_lock(&inode->i_lock);
2208                 if (new_size > i_size_read(inode))
2209                         i_size_write(inode, new_size);
2210                 if (i_size_read(inode) > ei->i_disksize) {
2211                         ei->i_disksize = i_size_read(inode);
2212                         dirty_inode = 1;
2213                 }
2214                 spin_unlock(&inode->i_lock);
2215         }
2216         if (dirty_inode)
2217                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
2218
2219         if (err == 0)
2220                 *offs = offset;
2221         return err;
2222 }
2223
2224 static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
2225                          const struct lu_buf *buf, loff_t *pos,
2226                          struct thandle *handle)
2227 {
2228         struct inode            *inode = osd_dt_obj(dt)->oo_inode;
2229         struct osd_thandle      *oh;
2230         ssize_t                 result;
2231         int                     is_link;
2232
2233         LASSERT(dt_object_exists(dt));
2234
2235         LASSERT(handle != NULL);
2236         LASSERT(inode != NULL);
2237         dquot_initialize(inode);
2238
2239         /* XXX: don't check: one declared chunk can be used many times */
2240         /* osd_trans_exec_op(env, handle, OSD_OT_WRITE); */
2241
2242         oh = container_of(handle, struct osd_thandle, ot_super);
2243         LASSERT(oh->ot_handle->h_transaction != NULL);
2244         osd_trans_exec_op(env, handle, OSD_OT_WRITE);
2245
2246         /* Write small symlink to inode body as we need to maintain correct
2247          * on-disk symlinks for ldiskfs.
2248          * Note: the buf->lb_buf contains a NUL terminator while buf->lb_len
2249          * does not count it in.
2250          */
2251         is_link = S_ISLNK(dt->do_lu.lo_header->loh_attr);
2252         if (is_link && (buf->lb_len < sizeof(LDISKFS_I(inode)->i_data)))
2253                 result = osd_ldiskfs_writelink(inode, buf->lb_buf, buf->lb_len);
2254         else
2255                 result = osd_ldiskfs_write_record(dt, buf->lb_buf, buf->lb_len,
2256                                                   is_link, pos, oh->ot_handle);
2257         if (result == 0)
2258                 result = buf->lb_len;
2259
2260         osd_trans_exec_check(env, handle, OSD_OT_WRITE);
2261
2262         return result;
2263 }
2264
2265 static int osd_declare_fallocate(const struct lu_env *env,
2266                                  struct dt_object *dt, __u64 start, __u64 end,
2267                                  int mode, struct thandle *th)
2268 {
2269         struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
2270         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
2271         struct inode *inode = osd_dt_obj(dt)->oo_inode;
2272         long long quota_space = 0;
2273         /* 5 is max tree depth. (inode + 4 index blocks) */
2274         int depth = 5;
2275         int rc;
2276
2277         ENTRY;
2278
2279         /*
2280          * mode == 0 (which is standard prealloc) and PUNCH are supported;
2281          * the rest of the mode options are not supported yet.
2282          */
2283         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2284                 RETURN(-EOPNOTSUPP);
2285
2286         /* disable fallocate completely */
2287         if (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks < 0)
2288                 RETURN(-EOPNOTSUPP);
2289
2290         LASSERT(th);
2291         LASSERT(inode);
2292
2293         if (mode & FALLOC_FL_PUNCH_HOLE) {
2294                 rc = osd_declare_inode_qid(env, i_uid_read(inode),
2295                                            i_gid_read(inode),
2296                                            i_projid_read(inode), 0, oh,
2297                                            osd_dt_obj(dt), NULL, OSD_QID_BLK);
2298                 if (rc == 0)
2299                         rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
2300                 RETURN(rc);
2301         }
2302
2303         /* quota space for metadata blocks;
2304          * an approximate metadata estimate should be good enough.
2305          */
2306         quota_space += PAGE_SIZE;
2307         quota_space += depth * LDISKFS_BLOCK_SIZE(osd_sb(osd));
2308
2309         /* quota space should be reported in 1K blocks */
2310         quota_space = toqb(quota_space) + toqb(end - start) +
2311                       LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
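        /*
         * For example, assuming 4KB blocks and pages, preallocating 1MB
         * charges roughly toqb(4096 + 5 * 4096) + toqb(1MB) = 24 + 1024
         * quota blocks (1KB units) plus LDISKFS_META_TRANS_BLOCKS() for the
         * metadata transaction overhead.
         */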
2312
2313         /* We don't need to reserve credits for the whole fallocate here.
2314          * We reserve space only for metadata; fallocate credits are
2315          * extended as required
2316          */
2317         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
2318                                    i_projid_read(inode), quota_space, oh,
2319                                    osd_dt_obj(dt), NULL, OSD_QID_BLK);
2320         RETURN(rc);
2321 }
2322
2323 static int osd_fallocate_preallocate(const struct lu_env *env,
2324                                      struct dt_object *dt,
2325                                      __u64 start, __u64 end, int mode,
2326                                      struct thandle *th)
2327 {
2328         struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
2329         handle_t *handle = ldiskfs_journal_current_handle();
2330         unsigned int save_credits = oh->ot_credits;
2331         struct osd_object *obj = osd_dt_obj(dt);
2332         struct inode *inode = obj->oo_inode;
2333         struct ldiskfs_map_blocks map;
2334         unsigned int credits;
2335         ldiskfs_lblk_t blen;
2336         ldiskfs_lblk_t boff;
2337         loff_t new_size = 0;
2338         int depth = 0;
2339         int flags;
2340         int rc = 0;
2341
2342         ENTRY;
2343
2344         LASSERT(dt_object_exists(dt));
2345         LASSERT(osd_invariant(obj));
2346         LASSERT(inode != NULL);
2347
2348         CDEBUG(D_INODE, "fallocate: inode #%lu: start %llu end %llu mode %d\n",
2349                inode->i_ino, start, end, mode);
2350
2351         dquot_initialize(inode);
2352
2353         LASSERT(th);
2354
2355         boff = start >> inode->i_blkbits;
2356         blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
2357
2358         /* Create and mark new extents as either zero or unwritten */
2359         flags = (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ||
2360                  !ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)) ?
2361                 LDISKFS_GET_BLOCKS_CREATE_ZERO :
2362                 LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
2363 #ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
2364         if (mode & FALLOC_FL_KEEP_SIZE)
2365                 flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
2366 #endif
2367         inode_lock(inode);
2368
2369         if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) ||
2370             end > LDISKFS_I(inode)->i_disksize)) {
2371                 new_size = end;
2372                 rc = inode_newsize_ok(inode, new_size);
2373                 if (rc)
2374                         GOTO(out, rc);
2375         }
2376
2377         inode_dio_wait(inode);
2378
2379         map.m_lblk = boff;
2380         map.m_len = blen;
2381
2382         /* Don't normalize the request if it can fit in one extent so
2383          * that it doesn't get unnecessarily split into multiple extents.
2384          */
2385         if (blen <= EXT_UNWRITTEN_MAX_LEN)
2386                 flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE;
2387
2388         /*
2389          * credits to insert 1 extent into extent tree.
2390          */
2391         credits = osd_chunk_trans_blocks(inode, blen);
2392         depth = ext_depth(inode);
2393
2394         while (rc >= 0 && blen) {
2395                 loff_t epos;
2396
2397                 /*
2398                  * Recalculate credits when extent tree depth changes.
2399                  */
2400                 if (depth != ext_depth(inode)) {
2401                         credits = osd_chunk_trans_blocks(inode, blen);
2402                         depth = ext_depth(inode);
2403                 }
2404
2405                 /* TODO: quota check */
2406                 rc = osd_extend_restart_trans(handle, credits, inode);
2407                 if (rc)
2408                         break;
2409
2410                 rc = ldiskfs_map_blocks(handle, inode, &map, flags);
2411                 if (rc <= 0) {
2412                         CDEBUG(D_INODE,
2413                                "inode #%lu: block %u: len %u: ldiskfs_map_blocks returned %d\n",
2414                                inode->i_ino, map.m_lblk, map.m_len, rc);
2415                         ldiskfs_mark_inode_dirty(handle, inode);
2416                         break;
2417                 }
2418
2419                 map.m_lblk += rc;
2420                 map.m_len = blen = blen - rc;
2421                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
2422                 inode->i_ctime = current_time(inode);
2423                 if (new_size) {
2424                         if (epos > end)
2425                                 epos = end;
2426                         if (ldiskfs_update_inode_size(inode, epos) & 0x1)
2427                                 inode->i_mtime = inode->i_ctime;
2428 #ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
2429                 } else {
2430                         if (epos > inode->i_size)
2431                                 ldiskfs_set_inode_flag(inode,
2432                                                        LDISKFS_INODE_EOFBLOCKS);
2433 #endif
2434                 }
2435
2436                 ldiskfs_mark_inode_dirty(handle, inode);
2437         }
2438
2439 out:
2440         /* extend credits if needed for operations such as attribute set */
2441         if (rc >= 0)
2442                 rc = osd_extend_restart_trans(handle, save_credits, inode);
2443
2444         inode_unlock(inode);
2445
2446         RETURN(rc);
2447 }
2448
2449 static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
2450                                __u64 start, __u64 end, int mode,
2451                                struct thandle *th)
2452 {
2453         struct osd_object *obj = osd_dt_obj(dt);
2454         struct inode *inode = obj->oo_inode;
2455         struct osd_access_lock *al;
2456         struct osd_thandle *oh;
2457         int rc = 0, found = 0;
2458
2459         ENTRY;
2460
2461         LASSERT(dt_object_exists(dt));
2462         LASSERT(osd_invariant(obj));
2463         LASSERT(inode != NULL);
2464
2465         dquot_initialize(inode);
2466
2467         LASSERT(th);
2468         oh = container_of(th, struct osd_thandle, ot_super);
2469         LASSERT(oh->ot_handle->h_transaction != NULL);
2470
2471         list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
2472                 if (obj != al->tl_obj)
2473                         continue;
2474                 LASSERT(al->tl_shared == 0);
2475                 found = 1;
2476                 /* do actual punch in osd_trans_stop() */
2477                 al->tl_start = start;
2478                 al->tl_end = end;
2479                 al->tl_mode = mode;
2480                 al->tl_punch = true;
2481                 break;
2482         }
2483
2484         RETURN(rc);
2485 }
2486
2487 static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
2488                          __u64 start, __u64 end, int mode, struct thandle *th)
2489 {
2490         int rc;
2491
2492         ENTRY;
2493
2494         if (mode & FALLOC_FL_PUNCH_HOLE) {
2495                 /* punch */
2496                 rc = osd_fallocate_punch(env, dt, start, end, mode, th);
2497         } else {
2498                 /* standard preallocate */
2499                 rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
2500         }
2501         RETURN(rc);
2502 }
2503
2504 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
2505                              __u64 start, __u64 end, struct thandle *th)
2506 {
2507         struct osd_thandle *oh;
2508         struct inode       *inode;
2509         int                 rc;
2510         ENTRY;
2511
2512         LASSERT(th);
2513         oh = container_of(th, struct osd_thandle, ot_super);
2514
2515         /*
2516          * we don't need to reserve credits for the whole truncate:
2517          * that's not possible anyway, as truncate may need to free too
2518          * many blocks and that won't fit in a single transaction.
2519          * instead we reserve credits to change i_size and put the
2520          * inode onto the orphan list. if needed, truncate will extend
2521          * or restart the transaction
2522          */
2523         osd_trans_declare_op(env, oh, OSD_OT_PUNCH,
2524                              osd_dto_credits_noquota[DTO_ATTR_SET_BASE] + 3);
2525
2526         inode = osd_dt_obj(dt)->oo_inode;
2527         LASSERT(inode);
2528
2529         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
2530                                    i_projid_read(inode), 0, oh, osd_dt_obj(dt),
2531                                    NULL, OSD_QID_BLK);
2532
2533         if (rc == 0)
2534                 rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
2535
2536         RETURN(rc);
2537 }
2538
2539 static int osd_punch(const struct lu_env *env, struct dt_object *dt,
2540                      __u64 start, __u64 end, struct thandle *th)
2541 {
2542         struct osd_object *obj = osd_dt_obj(dt);
2543         struct osd_device *osd = osd_obj2dev(obj);
2544         struct inode *inode = obj->oo_inode;
2545         struct osd_access_lock *al;
2546         struct osd_thandle *oh;
2547         int rc = 0, found = 0;
2548         bool grow = false;
2549         ENTRY;
2550
2551         LASSERT(dt_object_exists(dt));
2552         LASSERT(osd_invariant(obj));
2553         LASSERT(inode != NULL);
2554         dquot_initialize(inode);
2555
2556         LASSERT(th);
2557         oh = container_of(th, struct osd_thandle, ot_super);
2558         LASSERT(oh->ot_handle->h_transaction != NULL);
2559
2560         /* we used to skip truncate to the current size to
2561          * optimize truncates on OST. with DoM we can
2562          * get attr_set to set a specific size (MDS_REINT)
2563          * and then get a truncate RPC which would essentially
2564          * be skipped. this is bad, so disable
2565          * this optimization on MDS until the clients stop
2566          * sending MDS_REINT (LU-11033) -bzzz
2567          */
2568         if (osd->od_is_ost && i_size_read(inode) == start)
2569                 RETURN(0);
2570
2571         osd_trans_exec_op(env, th, OSD_OT_PUNCH);
2572
2573         spin_lock(&inode->i_lock);
2574         if (i_size_read(inode) < start)
2575                 grow = true;
2576         i_size_write(inode, start);
2577         spin_unlock(&inode->i_lock);
2578         /* if object holds encrypted content, we need to make sure we truncate
2579          * on an encryption unit boundary, or subsequent reads will get
2580          * corrupted content
2581          */
2582         if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL &&
2583             start & ~LUSTRE_ENCRYPTION_MASK)
2584                 start = (start & LUSTRE_ENCRYPTION_MASK) +
2585                         LUSTRE_ENCRYPTION_UNIT_SIZE;
2586         ll_truncate_pagecache(inode, start);
2587
2588         /* optimize grow case */
2589         if (grow) {
2590                 osd_execute_truncate(obj);
2591                 GOTO(out, rc);
2592         }
2593
2594         inode_lock(inode);
2595         /* add to the orphan list to ensure truncate completes
2596          * if this transaction succeeds. ldiskfs_truncate()
2597          * will take the inode out of the list
2598          */
2599         rc = ldiskfs_orphan_add(oh->ot_handle, inode);
2600         inode_unlock(inode);
2601         if (rc != 0)
2602                 GOTO(out, rc);
2603
2604         list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
2605                 if (obj != al->tl_obj)
2606                         continue;
2607                 LASSERT(al->tl_shared == 0);
2608                 found = 1;
2609                 /* do actual truncate in osd_trans_stop() */
2610                 al->tl_truncate = 1;
2611                 break;
2612         }
2613         LASSERT(found);
2614
2615 out:
2616         RETURN(rc);
2617 }
2618
2619 static int fiemap_check_ranges(struct inode *inode,
2620                                u64 start, u64 len, u64 *new_len)
2621 {
2622         loff_t maxbytes;
2623
2624         *new_len = len;
2625
2626         if (len == 0)
2627                 return -EINVAL;
2628
2629         if (ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS))
2630                 maxbytes = inode->i_sb->s_maxbytes;
2631         else
2632                 maxbytes = LDISKFS_SB(inode->i_sb)->s_bitmap_maxbytes;
2633
2634         if (start > maxbytes)
2635                 return -EFBIG;
2636
2637         /*
2638          * Shrink request scope to what the fs can actually handle.
2639          */
2640         if (len > maxbytes || (maxbytes - len) < start)
2641                 *new_len = maxbytes - start;
2642
2643         return 0;
2644 }
2645
2646 /* So that the fiemap access checks can't overflow on 32 bit machines. */
2647 #define FIEMAP_MAX_EXTENTS     (UINT_MAX / sizeof(struct fiemap_extent))
2648
2649 static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
2650                           struct fiemap *fm)
2651 {
2652         struct fiemap_extent_info fieinfo = {0, };
2653         struct inode *inode = osd_dt_obj(dt)->oo_inode;
2654         u64 len;
2655         int rc;
2656
2657         LASSERT(inode);
2658         if (inode->i_op->fiemap == NULL)
2659                 return -EOPNOTSUPP;
2660
2661         if (fm->fm_extent_count > FIEMAP_MAX_EXTENTS)
2662                 return -EINVAL;
2663
2664         rc = fiemap_check_ranges(inode, fm->fm_start, fm->fm_length, &len);
2665         if (rc)
2666                 return rc;
2667
2668         fieinfo.fi_flags = fm->fm_flags;
2669         fieinfo.fi_extents_max = fm->fm_extent_count;
2670         fieinfo.fi_extents_start = fm->fm_extents;
2671
2672         if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
2673                 filemap_write_and_wait(inode->i_mapping);
2674
2675         rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
2676         fm->fm_flags = fieinfo.fi_flags;
2677         fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
2678
2679         return rc;
2680 }
2681
2682 static int osd_ladvise(const struct lu_env *env, struct dt_object *dt,
2683                        __u64 start, __u64 end, enum lu_ladvise_type advice)
2684 {
2685         struct osd_object *obj = osd_dt_obj(dt);
2686         int rc = 0;
2687         ENTRY;
2688
2689         switch (advice) {
2690         case LU_LADVISE_DONTNEED:
2691                 if (end)
2692                         invalidate_mapping_pages(obj->oo_inode->i_mapping,
2693                                                  start >> PAGE_SHIFT,
2694                                                  (end - 1) >> PAGE_SHIFT);
2695                 break;
2696         default:
2697                 rc = -ENOTSUPP;
2698                 break;
2699         }
2700
2701         RETURN(rc);
2702 }
2703
2704 static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt,
2705                         loff_t offset, int whence)
2706 {
2707         struct osd_object *obj = osd_dt_obj(dt);
2708         struct osd_device *dev = osd_obj2dev(obj);
2709         struct inode *inode = obj->oo_inode;
2710         struct file *file;
2711         loff_t result;
2712
2713         ENTRY;
2714         LASSERT(dt_object_exists(dt));
2715         LASSERT(osd_invariant(obj));
2716         LASSERT(inode);
2717         LASSERT(offset >= 0);
2718
2719         file = alloc_file_pseudo(inode, dev->od_mnt, "/", O_NOATIME,
2720                                  inode->i_fop);
2721         if (IS_ERR(file))
2722                 RETURN(PTR_ERR(file));
2723
2724         file->f_mode |= FMODE_64BITHASH;
2725         result = file->f_op->llseek(file, offset, whence);
2726         ihold(inode);
2727         fput(file);
2728         /*
2729          * If 'offset' is beyond the end of the object file, treat it not as
2730          * an error but as a valid case for SEEK_HOLE and return 'offset' as
2731          * the result. LOV will decide if it is beyond the real end of file.
2732          */
2733         if (whence == SEEK_HOLE && result == -ENXIO)
2734                 result = offset;
2735
2736         CDEBUG(D_INFO, "seek %s from %lld: %lld\n", whence == SEEK_HOLE ?
2737                        "hole" : "data", offset, result);
2738         RETURN(result);
2739 }
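/*
 * A worked example of the SEEK_HOLE convention above (sizes are illustrative
 * assumptions): if the object's local size is 4096 and the caller asks for
 * osd_lseek(env, dt, 1048576, SEEK_HOLE), the underlying ->llseek() returns
 * -ENXIO because 1048576 is past the local end of file; osd_lseek() converts
 * that into the offset itself (1048576) and lets LOV decide whether the
 * striped file really ends there.  SEEK_DATA past the end of file still
 * returns -ENXIO to the caller.
 */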
2740
2741 /*
2742  * In some cases we may need to declare methods for objects that are still
2743  * being created, e.g., when we create a symlink.
2744  */
2745 const struct dt_body_operations osd_body_ops_new = {
2746         .dbo_declare_write = osd_declare_write,
2747 };
2748
2749 const struct dt_body_operations osd_body_ops = {
2750         .dbo_read                       = osd_read,
2751         .dbo_declare_write              = osd_declare_write,
2752         .dbo_write                      = osd_write,
2753         .dbo_bufs_get                   = osd_bufs_get,
2754         .dbo_bufs_put                   = osd_bufs_put,
2755         .dbo_write_prep                 = osd_write_prep,
2756         .dbo_declare_write_commit       = osd_declare_write_commit,
2757         .dbo_write_commit               = osd_write_commit,
2758         .dbo_read_prep                  = osd_read_prep,
2759         .dbo_declare_punch              = osd_declare_punch,
2760         .dbo_punch                      = osd_punch,
2761         .dbo_fiemap_get                 = osd_fiemap_get,
2762         .dbo_ladvise                    = osd_ladvise,
2763         .dbo_declare_fallocate          = osd_declare_fallocate,
2764         .dbo_fallocate                  = osd_fallocate,
2765         .dbo_lseek                      = osd_lseek,
2766 };
2767
2768 /**
2769  * Get a truncate lock
2770  *
2771  * In order to take a multi-transaction truncate out of the main transaction,
2772  * we let the caller grab a lock on the object passed in. The lock can be
2773  * shared (for writes) or exclusive (for truncate). It is not allowed to mix
2774  * truncate and write in the same transaction handle (not to be confused with
2775  * a big ldiskfs transaction containing lots of handles).
2776  * The lock must be taken at declaration time.
2777  *
2778  * \param obj           object to lock
2779  * \param oh            transaction handle
2780  * \param shared        shared (write) or exclusive (truncate) lock
2781  *
2782  * \retval 0            lock is granted
2783  * \retval -ENOMEM      no memory to allocate the lock
2784  */
2785 int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, bool shared)
2786 {
2787         struct osd_access_lock *al, *tmp;
2788
2789         LASSERT(obj);
2790         LASSERT(oh);
2791
2792         list_for_each_entry(tmp, &oh->ot_trunc_locks, tl_list) {
2793                 if (tmp->tl_obj != obj)
2794                         continue;
2795                 LASSERT(tmp->tl_shared == shared);
2796                 /* found same lock */
2797                 return 0;
2798         }
2799
2800         OBD_ALLOC_PTR(al);
2801         if (unlikely(al == NULL))
2802                 return -ENOMEM;
2803         al->tl_obj = obj;
2804         al->tl_truncate = false;
2805         if (shared)
2806                 down_read(&obj->oo_ext_idx_sem);
2807         else
2808                 down_write(&obj->oo_ext_idx_sem);
2809         al->tl_shared = shared;
2810         lu_object_get(&obj->oo_dt.do_lu);
2811
2812         list_add(&al->tl_list, &oh->ot_trunc_locks);
2813
2814         return 0;
2815 }
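/*
 * A minimal usage sketch for the truncate lock (illustrative only; the
 * declare-time caller and its arguments are assumptions, not code from this
 * file):
 *
 *	declare phase, same thread that will later run the transaction:
 *		rc = osd_trunc_lock(obj, oh, false);	exclusive, for truncate
 *	or
 *		rc = osd_trunc_lock(obj, oh, true);	shared, for writes
 *
 * The call takes the object's oo_ext_idx_sem (down_read() for shared,
 * down_write() for exclusive), grabs a reference on the object and queues
 * the record on oh->ot_trunc_locks; it is released later by
 * osd_trunc_unlock_all() on that same list.  Calling it twice for the same
 * object and transaction is a no-op, but mixing shared and exclusive
 * requests for one object in one handle trips the LASSERT above.
 */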
2816
2817 void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
2818 {
2819         struct osd_access_lock *al, *tmp;
2820
2821         list_for_each_entry_safe(al, tmp, list, tl_list) {
2822                 if (al->tl_shared)
2823                         up_read(&al->tl_obj->oo_ext_idx_sem);
2824                 else
2825                         up_write(&al->tl_obj->oo_ext_idx_sem);
2826                 osd_object_put(env, al->tl_obj);
2827                 list_del(&al->tl_list);
2828                 OBD_FREE_PTR(al);
2829         }
2830 }
2831
2832 /* For a partial-page punch, flush the punch range to disk immediately */
2833 static void osd_partial_page_flush_punch(struct osd_device *d,
2834                                          struct inode *inode, loff_t start,
2835                                          loff_t end)
2836 {
2837         if (osd_use_page_cache(d)) {
2838                 filemap_fdatawrite_range(inode->i_mapping, start, end);
2839         } else {
2840                 /* Note we use the "wait" version to ensure the I/O is complete */
2841                 filemap_write_and_wait_range(inode->i_mapping, start,
2842                                              end);
2843                 invalidate_mapping_pages(inode->i_mapping, start >> PAGE_SHIFT,
2844                                          end >> PAGE_SHIFT);
2845         }
2846 }
2847
2848 /*
2849  * For a partial-page truncate, flush the page to disk immediately to
2850  * avoid data corruption during a later direct disk write.  b=17397
2851  */
2852 static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
2853                                    loff_t offset)
2854 {
2855         if (!(offset & ~PAGE_MASK))
2856                 return;
2857
2858         if (osd_use_page_cache(d)) {
2859                 filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
2860         } else {
2861                 /* Note we use the "wait" version to ensure the I/O is complete */
2862                 filemap_write_and_wait_range(inode->i_mapping, offset,
2863                                              offset + 1);
2864                 invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
2865                                          offset >> PAGE_SHIFT);
2866         }
2867 }
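/*
 * A worked example of the partial-page check above, assuming a 4096-byte
 * PAGE_SIZE (illustrative value): a truncate to size 8192 is page-aligned,
 * so (offset & ~PAGE_MASK) == 0 and nothing needs flushing; a truncate to
 * size 6000 leaves 1904 bytes of the last page in use, so that page is
 * written out (and, without page cache, also invalidated) before any later
 * direct disk write can land on it.
 */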
2868
2869 void osd_execute_truncate(struct osd_object *obj)
2870 {
2871         struct osd_device *d = osd_obj2dev(obj);
2872         struct inode *inode = obj->oo_inode;
2873         __u64 size;
2874
2875         /* simulate a crash before (or in the middle of) the delayed truncate */
2876         if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FAIL_AT_TRUNCATE)) {
2877                 struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
2878                 struct ldiskfs_sb_info *sbi = LDISKFS_SB(inode->i_sb);
2879
2880                 mutex_lock(&sbi->s_orphan_lock);
2881                 list_del_init(&ei->i_orphan);
2882                 mutex_unlock(&sbi->s_orphan_lock);
2883                 return;
2884         }
2885
2886         size = i_size_read(inode);
2887         inode_lock(inode);
2888         /* if the object holds encrypted content, make sure we truncate on an
2889          * encryption unit boundary, or block content will get corrupted
2890          */
2891         if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL &&
2892             size & ~LUSTRE_ENCRYPTION_MASK)
2893                 inode->i_size = (size & LUSTRE_ENCRYPTION_MASK) +
2894                         LUSTRE_ENCRYPTION_UNIT_SIZE;
2895         ldiskfs_truncate(inode);
2896         inode_unlock(inode);
2897         if (inode->i_size != size) {
2898                 spin_lock(&inode->i_lock);
2899                 i_size_write(inode, size);
2900                 LDISKFS_I(inode)->i_disksize = size;
2901                 spin_unlock(&inode->i_lock);
2902                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
2903         }
2904         osd_partial_page_flush(d, inode, size);
2905 }
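/*
 * A worked example of the encryption-boundary handling above, assuming a
 * 4096-byte LUSTRE_ENCRYPTION_UNIT_SIZE (illustrative value): for an
 * encrypted object being truncated to size 5000, 5000 is not a multiple of
 * 4096, so i_size is temporarily raised to 8192 and ldiskfs_truncate() cuts
 * the blocks at that encryption unit boundary; afterwards i_size and
 * i_disksize are set back to 5000 and the inode is marked dirty, so the
 * visible size is unchanged while no encryption unit is left half-truncated.
 */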
2906
2907 static int osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
2908                              loff_t start, loff_t end, int mode)
2909 {
2910         struct osd_device *d = osd_obj2dev(obj);
2911         struct inode *inode = obj->oo_inode;
2912         struct file *file;
2913         int rc;
2914
2915         file = alloc_file_pseudo(inode, d->od_mnt, "/", O_NOATIME,
2916                                  inode->i_fop);
2917         if (IS_ERR(file))
2918                 RETURN(PTR_ERR(file));
2919
2920         file->f_mode |= FMODE_64BITHASH;
2921         rc = file->f_op->fallocate(file, mode, start, end - start);
2922         ihold(inode);
2923         fput(file);
2924         if (rc == 0)
2925                 osd_partial_page_flush_punch(d, inode, start, end - 1);
2926         return rc;
2927 }
2928
2929 int osd_process_truncates(const struct lu_env *env, struct list_head *list)
2930 {
2931         struct osd_access_lock *al;
2932         int rc = 0;
2933
2934         LASSERT(!journal_current_handle());
2935
2936         list_for_each_entry(al, list, tl_list) {
2937                 if (al->tl_shared)
2938                         continue;
2939                 if (al->tl_truncate)
2940                         osd_execute_truncate(al->tl_obj);
2941                 else if (al->tl_punch)
2942                         rc = osd_execute_punch(env, al->tl_obj, al->tl_start,
2943                                                al->tl_end, al->tl_mode);
2944         }
2945
2946         return rc;
2947 }
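/*
 * A sketch of the overall deferred-truncate flow (the exact call sites are
 * assumptions based on the assertion above that no journal handle is held;
 * they are not shown in this file):
 *
 *	declare:	osd_trunc_lock(obj, oh, false);
 *			(the tl_truncate/tl_punch flags are set by the
 *			 truncate/punch paths, not shown here)
 *	execute:	the transaction handle runs and is stopped
 *	after stop:	osd_process_truncates(env, &oh->ot_trunc_locks);
 *			osd_trunc_unlock_all(env, &oh->ot_trunc_locks);
 *
 * osd_process_truncates() only acts on exclusive entries, running either the
 * deferred ldiskfs truncate or the fallocate-based punch outside the journal
 * handle; osd_trunc_unlock_all() then drops the semaphores and the object
 * references taken by osd_trunc_lock().
 */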