4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * Implementation of cl_io for VVP layer.
33 * Author: Nikita Danilov <nikita.danilov@sun.com>
34 * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
37 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <linux/pagevec.h>
41 #include <linux/memcontrol.h>
42 #include <linux/falloc.h>
44 #include "llite_internal.h"
45 #include "vvp_internal.h"
46 #include <lustre_compat.h>
47 #include <libcfs/linux/linux-misc.h>
49 static struct vvp_io *cl2vvp_io(const struct lu_env *env,
50 const struct cl_io_slice *slice)
54 vio = container_of(slice, struct vvp_io, vui_cl);
55 LASSERT(vio == vvp_env_io(env));
60 /* For swapping layouts. The file's layout may have changed.
61 * To avoid populating pages into a wrong stripe, we have to verify the
62 * correctness of the layout. It works because processes swapping layouts
63 * have to acquire the group lock.
65 static bool can_populate_pages(const struct lu_env *env, struct cl_io *io,
68 struct ll_inode_info *lli = ll_i2info(inode);
69 struct vvp_io *vio = vvp_env_io(env);
72 switch (io->ci_type) {
75 /* no lock is needed here to check lli_layout_gen, since we hold the
76 * extent lock and the GROUP lock must be held to swap the layout
78 if (ll_layout_version_get(lli) != vio->vui_layout_gen ||
79 CFS_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) {
80 io->ci_need_restart = 1;
81 /* this will cause a short read/write */
86 /* fault is okay because we already have a page. */
94 static void vvp_object_size_lock(struct cl_object *obj)
96 struct inode *inode = vvp_object_inode(obj);
98 ll_inode_size_lock(inode);
99 cl_object_attr_lock(obj);
102 static void vvp_object_size_unlock(struct cl_object *obj)
104 struct inode *inode = vvp_object_inode(obj);
106 cl_object_attr_unlock(obj);
107 ll_inode_size_unlock(inode);
111 * Helper function that adjusts the file size (inode->i_size), if necessary, when
112 * the position at offset \a pos is accessed. The file size can be arbitrarily
113 * stale on a Lustre client, but the client at least knows the KMS. If the accessed
114 * area is inside [0, KMS], set the file size to KMS; otherwise glimpse the file size.
116 * Locking: i_size_lock is used to serialize changes to inode size and to
117 * protect consistency between inode size and cl_object
118 * attributes. cl_object_size_lock() protects consistency between cl_attr's of
119 * top-object and sub-objects.
121 static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj,
122 struct cl_io *io, loff_t start, size_t bytes,
125 struct cl_attr *attr = vvp_env_thread_attr(env);
126 struct inode *inode = vvp_object_inode(obj);
127 loff_t pos = start + bytes - 1;
132 * Consistency guarantees: following possibilities exist for the
133 * relation between region being accessed and real file size at this
136 * (A): the region is completely inside of the file;
138 * (B-x): x bytes of region are inside of the file, the rest is
141 * (C): the region is completely outside of the file.
143 * This classification is stable under DLM lock already acquired by
144 * the caller, because to change the class, other client has to take
145 * DLM lock conflicting with our lock. Also, any updates to ->i_size
146 * by other threads on this client are serialized by
147 * ll_inode_size_lock(). This guarantees that short reads are handled
148 * correctly in the face of concurrent writes and truncates.
150 vvp_object_size_lock(obj);
151 result = cl_object_attr_get(env, obj, attr);
154 if (pos > kms || !attr->cat_kms_valid) {
156 * A glimpse is necessary to determine whether we
157 * return a short read (B) or some zeroes at the end
160 vvp_object_size_unlock(obj);
161 result = cl_glimpse_lock(env, io, inode, obj, 0);
162 if (result == 0 && exceed != NULL) {
163 /* If the target page index exceeds the end-of-file
164 * page index, return directly. Do not expect the
165 * kernel to check such a case correctly.
167 loff_t size = i_size_read(inode);
168 unsigned long cur_index = start >>
171 if ((size == 0 && cur_index != 0) ||
172 (((size - 1) >> PAGE_SHIFT) <
180 * region is within kms and, hence, within real file
181 * size (A). We need to increase i_size to cover the
182 * read region so that generic_file_read() will do its
183 * job, but that doesn't mean the kms size is
184 * _correct_, it is only the _minimum_ size. If
185 * someone does a stat they will get the correct size
186 * which will always be >= the kms value here.
189 if (i_size_read(inode) < kms) {
190 i_size_write(inode, kms);
192 DFID" updating i_size %llu\n",
193 PFID(lu_object_fid(&obj->co_lu)),
194 (__u64)i_size_read(inode));
198 vvp_object_size_unlock(obj);
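/* An illustrative (hypothetical) example of the vvp_prep_size() logic above:
 * with kms at 1 MiB and a stale i_size of 0, a read of [0, 4096) only bumps
 * i_size up to the 1 MiB kms; a read at offset 2 MiB falls beyond kms, so
 * cl_glimpse_lock() is used to fetch the authoritative size from the OSTs.
 */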
204 static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
205 __u32 enqflags, enum cl_lock_mode mode,
206 pgoff_t start, pgoff_t end)
208 struct vvp_io *vio = vvp_env_io(env);
209 struct cl_lock_descr *descr = &vio->vui_link.cill_descr;
210 struct cl_object *obj = io->ci_obj;
212 CLOBINVRNT(env, obj, vvp_object_invariant(obj));
215 CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
217 memset(&vio->vui_link, 0, sizeof(vio->vui_link));
219 if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
220 descr->cld_mode = CLM_GROUP;
221 descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid;
222 enqflags |= CEF_LOCK_MATCH;
224 descr->cld_mode = mode;
227 descr->cld_obj = obj;
228 descr->cld_start = start;
229 descr->cld_end = end;
230 descr->cld_enq_flags = enqflags;
232 cl_io_lock_add(env, io, &vio->vui_link);
237 static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io,
238 __u32 enqflags, enum cl_lock_mode mode,
239 loff_t start, loff_t end)
241 return vvp_io_one_lock_index(env, io, enqflags, mode,
242 start >> PAGE_SHIFT, end >> PAGE_SHIFT);
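/* Set up the per-iteration write state: an empty queue of pages that
 * vvp_io_write_commit() will later commit, a zeroed written-bytes counter,
 * and the default clip range [vui_from, vui_to) within the first/last page.
 */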
245 static int vvp_io_write_iter_init(const struct lu_env *env,
246 const struct cl_io_slice *ios)
248 struct vvp_io *vio = cl2vvp_io(env, ios);
250 cl_page_list_init(&vio->u.readwrite.vui_queue);
251 vio->u.readwrite.vui_written = 0;
252 vio->u.readwrite.vui_from = 0;
253 vio->u.readwrite.vui_to = PAGE_SIZE;
258 static int vvp_io_read_iter_init(const struct lu_env *env,
259 const struct cl_io_slice *ios)
261 struct vvp_io *vio = cl2vvp_io(env, ios);
263 vio->u.readwrite.vui_read = 0;
268 static void vvp_io_write_iter_fini(const struct lu_env *env,
269 const struct cl_io_slice *ios)
271 struct vvp_io *vio = cl2vvp_io(env, ios);
273 LASSERT(vio->u.readwrite.vui_queue.pl_nr == 0);
276 static int vvp_io_fault_iter_init(const struct lu_env *env,
277 const struct cl_io_slice *ios)
279 struct vvp_io *vio = cl2vvp_io(env, ios);
280 struct inode *inode = vvp_object_inode(ios->cis_obj);
282 LASSERT(inode == file_inode(vio->vui_fd->fd_file));
287 static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
289 struct cl_io *io = ios->cis_io;
290 struct cl_object *obj = io->ci_obj;
291 struct vvp_io *vio = cl2vvp_io(env, ios);
292 struct inode *inode = vvp_object_inode(obj);
298 CLOBINVRNT(env, obj, vvp_object_invariant(obj));
300 CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d need write layout %d, restore needed %d, invalidate_lock %d\n",
301 PFID(lu_object_fid(&obj->co_lu)),
302 io->ci_ignore_layout, io->ci_verify_layout,
303 vio->vui_layout_gen, io->ci_need_write_intent,
304 io->ci_restore_needed, io->ci_invalidate_page_cache);
306 #ifdef HAVE_INVALIDATE_LOCK
307 if (io->ci_invalidate_page_cache) {
308 filemap_invalidate_unlock(inode->i_mapping);
309 io->ci_invalidate_page_cache = 0;
311 #endif /* HAVE_INVALIDATE_LOCK */
313 if (io->ci_restore_needed) {
314 /* the file was detected as released, we need to restore it
315 * before finishing the I/O
317 rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF);
318 /* if restore registration failed, there is no restart and
319 * we will return -ENODATA.
321 * The layout will change after restore, so we need to
322 * block on the layout lock held by the MDT; since the MDT
323 * will not send the new layout in the lvb (see LU-3124),
324 * we have to fetch it explicitly, and all of this will be done
325 * by ll_layout_refresh().
326 * Even if ll_layout_restore() returns zero, it doesn't mean
327 * that the restore has been successful. Therefore it sets
328 * ci_verify_layout so that the layout will be checked at the end
332 io->ci_restore_needed = 1;
333 io->ci_need_restart = 0;
334 io->ci_verify_layout = 0;
339 io->ci_restore_needed = 0;
341 /* Even if ll_layout_restore() returns zero, it doesn't mean
342 * that the restore has been successful. Therefore it should verify
343 * whether there was a layout change and restart the I/O correspondingly.
345 ll_layout_refresh(inode, &gen);
346 io->ci_need_restart = vio->vui_layout_gen != gen;
347 if (io->ci_need_restart) {
349 DFID" layout changed from %d to %d.\n",
350 PFID(lu_object_fid(&obj->co_lu)),
351 vio->vui_layout_gen, gen);
352 /* today successful restore is the only possible case */
353 /* restore was done, clear restoring state */
354 clear_bit(LLIF_FILE_RESTORING,
355 &ll_i2info(vvp_object_inode(obj))->lli_flags);
360 /* dynamic layout change needed, send layout intent RPC. */
361 if (io->ci_need_write_intent || io->ci_need_pccro_clear) {
362 enum layout_intent_opc opc = LAYOUT_INTENT_WRITE;
364 io->ci_need_write_intent = 0;
366 LASSERT(io->ci_type == CIT_WRITE || cl_io_is_fallocate(io) ||
367 cl_io_is_trunc(io) || cl_io_is_mkwrite(io));
369 CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n",
370 PFID(lu_object_fid(&obj->co_lu)), io->ci_type,
371 PEXT(&io->ci_write_intent));
373 if (cl_io_is_trunc(io))
374 opc = LAYOUT_INTENT_TRUNC;
376 if (io->ci_need_pccro_clear) {
377 io->ci_need_pccro_clear = 0;
378 opc = LAYOUT_INTENT_PCCRO_CLEAR;
381 rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent);
384 io->ci_need_restart = 1;
388 if (!io->ci_need_restart &&
389 !io->ci_ignore_layout && io->ci_verify_layout) {
390 /* check layout version */
391 ll_layout_refresh(inode, &gen);
392 io->ci_need_restart = vio->vui_layout_gen != gen;
393 if (io->ci_need_restart) {
395 DFID" layout changed from %d to %d.\n",
396 PFID(lu_object_fid(&obj->co_lu)),
397 vio->vui_layout_gen, gen);
405 static void vvp_io_fault_fini(const struct lu_env *env,
406 const struct cl_io_slice *ios)
408 struct cl_io *io = ios->cis_io;
409 struct cl_page *page = io->u.ci_fault.ft_page;
411 CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj));
414 lu_ref_del(&page->cp_reference, "fault", io);
415 cl_page_put(env, page);
416 io->u.ci_fault.ft_page = NULL;
418 vvp_io_fini(env, ios);
421 static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
423 /* we only want to hold PW locks if the mmap() can generate
424 * writes back to the file and that only happens in shared
427 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
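/* Take DLM extent locks covering any part of the user buffer that is itself
 * mmapped from a Lustre file, so that page faults on that buffer during the
 * copy do not have to enqueue conflicting locks in the middle of this I/O.
 */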
432 static int vvp_mmap_locks(const struct lu_env *env,
433 struct vvp_io *vio, struct cl_io *io)
435 struct vvp_thread_info *vti = vvp_env_info(env);
436 struct mm_struct *mm = current->mm;
437 struct vm_area_struct *vma;
438 struct cl_lock_descr *descr = &vti->vti_descr;
439 union ldlm_policy_data policy;
448 LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
450 /* nfs or loop back device write */
451 if (vio->vui_iter == NULL)
454 /* No MM (e.g. NFS)? Then no VMAs either. */
458 if (!iter_is_iovec(vio->vui_iter) && !iov_iter_is_kvec(vio->vui_iter))
461 for (i = *vio->vui_iter;
463 iov_iter_advance(&i, iov.iov_len)) {
464 iov = iov_iter_iovec(&i);
465 addr = (unsigned long)iov.iov_base;
471 bytes += addr & ~PAGE_MASK;
475 while ((vma = our_vma(mm, addr, bytes)) != NULL) {
476 struct dentry *de = file_dentry(vma->vm_file);
477 struct inode *inode = de->d_inode;
478 int flags = CEF_MUST;
480 if (ll_file_nolock(vma->vm_file)) {
481 /* mmap is not allowed in the nolock case */
487 * XXX: Required lock mode can be weakened: CIT_WRITE
488 * io only ever reads user level buffer, and CIT_READ
491 policy_from_vma(&policy, vma, addr, bytes);
492 descr->cld_mode = vvp_mode_from_vma(vma);
493 descr->cld_obj = ll_i2info(inode)->lli_clob;
494 descr->cld_start = policy.l_extent.start >> PAGE_SHIFT;
495 descr->cld_end = policy.l_extent.end >> PAGE_SHIFT;
496 descr->cld_enq_flags = flags;
497 result = cl_io_lock_alloc_add(env, io, descr);
499 CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
500 descr->cld_mode, descr->cld_start,
506 if (vma->vm_end - addr >= bytes)
509 bytes -= vma->vm_end - addr;
512 mmap_read_unlock(mm);
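/* Called once an iteration has consumed @bytes: keep the user iov_iter in
 * step with the I/O and shrink the remaining total accordingly.
 */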
519 static void vvp_io_advance(const struct lu_env *env,
520 const struct cl_io_slice *ios, size_t bytes)
522 struct cl_object *obj = ios->cis_io->ci_obj;
523 struct vvp_io *vio = cl2vvp_io(env, ios);
525 CLOBINVRNT(env, obj, vvp_object_invariant(obj));
528 * Since kernel 3.16 (commit 26978b8b4) the VFS reverts the iov_iter to its
529 * original position even when the I/O succeeds, so instead
530 * of relying on the VFS, we advance the iov_iter ourselves.
532 iov_iter_advance(vio->vui_iter, bytes);
533 CDEBUG(D_VFSTRACE, "advancing %ld bytes\n", bytes);
534 vio->vui_tot_bytes -= bytes;
535 iov_iter_reexpand(vio->vui_iter, vio->vui_tot_bytes);
538 static void vvp_io_update_iov(const struct lu_env *env,
539 struct vvp_io *vio, struct cl_io *io)
541 size_t size = io->u.ci_rw.crw_bytes;
546 iov_iter_truncate(vio->vui_iter, size);
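/* A minimal sketch (with hypothetical sizes) of how the iterator bookkeeping
 * in vvp_io_update_iov() and vvp_io_advance() fits together:
 *
 *	vui_tot_bytes = 3 * PAGE_SIZE;		total user buffer
 *	crw_bytes = PAGE_SIZE;			this iteration
 *	vvp_io_update_iov():	iov_iter_truncate(iter, PAGE_SIZE);
 *	... the iteration consumes PAGE_SIZE bytes ...
 *	vvp_io_advance():	iov_iter_advance(iter, PAGE_SIZE);
 *				vui_tot_bytes -= PAGE_SIZE;
 *				iov_iter_reexpand(iter, vui_tot_bytes);
 */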
549 static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
550 enum cl_lock_mode mode, loff_t start, loff_t end)
552 struct vvp_io *vio = vvp_env_io(env);
556 LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
559 vvp_io_update_iov(env, vio, io);
561 if (io->u.ci_rw.crw_nonblock)
562 ast_flags |= CEF_NONBLOCK;
563 if (io->ci_lock_no_expand)
564 ast_flags |= CEF_LOCK_NO_EXPAND;
568 /* A group lock being held means no lockless I/O any more */
569 if (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
572 flags = iocb_ki_flags_get(vio->vui_iocb->ki_filp,
574 if (ll_file_nolock(vio->vui_fd->fd_file) ||
575 (iocb_ki_flags_check(flags, DIRECT) &&
577 ast_flags |= CEF_NEVER;
580 result = vvp_mmap_locks(env, vio, io);
582 result = vvp_io_one_lock(env, io, ast_flags, mode, start, end);
587 static int vvp_io_read_lock(const struct lu_env *env,
588 const struct cl_io_slice *ios)
590 struct cl_io *io = ios->cis_io;
591 struct cl_io_rw_common *rd = &io->u.ci_rd.rd;
595 result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos,
596 rd->crw_pos + rd->crw_bytes - 1);
600 static int vvp_io_fault_lock(const struct lu_env *env,
601 const struct cl_io_slice *ios)
603 struct cl_io *io = ios->cis_io;
604 struct vvp_io *vio = cl2vvp_io(env, ios);
605 /* XXX LDLM_FL_CBPENDING */
606 return vvp_io_one_lock_index(env,
608 vvp_mode_from_vma(vio->u.fault.ft_vma),
609 io->u.ci_fault.ft_index,
610 io->u.ci_fault.ft_index);
613 static int vvp_io_write_lock(const struct lu_env *env,
614 const struct cl_io_slice *ios)
616 struct cl_io *io = ios->cis_io;
620 if (io->u.ci_wr.wr_append) {
622 end = OBD_OBJECT_EOF;
624 start = io->u.ci_wr.wr.crw_pos;
625 end = start + io->u.ci_wr.wr.crw_bytes - 1;
628 RETURN(vvp_io_rw_lock(env, io, CLM_WRITE, start, end));
631 static int vvp_io_setattr_iter_init(const struct lu_env *env,
632 const struct cl_io_slice *ios)
639 * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io.
641 * Handles "lockless io" mode when extent locking is done by server.
643 static int vvp_io_setattr_lock(const struct lu_env *env,
644 const struct cl_io_slice *ios)
646 struct cl_io *io = ios->cis_io;
647 __u64 lock_start = 0;
648 __u64 lock_end = OBD_OBJECT_EOF;
651 if (cl_io_is_trunc(io)) {
652 struct inode *inode = vvp_object_inode(io->ci_obj);
654 /* set enqueue flags to CEF_MUST in case of encrypted file,
655 * to prevent lockless truncate
657 if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
659 else if (io->u.ci_setattr.sa_attr.lvb_size == 0)
660 enqflags = CEF_DISCARD_DATA;
661 } else if (cl_io_is_fallocate(io)) {
662 lock_start = io->u.ci_setattr.sa_falloc_offset;
663 lock_end = io->u.ci_setattr.sa_falloc_end - 1;
665 unsigned int valid = io->u.ci_setattr.sa_avalid;
667 if (!(valid & TIMES_SET_FLAGS))
670 if ((!(valid & ATTR_MTIME) ||
671 io->u.ci_setattr.sa_attr.lvb_mtime >=
672 io->u.ci_setattr.sa_attr.lvb_ctime) &&
673 (!(valid & ATTR_ATIME) ||
674 io->u.ci_setattr.sa_attr.lvb_atime >=
675 io->u.ci_setattr.sa_attr.lvb_ctime))
679 return vvp_io_one_lock(env, io, enqflags, CLM_WRITE,
680 lock_start, lock_end);
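/* Apply a new size to the VFS inode: validate it with inode_newsize_ok(),
 * update i_size under ll_inode_size_lock(), and drop page cache beyond the
 * new end of file.
 */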
683 static int vvp_do_vmtruncate(struct inode *inode, size_t size)
687 /* Only ll_inode_size_lock is taken at this level. */
688 ll_inode_size_lock(inode);
689 result = inode_newsize_ok(inode, size);
691 ll_inode_size_unlock(inode);
694 i_size_write(inode, size);
696 ll_truncate_pagecache(inode, size);
697 ll_inode_size_unlock(inode);
701 static int vvp_io_setattr_time(const struct lu_env *env,
702 const struct cl_io_slice *ios)
704 struct cl_io *io = ios->cis_io;
705 struct cl_object *obj = io->ci_obj;
706 struct cl_attr *attr = vvp_env_thread_attr(env);
708 unsigned int valid = CAT_CTIME;
710 cl_object_attr_lock(obj);
711 attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
712 if (io->u.ci_setattr.sa_avalid & ATTR_ATIME_SET) {
713 attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
716 if (io->u.ci_setattr.sa_avalid & ATTR_MTIME_SET) {
717 attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
720 result = cl_object_attr_update(env, obj, attr, valid);
721 cl_object_attr_unlock(obj);
726 static int vvp_io_setattr_start(const struct lu_env *env,
727 const struct cl_io_slice *ios)
729 struct cl_io *io = ios->cis_io;
730 struct inode *inode = vvp_object_inode(io->ci_obj);
731 struct ll_inode_info *lli = ll_i2info(inode);
732 int mode = io->u.ci_setattr.sa_falloc_mode;
734 if (cl_io_is_trunc(io)) {
735 trunc_sem_down_write(&lli->lli_trunc_sem);
736 mutex_lock(&lli->lli_setattr_mutex);
737 inode_dio_wait(inode);
738 } else if (cl_io_is_fallocate(io)) {
741 trunc_sem_down_write(&lli->lli_trunc_sem);
742 mutex_lock(&lli->lli_setattr_mutex);
743 inode_dio_wait(inode);
745 ll_merge_attr(env, inode);
746 size = i_size_read(inode);
747 if (io->u.ci_setattr.sa_falloc_end > size &&
748 !(mode & FALLOC_FL_KEEP_SIZE)) {
749 size = io->u.ci_setattr.sa_falloc_end;
750 io->u.ci_setattr.sa_avalid |= ATTR_SIZE;
752 io->u.ci_setattr.sa_attr.lvb_size = size;
754 mutex_lock(&lli->lli_setattr_mutex);
757 if (io->u.ci_setattr.sa_avalid & TIMES_SET_FLAGS)
758 return vvp_io_setattr_time(env, ios);
763 static void vvp_io_setattr_end(const struct lu_env *env,
764 const struct cl_io_slice *ios)
766 struct cl_io *io = ios->cis_io;
767 struct inode *inode = vvp_object_inode(io->ci_obj);
768 struct ll_inode_info *lli = ll_i2info(inode);
769 loff_t size = io->u.ci_setattr.sa_attr.lvb_size;
771 if (cl_io_is_trunc(io)) {
772 /* Truncate in-memory pages - they must be clean pages
773 * because osc has already been notified to destroy the osc_extents.
775 vvp_do_vmtruncate(inode, size);
776 mutex_unlock(&lli->lli_setattr_mutex);
777 trunc_sem_up_write(&lli->lli_trunc_sem);
779 /* Update size and blocks for LSOM */
780 if (!io->ci_ignore_layout)
781 ll_merge_attr(env, inode);
782 } else if (cl_io_is_fallocate(io)) {
783 int mode = io->u.ci_setattr.sa_falloc_mode;
785 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
786 size > i_size_read(inode)) {
787 ll_inode_size_lock(inode);
788 i_size_write(inode, size);
789 ll_inode_size_unlock(inode);
791 inode_set_ctime_current(inode);
792 mutex_unlock(&lli->lli_setattr_mutex);
793 trunc_sem_up_write(&lli->lli_trunc_sem);
795 mutex_unlock(&lli->lli_setattr_mutex);
799 static void vvp_io_setattr_fini(const struct lu_env *env,
800 const struct cl_io_slice *ios)
802 bool restore_needed = ios->cis_io->ci_restore_needed;
803 struct inode *inode = vvp_object_inode(ios->cis_obj);
805 vvp_io_fini(env, ios);
807 if (restore_needed && !ios->cis_io->ci_restore_needed) {
808 /* restore finished, set data modified flag for HSM */
809 set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags);
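/* One read iteration: take lli_trunc_sem, reserve LRU slots for buffered
 * reads, adjust the locally known size via vvp_prep_size(), set up the
 * read-ahead window, then let generic_file_read_iter() do the copy, retrying
 * if a racing page invalidation produced a short read or -EIO (LU-16160).
 */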
813 static int vvp_io_read_start(const struct lu_env *env,
814 const struct cl_io_slice *ios)
816 struct vvp_io *vio = cl2vvp_io(env, ios);
817 struct cl_io *io = ios->cis_io;
818 struct cl_object *obj = io->ci_obj;
819 struct inode *inode = vvp_object_inode(obj);
820 struct ll_inode_info *lli = ll_i2info(inode);
821 struct file *file = vio->vui_fd->fd_file;
822 loff_t pos = io->u.ci_rd.rd.crw_pos;
823 size_t crw_bytes = io->u.ci_rd.rd.crw_bytes;
824 size_t tot_bytes = vio->vui_tot_bytes;
825 struct ll_cl_context *lcc;
829 int total_bytes_read = 0;
830 struct iov_iter iter;
836 CLOBINVRNT(env, obj, vvp_object_invariant(obj));
838 CDEBUG(D_VFSTRACE, "%s: read [%llu, %llu)\n",
839 file_dentry(file)->d_name.name,
840 pos, pos + crw_bytes);
842 trunc_sem_down_read(&lli->lli_trunc_sem);
844 if (io->ci_async_readahead) {
849 if (!can_populate_pages(env, io, inode))
852 flags = iocb_ki_flags_get(file, vio->vui_iocb);
853 if (!iocb_ki_flags_check(flags, DIRECT)) {
854 result = cl_io_lru_reserve(env, io, pos, crw_bytes);
859 /* Unless this is reading a sparse file, the lock has already
860 * been acquired, so vvp_prep_size() is an empty op.
862 result = vvp_prep_size(env, obj, io, pos, crw_bytes, &exceed);
865 else if (exceed != 0)
868 LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
869 "Read ino %lu, %zu bytes, offset %lld, size %llu\n",
870 inode->i_ino, crw_bytes, pos, i_size_read(inode));
872 /* initialize read-ahead window once per syscall */
873 if (!vio->vui_ra_valid) {
874 vio->vui_ra_valid = true;
875 vio->vui_ra_start_idx = pos >> PAGE_SHIFT;
876 vio->vui_ra_pages = 0;
877 page_offset = pos & ~PAGE_MASK;
880 if (tot_bytes > PAGE_SIZE - page_offset)
881 tot_bytes -= (PAGE_SIZE - page_offset);
885 vio->vui_ra_pages += (tot_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
887 CDEBUG(D_READA, "tot %zu, ra_start %lu, ra_count %lu\n",
888 vio->vui_tot_bytes, vio->vui_ra_start_idx,
894 LASSERT(vio->vui_iocb->ki_pos == pos);
895 iter = *vio->vui_iter;
897 lcc = ll_cl_find(inode);
898 lcc->lcc_end_index = DIV_ROUND_UP(pos + iter.count, PAGE_SIZE);
899 CDEBUG(D_VFSTRACE, "count:%ld iocb pos:%lld\n", iter.count, pos);
901 /* this seqlock lets us notice if a page has been deleted on this inode
902 * during the fault process, allowing us to catch an erroneous short
903 * read or EIO. See LU-16160
906 seq = read_seqbegin(&ll_i2info(inode)->lli_page_inv_lock);
907 result = generic_file_read_iter(vio->vui_iocb, &iter);
909 io->ci_bytes += result;
910 total_bytes_read += result;
912 /* if we got a short read or -EIO and we raced with page invalidation, retry */
913 } while (read_seqretry(&ll_i2info(inode)->lli_page_inv_lock, seq) &&
914 ((result >= 0 && iov_iter_count(&iter) > 0)
919 if (total_bytes_read < crw_bytes)
922 } else if (result == -EIOCBQUEUED) {
923 io->ci_bytes += vio->u.readwrite.vui_read;
924 vio->vui_iocb->ki_pos = pos + vio->u.readwrite.vui_read;
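/* Write the queued pages out synchronously: clip the first and last pages to
 * [from, to), submit them as CRT_WRITE, and return the number of bytes
 * committed (or a negative errno). Used when the async commit path cannot be
 * used, e.g. after -EDQUOT.
 */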
930 static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io,
931 struct cl_page_list *plist, int from, int to)
933 struct cl_2queue *queue = &io->ci_queue;
934 struct cl_page *page;
935 unsigned int bytes = 0;
940 if (plist->pl_nr == 0)
943 if (from > 0 || to != PAGE_SIZE) {
944 page = cl_page_list_first(plist);
945 if (plist->pl_nr == 1) {
946 cl_page_clip(env, page, from, to);
949 cl_page_clip(env, page, from, PAGE_SIZE);
950 if (to != PAGE_SIZE) {
951 page = cl_page_list_last(plist);
952 cl_page_clip(env, page, 0, to);
957 cl_2queue_init(queue);
958 cl_page_list_splice(plist, &queue->c2_qin);
959 rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0);
961 /* plist is not sorted any more */
962 cl_page_list_splice(&queue->c2_qin, plist);
963 cl_page_list_splice(&queue->c2_qout, plist);
964 cl_2queue_fini(env, queue);
967 /* calculate bytes */
968 bytes = plist->pl_nr << PAGE_SHIFT;
969 bytes -= from + PAGE_SIZE - to;
971 while (plist->pl_nr > 0) {
972 page = cl_page_list_first(plist);
973 cl_page_list_del(env, plist, page, true);
975 cl_page_clip(env, page, 0, PAGE_SIZE);
977 SetPageUptodate(cl_page_vmpage(page));
978 cl_page_disown(env, io, page);
980 /* held in ll_cl_init() */
981 lu_ref_del(&page->cp_reference, "cl_io", io);
982 cl_page_put(env, page);
986 RETURN(bytes > 0 ? bytes : rc);
990 * From kernel v4.19-rc5-248-g9b89a0355144 the XArray is used;
991 * prior kernels use the radix_tree for tags
993 static inline void ll_page_tag_dirty(struct page *page,
994 struct address_space *mapping)
996 #ifndef HAVE_RADIX_TREE_TAG_SET
997 __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY);
999 radix_tree_tag_set(&mapping->page_tree, page_index(page),
1000 PAGECACHE_TAG_DIRTY);
1005 * Kernels 4.2 - 4.5 pass memcg argument to account_page_dirtied()
1006 * Kernel v5.2-5678-gac1c3e4 no longer exports account_page_dirtied
1008 static inline void ll_account_page_dirtied(struct page *page,
1009 struct address_space *mapping)
1011 #ifdef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS
1012 struct mem_cgroup *memcg = mem_cgroup_begin_page_stat(page);
1014 account_page_dirtied(page, mapping, memcg);
1015 mem_cgroup_end_page_stat(memcg);
1016 #elif defined(HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT)
1017 account_page_dirtied(page, mapping);
1019 vvp_account_page_dirtied(page, mapping);
1021 ll_page_tag_dirty(page, mapping);
1024 /* Taken from kernel set_page_dirty, __set_page_dirty_nobuffers
1025 * Last change to this area: b93b016313b3ba8003c3b8bb71f569af91f19fc7
1027 * Current with Linus tip of tree (7/13/2019):
1028 * v5.2-rc4-224-ge01e060fe0
1030 * Backwards compat for 3.x, 5.x kernels relating to memcg handling
1031 * & rename of radix tree to xarray.
1033 static void vvp_set_batch_dirty(struct folio_batch *fbatch)
1035 struct page *page = fbatch_at_pg(fbatch, 0, 0);
1036 int count = folio_batch_count(fbatch);
1038 #if !defined(HAVE_FOLIO_BATCH) || defined(HAVE_KALLSYMS_LOOKUP_NAME)
1041 #ifdef HAVE_KALLSYMS_LOOKUP_NAME
1042 struct address_space *mapping = page->mapping;
1043 unsigned long flags;
1044 unsigned long skip_pages = 0;
1051 BUILD_BUG_ON(PAGEVEC_SIZE > BITS_PER_LONG);
1052 LASSERTF(page->mapping,
1053 "mapping must be set. page %px, page->private (cl_page) %px\n",
1054 page, (void *) page->private);
1057 * kernels without HAVE_KALLSYMS_LOOKUP_NAME also don't have
1058 * account_page_dirtied exported, and if we can't access that symbol,
1059 * we can't do page dirtying in batch (taking the xarray lock only once)
1060 * so we just fall back to a looped call to __set_page_dirty_nobuffers
1062 #ifndef HAVE_ACCOUNT_PAGE_DIRTIED_EXPORT
1063 if (!vvp_account_page_dirtied) {
1064 for (i = 0; i < count; i++) {
1065 #ifdef HAVE_FOLIO_BATCH
1066 filemap_dirty_folio(page->mapping, fbatch->folios[i]);
1068 npgs = fbatch_at_npgs(fbatch, i);
1069 for (pg = 0; pg < npgs; pg++) {
1070 page = fbatch_at_pg(fbatch, i, pg);
1071 __set_page_dirty_nobuffers(page);
1079 /* account_page_dirtied is available directly or via kallsyms */
1080 #ifdef HAVE_KALLSYMS_LOOKUP_NAME
1081 for (pgno = i = 0; i < count; i++) {
1082 npgs = fbatch_at_npgs(fbatch, i);
1083 for (pg = 0; pg < npgs; pg++) {
1084 page = fbatch_at_pg(fbatch, i, pg);
1086 ClearPageReclaim(page);
1088 vvp_lock_page_memcg(page);
1089 if (TestSetPageDirty(page)) {
1090 /* page is already dirty, so no extra work is needed;
1091 * set a flag for the i'th page to be skipped
1093 vvp_unlock_page_memcg(page);
1094 skip_pages |= (1ul << pgno++);
1095 LASSERTF(pgno <= BITS_PER_LONG,
1096 "Limit exceeded pgno: %d/%d\n", pgno,
1102 ll_xa_lock_irqsave(&mapping->i_pages, flags);
1104 /* Notes on differences with __set_page_dirty_nobuffers:
1105 * 1. We don't need to call page_mapping because we know this is a page
1107 * 2. We have the pages locked, so there is no need for the careful
1108 * mapping/mapping2 dance.
1109 * 3. No mapping is impossible. (Race w/truncate mentioned in
1110 * dirty_nobuffers should be impossible because we hold the page lock.)
1111 * 4. All mappings are the same because i/o is only to one file.
1113 for (pgno = i = 0; i < count; i++) {
1114 npgs = fbatch_at_npgs(fbatch, i);
1115 for (pg = 0; pg < npgs; pg++) {
1116 page = fbatch_at_pg(fbatch, i, pg);
1117 /* if the i'th page was unlocked above, skip it here */
1118 if ((skip_pages >> pgno++) & 1)
1121 LASSERTF(page->mapping == mapping,
1122 "all pages must have the same mapping. page %px, mapping %px, first mapping %px\n",
1123 page, page->mapping, mapping);
1124 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1125 ll_account_page_dirtied(page, mapping);
1127 vvp_unlock_page_memcg(page);
1130 ll_xa_unlock_irqrestore(&mapping->i_pages, flags);
1132 CDEBUG(D_VFSTRACE, "mapping %p, count %d, dirtied %d\n", mapping,
1135 if (mapping->host && dirtied) {
1136 /* !PageAnon && !swapper_space */
1137 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1143 static void write_commit_callback(const struct lu_env *env, struct cl_io *io,
1144 struct folio_batch *fbatch)
1146 struct page *vmpage;
1147 struct cl_page *page;
1154 count = folio_batch_count(fbatch);
1157 for (i = 0; i < count; i++) {
1158 npgs = fbatch_at_npgs(fbatch, i);
1159 for (pg = 0; pg < npgs; pg++)
1160 SetPageUptodate(fbatch_at_pg(fbatch, i, pg));
1163 vvp_set_batch_dirty(fbatch);
1165 for (i = 0; i < count; i++) {
1166 npgs = fbatch_at_npgs(fbatch, i);
1167 for (pg = 0; pg < npgs; pg++) {
1168 vmpage = fbatch_at_pg(fbatch, i, pg);
1169 page = (struct cl_page *) vmpage->private;
1170 cl_page_disown(env, io, page);
1171 lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io));
1172 cl_page_put(env, page);
1179 /* make sure the page list is contiguous */
1180 static bool page_list_sanity_check(struct cl_object *obj,
1181 struct cl_page_list *plist)
1183 struct cl_page *page;
1184 pgoff_t index = CL_PAGE_EOF;
1186 cl_page_list_for_each(page, plist) {
1187 if (index == CL_PAGE_EOF) {
1188 index = cl_page_index(page);
1193 if (index == cl_page_index(page))
1201 /* Return how many bytes have been queued or written */
1202 int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
1204 struct cl_object *obj = io->ci_obj;
1205 struct inode *inode = vvp_object_inode(obj);
1206 struct vvp_io *vio = vvp_env_io(env);
1207 struct cl_page_list *queue = &vio->u.readwrite.vui_queue;
1208 struct cl_page *page;
1211 unsigned int npages = vio->u.readwrite.vui_queue.pl_nr;
1218 CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n",
1219 npages, vio->u.readwrite.vui_from, vio->u.readwrite.vui_to);
1221 LASSERT(page_list_sanity_check(obj, queue));
1223 /* submit IO with async write */
1224 rc = cl_io_commit_async(env, io, queue,
1225 vio->u.readwrite.vui_from,
1226 vio->u.readwrite.vui_to,
1227 write_commit_callback);
1228 npages -= queue->pl_nr; /* already committed pages */
1230 /* calculate how many bytes were written */
1231 bytes = npages << PAGE_SHIFT;
1234 bytes -= vio->u.readwrite.vui_from;
1235 if (queue->pl_nr == 0) /* last page */
1236 bytes -= PAGE_SIZE - vio->u.readwrite.vui_to;
1237 LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages);
1239 vio->u.readwrite.vui_written += bytes;
1241 CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n",
1242 npages, bytes, vio->u.readwrite.vui_written);
1244 /* the first page must have been written. */
1245 vio->u.readwrite.vui_from = 0;
1247 LASSERT(page_list_sanity_check(obj, queue));
1248 LASSERT(ergo(rc == 0, queue->pl_nr == 0));
1250 /* out of quota, try sync write */
1251 if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) {
1252 struct ll_inode_info *lli = ll_i2info(inode);
1254 rc = vvp_io_commit_sync(env, io, queue,
1255 vio->u.readwrite.vui_from,
1256 vio->u.readwrite.vui_to);
1258 vio->u.readwrite.vui_written += rc;
1261 if (lli->lli_clob != NULL)
1262 lov_read_and_clear_async_rc(lli->lli_clob);
1263 lli->lli_async_rc = 0;
1266 /* Now the pages left in the queue failed to commit; discard them
1267 * unless they were dirtied before.
1269 while (queue->pl_nr > 0) {
1270 page = cl_page_list_first(queue);
1271 cl_page_list_del(env, queue, page, true);
1273 if (!PageDirty(cl_page_vmpage(page)))
1274 cl_page_discard(env, io, page);
1276 cl_page_disown(env, io, page);
1278 /* held in ll_cl_init() */
1279 lu_ref_del(&page->cp_reference, "cl_io", io);
1280 cl_page_put(env, page);
1282 cl_page_list_fini(env, queue);
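/* One write iteration: position the iocb for O_APPEND, enforce the Lustre
 * maximum file size, copy user data into the page cache with
 * __generic_file_write_iter(), then push the queued pages out through
 * vvp_io_write_commit().
 */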
1287 static int vvp_io_write_start(const struct lu_env *env,
1288 const struct cl_io_slice *ios)
1290 struct vvp_io *vio = cl2vvp_io(env, ios);
1291 struct cl_io *io = ios->cis_io;
1292 struct cl_object *obj = io->ci_obj;
1293 struct inode *inode = vvp_object_inode(obj);
1294 struct ll_inode_info *lli = ll_i2info(inode);
1295 struct file *file = vio->vui_fd->fd_file;
1297 loff_t pos = io->u.ci_wr.wr.crw_pos;
1298 size_t crw_bytes = io->u.ci_wr.wr.crw_bytes;
1299 bool lock_inode = !IS_NOSEC(inode);
1300 size_t ci_bytes = io->ci_bytes;
1301 struct iov_iter iter;
1307 trunc_sem_down_read(&lli->lli_trunc_sem);
1309 if (!can_populate_pages(env, io, inode))
1312 if (cl_io_is_append(io)) {
1314 * PARALLEL IO This has to be changed for parallel IO doing
1315 * out-of-order writes.
1317 ll_merge_attr(env, inode);
1318 pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
1319 vio->vui_iocb->ki_pos = pos;
1321 LASSERTF(vio->vui_iocb->ki_pos == pos,
1322 "ki_pos %lld [%lld, %lld)\n",
1323 vio->vui_iocb->ki_pos,
1324 pos, pos + crw_bytes);
1327 CDEBUG(D_VFSTRACE, "%s: write [%llu, %llu)\n",
1328 file_dentry(file)->d_name.name, pos, pos + crw_bytes);
1330 /* The maximum Lustre file size is variable, based on the OST maximum
1331 * object size and number of stripes. This needs another check in
1332 * addition to the VFS checks earlier.
1334 if (pos + crw_bytes > ll_file_maxbytes(inode)) {
1336 "%s: file %s ("DFID") offset %llu > maxbytes %llu\n",
1337 ll_i2sbi(inode)->ll_fsname,
1338 file_dentry(file)->d_name.name,
1339 PFID(ll_inode2fid(inode)), pos + crw_bytes,
1340 ll_file_maxbytes(inode));
1344 /* Tests to verify we take the i_mutex correctly */
1345 if (CFS_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_SEC) && !lock_inode)
1348 if (CFS_FAIL_CHECK(OBD_FAIL_LLITE_IMUTEX_NOSEC) && lock_inode)
1351 flags = iocb_ki_flags_get(file, vio->vui_iocb);
1352 if (!iocb_ki_flags_check(flags, DIRECT)) {
1353 result = cl_io_lru_reserve(env, io, pos, crw_bytes);
1358 if (vio->vui_iter == NULL) {
1359 /* from a temp io in ll_cl_init(). */
1363 * When using the locked AIO function (generic_file_aio_write())
1364 * testing has shown the inode mutex to be a limiting factor
1365 * with multi-threaded single shared file performance. To get
1366 * around this, we now use the lockless version. To maintain
1367 * consistency, proper locking to protect against writes,
1368 * truncates, etc. is handled in the higher layers of lustre.
1370 lock_inode = !IS_NOSEC(inode);
1371 iter = *vio->vui_iter;
1373 if (unlikely(lock_inode))
1374 ll_inode_lock(inode);
1375 result = __generic_file_write_iter(vio->vui_iocb, &iter);
1376 if (unlikely(lock_inode))
1377 ll_inode_unlock(inode);
1381 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1382 result = generic_write_sync(vio->vui_iocb, result);
1387 err = generic_write_sync(vio->vui_iocb->ki_filp, pos,
1389 if (err < 0 && result > 0)
1396 result = vvp_io_write_commit(env, io);
1397 /* Simulate short commit */
1398 if (CFS_FAULT_CHECK(OBD_FAIL_LLITE_SHORT_COMMIT)) {
1399 vio->u.readwrite.vui_written >>= 1;
1400 if (vio->u.readwrite.vui_written > 0)
1401 io->ci_need_restart = 1;
1403 if (vio->u.readwrite.vui_written > 0) {
1404 result = vio->u.readwrite.vui_written;
1405 CDEBUG(D_VFSTRACE, "%s: write bytes %zd, result: %zd\n",
1406 file_dentry(file)->d_name.name,
1407 io->ci_bytes, result);
1408 io->ci_bytes += result;
1410 io->ci_continue = 0;
1413 if (vio->vui_iocb->ki_pos != (pos + io->ci_bytes - ci_bytes)) {
1415 "%s: write position mismatch: ki_pos %lld vs. pos %lld, written %zd, commit %ld: rc = %zd\n",
1416 file_dentry(file)->d_name.name,
1417 vio->vui_iocb->ki_pos, pos + io->ci_bytes - ci_bytes,
1418 written, io->ci_bytes - ci_bytes, result);
1420 * Rewind ki_pos and vui_iter to where the data has
1421 * been successfully committed.
1423 vio->vui_iocb->ki_pos = pos + io->ci_bytes - ci_bytes;
1425 if (result > 0 || result == -EIOCBQUEUED) {
1426 set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags);
1428 if (result != -EIOCBQUEUED && result < crw_bytes)
1429 io->ci_continue = 0;
1433 if (result == -EIOCBQUEUED) {
1434 io->ci_bytes += vio->u.readwrite.vui_written;
1435 vio->vui_iocb->ki_pos = pos +
1436 vio->u.readwrite.vui_written;
1443 static void vvp_io_rw_end(const struct lu_env *env,
1444 const struct cl_io_slice *ios)
1446 struct inode *inode = vvp_object_inode(ios->cis_obj);
1447 struct ll_inode_info *lli = ll_i2info(inode);
1449 trunc_sem_up_read(&lli->lli_trunc_sem);
1452 static void vvp_io_write_end(const struct lu_env *env,
1453 const struct cl_io_slice *ios)
1455 struct inode *inode = vvp_object_inode(ios->cis_obj);
1456 struct cl_io *io = ios->cis_io;
1458 vvp_io_rw_end(env, ios);
1460 /* Update size and blocks for LSOM (best effort) */
1461 if (!io->ci_ignore_layout && cl_io_is_sync_write(io))
1462 ll_merge_attr_try(env, inode);
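/* Let the kernel fault the page in via ll_filemap_fault() and translate the
 * VM_FAULT_* flags into an error code; on success the vmpage is returned
 * locked in cfio->ft_vmpage.
 */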
1466 static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
1468 struct vm_fault *vmf = cfio->ft_vmf;
1470 cfio->ft_flags = ll_filemap_fault(cfio->ft_vma, vmf);
1471 cfio->ft_flags_valid = 1;
1474 /* success, vmpage is locked */
1475 LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
1476 get_vmf_address(vmf));
1477 if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
1478 lock_page(vmf->page);
1479 cfio->ft_flags |= VM_FAULT_LOCKED;
1482 cfio->ft_vmpage = vmf->page;
1487 if (cfio->ft_flags & VM_FAULT_SIGBUS) {
1488 CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", get_vmf_address(vmf));
1492 if (cfio->ft_flags & VM_FAULT_OOM) {
1493 CDEBUG(D_PAGE, "got addr %p - OOM\n", get_vmf_address(vmf));
1497 if (cfio->ft_flags & VM_FAULT_RETRY)
1500 CERROR("unknown error in page fault %d\n", cfio->ft_flags);
1505 static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io,
1506 struct folio_batch *fbatch)
1508 vvp_set_batch_dirty(fbatch);
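/* Handle a read fault or mkwrite: check the size under lli_trunc_sem, obtain
 * a locked vmpage (from the kernel fault, or the one supplied for mkwrite),
 * guard against races with truncate, and wrap it in a cl_page. For mkwrite
 * the page is committed first so space is accounted before it is made dirty.
 */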
1511 static int vvp_io_fault_start(const struct lu_env *env,
1512 const struct cl_io_slice *ios)
1514 struct vvp_io *vio = cl2vvp_io(env, ios);
1515 struct cl_io *io = ios->cis_io;
1516 struct cl_object *obj = io->ci_obj;
1517 struct inode *inode = vvp_object_inode(obj);
1518 struct ll_inode_info *lli = ll_i2info(inode);
1519 struct cl_fault_io *fio = &io->u.ci_fault;
1520 struct vvp_fault_io *cfio = &vio->u.fault;
1523 struct page *vmpage = NULL;
1524 struct cl_page *page;
1530 trunc_sem_down_read_nowait(&lli->lli_trunc_sem);
1532 /* offset of the last byte on the page */
1533 offset = ((fio->ft_index + 1) << PAGE_SHIFT) - 1;
1534 LASSERT((offset >> PAGE_SHIFT) == fio->ft_index);
1535 result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL);
1539 CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_FAULT_PAUSE, cfs_fail_val);
1541 /* must return locked page */
1542 if (fio->ft_mkwrite) {
1543 LASSERT(cfio->ft_vmpage != NULL);
1544 vmpage = cfio->ft_vmpage;
1547 * page was truncated and the lock was cancelled, return ENODATA
1548 * so that VM_FAULT_NOPAGE will be returned to handle_mm_fault()
1549 * XXX: cannot return VM_FAULT_RETRY to vfs since we cannot
1550 * release mmap_lock and VM_FAULT_RETRY implies that the
1551 * mmap_lock is released.
1553 if (!PageUptodate(vmpage))
1554 GOTO(out, result = -ENODATA);
1556 result = vvp_io_kernel_fault(cfio);
1561 vmpage = cfio->ft_vmpage;
1562 LASSERT(PageLocked(vmpage));
1564 if (CFS_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
1565 generic_error_remove_folio(vmpage->mapping, page_folio(vmpage));
1567 size = i_size_read(inode);
1568 /* Though we already hold a cl_lock on this page,
1569 * it can still be truncated locally.
1571 if (unlikely((vmpage->mapping != inode->i_mapping) ||
1572 (page_offset(vmpage) > size))) {
1573 CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
1575 /* return +1 to stop cl_io_loop() and ll_fault() will catch
1578 GOTO(out, result = + 1);
1581 last_index = (size - 1) >> PAGE_SHIFT;
1583 if (fio->ft_mkwrite) {
1585 * Capture the size while holding the lli_trunc_sem from above;
1586 * we want to make sure that we complete the mkwrite action
1587 * while holding this lock. We need to make sure that we are
1588 * not past the end of the file.
1590 if (last_index < fio->ft_index) {
1592 "llite: mkwrite and truncate race happened: %p: 0x%lx 0x%lx\n",
1593 vmpage->mapping, fio->ft_index, last_index);
1595 * We need to return if we are
1596 * past the end of the file. This will propagate
1597 * up the call stack to ll_page_mkwrite where
1598 * we will return VM_FAULT_NOPAGE. Any non-negative
1599 * value returned here will be silently
1600 * converted to 0. If the vmpage->mapping is null
1601 * the error code would be converted back to ENODATA
1602 * in ll_page_mkwrite0. Thus we return -ENODATA
1603 * to handle both cases
1605 GOTO(out, result = -ENODATA);
1609 page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
1611 GOTO(out, result = PTR_ERR(page));
1613 /* if the page will be written, then add this page into the cache earlier. */
1614 if (fio->ft_mkwrite) {
1615 wait_on_page_writeback(vmpage);
1616 if (!PageDirty(vmpage)) {
1617 struct cl_page_list *plist = &vio->u.fault.ft_queue;
1620 /* vvp_page_assume() calls wait_on_page_writeback(). */
1621 cl_page_assume(env, io, page);
1623 cl_page_list_init(plist);
1624 cl_page_list_add(plist, page, true);
1627 if (last_index == cl_page_index(page))
1628 to = ((size - 1) & ~PAGE_MASK) + 1;
1630 /* Do not set Dirty bit here so that in case IO is
1631 * started before the page is really made dirty, we
1632 * still have a chance to detect it.
1634 result = cl_io_commit_async(env, io, plist, 0, to,
1635 mkwrite_commit_callback);
1636 /* We have the overquota flag, so try a sync write to check
1637 * whether we are indeed out of quota
1639 if (result == -EDQUOT) {
1641 result = vvp_io_commit_sync(env, io,
1645 cl_page_own(env, io, page);
1646 cl_page_list_add(plist, page, true);
1647 lu_ref_add(&page->cp_reference,
1649 result = cl_io_commit_async(env, io,
1651 mkwrite_commit_callback);
1654 cl_page_put(env, page);
1658 LASSERT(cl_page_is_owned(page, io));
1659 cl_page_list_fini(env, plist);
1663 cl_page_discard(env, io, page);
1664 cl_page_disown(env, io, page);
1666 cl_page_put(env, page);
1668 /* we're in big trouble, what can we do now? */
1669 if (result == -EDQUOT)
1673 cl_page_disown(env, io, page);
1679 * The ft_index is only used in the case of mkwrite action. We need to
1680 * check that our assertions are correct, since we should have caught this
1683 LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index);
1684 if (fio->ft_index == last_index)
1685 /* Last page is mapped partially. */
1686 fio->ft_bytes = size - (fio->ft_index << PAGE_SHIFT);
1688 fio->ft_bytes = PAGE_SIZE;
1690 lu_ref_add(&page->cp_reference, "fault", io);
1691 fio->ft_page = page;
1695 /* return unlocked vmpage to avoid deadlocking */
1697 unlock_page(vmpage);
1699 cfio->ft_flags &= ~VM_FAULT_LOCKED;
1704 static void vvp_io_fault_end(const struct lu_env *env,
1705 const struct cl_io_slice *ios)
1707 struct inode *inode = vvp_object_inode(ios->cis_obj);
1708 struct ll_inode_info *lli = ll_i2info(inode);
1710 CLOBINVRNT(env, ios->cis_io->ci_obj,
1711 vvp_object_invariant(ios->cis_io->ci_obj));
1712 trunc_sem_up_read(&lli->lli_trunc_sem);
1715 static int vvp_io_fsync_start(const struct lu_env *env,
1716 const struct cl_io_slice *ios)
1718 /* mark the TOWRITE bit on each dirty page in the radix tree to verify pages
1719 * have been written, but this is difficult because of races.
1724 static void vvp_io_fsync_end(const struct lu_env *env,
1725 const struct cl_io_slice *ios)
1727 struct inode *inode = vvp_object_inode(ios->cis_obj);
1728 struct cl_io *io = ios->cis_io;
1730 /* Update size and blocks for LSOM (best effort) */
1731 if (!io->ci_ignore_layout)
1732 ll_merge_attr_try(env, inode);
1735 static int vvp_io_read_ahead(const struct lu_env *env,
1736 const struct cl_io_slice *ios,
1737 pgoff_t start, struct cl_read_ahead *ra)
1743 if (ios->cis_io->ci_type == CIT_READ ||
1744 ios->cis_io->ci_type == CIT_FAULT) {
1745 struct vvp_io *vio = cl2vvp_io(env, ios);
1747 if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1748 ra->cra_end_idx = CL_PAGE_EOF;
1749 result = 1; /* no need to call down */
1756 static int vvp_io_lseek_lock(const struct lu_env *env,
1757 const struct cl_io_slice *ios)
1759 struct cl_io *io = ios->cis_io;
1760 __u64 lock_start = io->u.ci_lseek.ls_start;
1761 __u64 lock_end = OBD_OBJECT_EOF;
1762 __u32 enqflags = CEF_MUST; /* always take client lock */
1764 return vvp_io_one_lock(env, io, enqflags, CLM_READ,
1765 lock_start, lock_end);
1768 static int vvp_io_lseek_start(const struct lu_env *env,
1769 const struct cl_io_slice *ios)
1771 struct cl_io *io = ios->cis_io;
1772 struct inode *inode = vvp_object_inode(io->ci_obj);
1773 __u64 start = io->u.ci_lseek.ls_start;
1775 ll_inode_lock(inode);
1776 inode_dio_wait(inode);
1778 /* At this moment we hold the DLM lock, so just update the inode
1779 * to know the file size.
1781 ll_merge_attr(env, inode);
1782 if (start >= i_size_read(inode)) {
1783 io->u.ci_lseek.ls_result = -ENXIO;
1789 static void vvp_io_lseek_end(const struct lu_env *env,
1790 const struct cl_io_slice *ios)
1792 struct cl_io *io = ios->cis_io;
1793 struct inode *inode = vvp_object_inode(io->ci_obj);
1795 if (io->u.ci_lseek.ls_result > i_size_read(inode))
1796 io->u.ci_lseek.ls_result = -ENXIO;
1798 ll_inode_unlock(inode);
1801 static const struct cl_io_operations vvp_io_ops = {
1804 .cio_fini = vvp_io_fini,
1805 .cio_iter_init = vvp_io_read_iter_init,
1806 .cio_lock = vvp_io_read_lock,
1807 .cio_start = vvp_io_read_start,
1808 .cio_end = vvp_io_rw_end,
1809 .cio_advance = vvp_io_advance,
1812 .cio_fini = vvp_io_fini,
1813 .cio_iter_init = vvp_io_write_iter_init,
1814 .cio_iter_fini = vvp_io_write_iter_fini,
1815 .cio_lock = vvp_io_write_lock,
1816 .cio_start = vvp_io_write_start,
1817 .cio_end = vvp_io_write_end,
1818 .cio_advance = vvp_io_advance,
1821 .cio_fini = vvp_io_setattr_fini,
1822 .cio_iter_init = vvp_io_setattr_iter_init,
1823 .cio_lock = vvp_io_setattr_lock,
1824 .cio_start = vvp_io_setattr_start,
1825 .cio_end = vvp_io_setattr_end
1828 .cio_fini = vvp_io_fault_fini,
1829 .cio_iter_init = vvp_io_fault_iter_init,
1830 .cio_lock = vvp_io_fault_lock,
1831 .cio_start = vvp_io_fault_start,
1832 .cio_end = vvp_io_fault_end,
1835 .cio_start = vvp_io_fsync_start,
1836 .cio_fini = vvp_io_fini,
1837 .cio_end = vvp_io_fsync_end,
1840 .cio_fini = vvp_io_fini
1843 .cio_fini = vvp_io_fini
1846 .cio_fini = vvp_io_fini
1849 .cio_fini = vvp_io_fini,
1850 .cio_lock = vvp_io_lseek_lock,
1851 .cio_start = vvp_io_lseek_start,
1852 .cio_end = vvp_io_lseek_end,
1855 .cio_read_ahead = vvp_io_read_ahead
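/* Entry point from llite: attach the VVP slice and vvp_io_ops to this cl_io,
 * record the total transfer size for read/write, and refresh the layout
 * lock/version before the I/O starts.
 */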
1858 int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
1861 struct vvp_io *vio = vvp_env_io(env);
1862 struct inode *inode = vvp_object_inode(obj);
1865 CLOBINVRNT(env, obj, vvp_object_invariant(obj));
1868 CDEBUG(D_VFSTRACE, DFID" ignore/verify layout %d/%d, layout version %d restore needed %d\n",
1869 PFID(lu_object_fid(&obj->co_lu)),
1870 io->ci_ignore_layout, io->ci_verify_layout,
1871 vio->vui_layout_gen, io->ci_restore_needed);
1873 CL_IO_SLICE_CLEAN(vio, vui_cl);
1874 cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops);
1875 vio->vui_ra_valid = false;
1877 if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
1879 struct ll_inode_info *lli = ll_i2info(inode);
1881 bytes = io->u.ci_rw.crw_bytes;
1882 /* "If nbyte is 0, read() will return 0 and have no other
1883 * results." -- Single Unix Spec
1888 vio->vui_tot_bytes = bytes;
1890 /* for read/write, we store the process jobid/gid/uid in the
1891 * inode, and it'll be fetched by osc when building RPC.
1893 * it's not accurate if the file is shared by different
1896 lustre_get_jobid(lli->lli_jobid, sizeof(lli->lli_jobid));
1897 lli->lli_uid = from_kuid(&init_user_ns, current_uid());
1898 lli->lli_gid = from_kgid(&init_user_ns, current_gid());
1899 } else if (io->ci_type == CIT_SETATTR) {
1900 if (!cl_io_is_trunc(io))
1901 io->ci_lockreq = CILR_MANDATORY;
1904 /* Enqueue layout lock and get layout version. We need to do this
1905 * even for operations that require opening the file, such as read and write,
1906 * because the layout lock might not be granted in IT_OPEN.
1908 if (result == 0 && !io->ci_ignore_layout) {
1909 result = ll_layout_refresh(inode, &vio->vui_layout_gen);
1910 if (result == -ENOENT)
1911 /* If the inode on MDS has been removed, but the objects
1912 * on OSTs haven't been destroyed (async unlink), layout
1913 * fetch will return -ENOENT; we ignore this error
1914 * and continue with dirty flush. LU-3230.
1918 CERROR("%s: refresh file layout " DFID " error %d.\n",
1919 ll_i2sbi(inode)->ll_fsname,
1920 PFID(lu_object_fid(&obj->co_lu)), result);
1923 #ifdef HAVE_INVALIDATE_LOCK
1924 if (io->ci_invalidate_page_cache)
1925 filemap_invalidate_lock(inode->i_mapping);
1926 #endif /* HAVE_INVALIDATE_LOCK */
1928 io->ci_result = result < 0 ? result : 0;