4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): this view is elided; sp_inode appears to be a member of a
 * struct (presumably struct split_param, used by the layout-split close
 * path below) whose opening lines are not visible — confirm in full source. */
34 struct inode *sp_inode;
/* Forward declarations for helpers defined later in this file. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data from its slab cache.  GFP_NOFS
 * avoids filesystem recursion during memory reclaim.  NOTE(review):
 * allocation-failure handling and the return statement are elided here. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Fresh handle: no write failure recorded yet. */
75 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client's current view of mode/times/size so the MDT can
 * merge them at close time. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
/* Convert in-kernel inode flags to the on-wire (ext-style) flag format. */
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
/* Bail out early if the MDC connection is already gone. */
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
147 /* We leak openhandle and request here on error, but not much to be
148 * done in OOM case since app won't retry close on error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing.  NOTE(review): the switch statement itself and
 * some break/fallthrough lines are elided in this view. */
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE;
158 op_data->op_xvalid |= OP_XVALID_BLOCKS;
159 case MDS_CLOSE_LAYOUT_SPLIT:
160 case MDS_CLOSE_LAYOUT_SWAP: {
161 struct split_param *sp = data;
163 LASSERT(data != NULL);
164 op_data->op_bias |= bias;
165 op_data->op_data_version = 0;
166 op_data->op_lease_handle = och->och_lease_handle;
/* SPLIT passes a split_param; SWAP/MERGE pass the victim inode directly. */
167 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
168 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
169 op_data->op_mirror_id = sp->sp_mirror_id;
171 op_data->op_fid2 = *ll_inode2fid(data);
176 case MDS_CLOSE_RESYNC_DONE: {
177 struct ll_ioc_lease *ioc = data;
179 LASSERT(data != NULL);
180 op_data->op_attr_blocks +=
181 ioc->lil_count * op_data->op_attr_blocks;
182 op_data->op_attr.ia_valid |= ATTR_SIZE;
183 op_data->op_xvalid |= OP_XVALID_BLOCKS;
184 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
186 op_data->op_lease_handle = och->och_lease_handle;
187 op_data->op_data = &ioc->lil_ids[0];
188 op_data->op_data_size =
189 ioc->lil_count * sizeof(ioc->lil_ids[0]);
193 case MDS_HSM_RELEASE:
194 LASSERT(data != NULL);
195 op_data->op_bias |= MDS_HSM_RELEASE;
196 op_data->op_data_version = *(__u64 *)data;
197 op_data->op_lease_handle = och->och_lease_handle;
198 op_data->op_attr.ia_valid |= ATTR_SIZE;
199 op_data->op_xvalid |= OP_XVALID_BLOCKS;
203 LASSERT(data == NULL);
/* Sizes/blocks not explicitly packed are fetched lazily by the server. */
207 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
208 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
209 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
210 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
212 rc = md_close(md_exp, op_data, och->och_mod, &req);
213 if (rc != 0 && rc != -EINTR)
214 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
215 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If a biased close succeeded, verify the server actually executed the
 * close intent. */
217 if (rc == 0 && op_data->op_bias & bias) {
218 struct mdt_body *body;
220 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
221 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
225 ll_finish_md_op_data(op_data);
229 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so stale use is detectable. */
230 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
233 ptlrpc_req_finished(req); /* This is close request */
/* Drop one reference on the MDS open handle matching @fmode and, when the
 * last user goes away, send the actual close RPC to the MDT. */
237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
239 struct ll_inode_info *lli = ll_i2info(inode);
240 struct obd_client_handle **och_p;
241 struct obd_client_handle *och;
/* Select the per-mode open handle and its use count. */
246 if (fmode & FMODE_WRITE) {
247 och_p = &lli->lli_mds_write_och;
248 och_usecount = &lli->lli_open_fd_write_count;
249 } else if (fmode & FMODE_EXEC) {
250 och_p = &lli->lli_mds_exec_och;
251 och_usecount = &lli->lli_open_fd_exec_count;
253 LASSERT(fmode & FMODE_READ);
254 och_p = &lli->lli_mds_read_och;
255 och_usecount = &lli->lli_open_fd_read_count;
258 mutex_lock(&lli->lli_och_mutex);
259 if (*och_usecount > 0) {
260 /* There are still users of this handle, so skip
262 mutex_unlock(&lli->lli_och_mutex);
268 mutex_unlock(&lli->lli_och_mutex);
271 /* There might be a race and this handle may already
273 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and lease if held, drop
 * this fd's reference on the per-mode open handle, and close the MDS open
 * handle unless a cached OPEN lock lets us skip talking to the MDS. */
279 static int ll_md_close(struct inode *inode, struct file *file)
281 union ldlm_policy_data policy = {
282 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take a ref. */
284 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
286 struct ll_inode_info *lli = ll_i2info(inode);
287 struct lustre_handle lockh;
288 enum ldlm_mode lockmode;
292 /* clear group lock, if present */
293 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
294 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
296 if (fd->fd_lease_och != NULL) {
299 /* Usually the lease is not released when the
300 * application crashed, we need to release here. */
301 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
302 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
303 PFID(&lli->lli_fid), rc, lease_broken);
305 fd->fd_lease_och = NULL;
/* Close the private open handle taken over for a lease, if any. */
308 if (fd->fd_och != NULL) {
309 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
314 /* Let's see if we have good enough OPEN lock on the file and if
315 we can skip talking to MDS */
316 mutex_lock(&lli->lli_och_mutex);
317 if (fd->fd_omode & FMODE_WRITE) {
319 LASSERT(lli->lli_open_fd_write_count);
320 lli->lli_open_fd_write_count--;
321 } else if (fd->fd_omode & FMODE_EXEC) {
323 LASSERT(lli->lli_open_fd_exec_count);
324 lli->lli_open_fd_exec_count--;
327 LASSERT(lli->lli_open_fd_read_count);
328 lli->lli_open_fd_read_count--;
330 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close RPC. */
332 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
333 LDLM_IBITS, &policy, lockmode, &lockh))
334 rc = ll_md_real_close(inode, fd->fd_omode);
337 LUSTRE_FPRIVATE(file) = NULL;
338 ll_file_data_put(fd);
343 /* While this returns an error code, fput() the caller does not, so we need
344 * to make every effort to clean up all of our state here. Also, applications
345 * rarely check close errors and even if an error is returned they will not
346 * re-try the close call.
348 int ll_file_release(struct inode *inode, struct file *file)
350 struct ll_file_data *fd;
351 struct ll_sb_info *sbi = ll_i2sbi(inode);
352 struct ll_inode_info *lli = ll_i2info(inode);
356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
357 PFID(ll_inode2fid(inode)), inode);
/* Don't account a release of the root dentry in the per-sb stats. */
359 if (inode->i_sb->s_root != file_dentry(file))
360 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
361 fd = LUSTRE_FPRIVATE(file);
364 /* The last ref on @file, maybe not the owner pid of statahead,
365 * because parent and child process can share the same file handle. */
366 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
367 ll_deauthorize_statahead(inode, fd);
/* Root directory: nothing was opened on the MDS, just free local state. */
369 if (inode->i_sb->s_root == file_dentry(file)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
/* For regular files, surface any async write errors recorded on the
 * cl_object so close() reports them. */
375 if (!S_ISDIR(inode->i_mode)) {
376 if (lli->lli_clob != NULL)
377 lov_read_and_clear_async_rc(lli->lli_clob);
378 lli->lli_async_rc = 0;
381 rc = ll_md_close(inode, file);
383 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
384 libcfs_debug_dumplog();
/* read_cache_page() callback: fill @page from the inline niobuf data that
 * arrived with the open reply (Data-on-MDT), zero-padding any tail short
 * of PAGE_SIZE, and mark the page up to date. */
389 static inline int ll_dom_readpage(void *data, struct page *page)
391 struct niobuf_local *lnb = data;
394 kaddr = ll_kmap_atomic(page, KM_USER0);
395 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
396 if (lnb->lnb_len < PAGE_SIZE)
397 memset(kaddr + lnb->lnb_len, 0,
398 PAGE_SIZE - lnb->lnb_len);
399 flush_dcache_page(page);
400 SetPageUptodate(page);
401 ll_kunmap_atomic(kaddr, KM_USER0);
/* Populate the page cache with file data the MDT returned inline in the
 * open reply (Data-on-MDT optimization), after validating the offset and
 * length against the reported DOM size. */
407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
408 struct lookup_intent *it)
410 struct ll_inode_info *lli = ll_i2info(inode);
411 struct cl_object *obj = lli->lli_clob;
412 struct address_space *mapping = inode->i_mapping;
414 struct niobuf_remote *rnb;
415 struct mdt_body *body;
417 unsigned long index, start;
418 struct niobuf_local lnb;
425 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
429 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
430 if (rnb == NULL || rnb->rnb_len == 0)
433 /* LU-11595: Server may return whole file and that is OK always or
434 * it may return just file tail and its offset must be aligned with
435 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
436 * smaller than offset may be not aligned and that data is just ignored.
438 if (rnb->rnb_offset % PAGE_SIZE)
441 /* Server returns whole file or just file tail if it fills in reply
442 * buffer, in both cases total size should be equal to the file size.
444 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
445 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
446 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
447 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
448 rnb->rnb_len, body->mbo_dom_size);
452 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
453 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* Inline payload immediately follows the remote niobuf descriptor. */
455 data = (char *)rnb + sizeof(*rnb);
457 lnb.lnb_file_offset = rnb->rnb_offset;
458 start = lnb.lnb_file_offset / PAGE_SIZE;
460 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
461 lnb.lnb_page_offset = 0;
/* Walk the payload one PAGE_SIZE slice at a time, letting
 * read_cache_page()/ll_dom_readpage copy each slice into the cache. */
463 lnb.lnb_data = data + (index << PAGE_SHIFT);
464 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
465 if (lnb.lnb_len > PAGE_SIZE)
466 lnb.lnb_len = PAGE_SIZE;
468 vmpage = read_cache_page(mapping, index + start,
469 ll_dom_readpage, &lnb);
470 if (IS_ERR(vmpage)) {
471 CWARN("%s: cannot fill page %lu for "DFID
472 " with data: rc = %li\n",
473 ll_i2sbi(inode)->ll_fsname, index + start,
474 PFID(lu_object_fid(&obj->co_lu)),
480 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Send an intent-OPEN to the MDS for @de (normally open-by-FID; packs the
 * name only for old servers or under fault injection), then set up the
 * resulting inode, dentry validity, lock data and inline DOM data. */
484 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
485 struct lookup_intent *itp)
487 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
488 struct dentry *parent = de->d_parent;
491 struct md_op_data *op_data;
492 struct ptlrpc_request *req = NULL;
496 LASSERT(parent != NULL);
497 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
499 /* if server supports open-by-fid, or file name is invalid, don't pack
500 * name in open request */
501 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
502 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
504 len = de->d_name.len;
505 name = kmalloc(len + 1, GFP_NOFS);
/* Copy the name under d_lock; retry/bail if it changed length under us
 * (a concurrent rename). */
510 spin_lock(&de->d_lock);
511 if (len != de->d_name.len) {
512 spin_unlock(&de->d_lock);
516 memcpy(name, de->d_name.name, len);
518 spin_unlock(&de->d_lock);
520 if (!lu_name_is_valid_2(name, len)) {
526 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
527 name, len, 0, LUSTRE_OPC_ANY, NULL);
528 if (IS_ERR(op_data)) {
530 RETURN(PTR_ERR(op_data));
532 op_data->op_data = lmm;
533 op_data->op_data_size = lmmsize;
535 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
536 &ll_md_blocking_ast, 0);
538 ll_finish_md_op_data(op_data);
540 /* reason for keep own exit path - don't flood log
541 * with messages with -ESTALE errors.
/* Server opened the file but the open is unusable: release the server-side
 * open handle before erroring out. */
543 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
544 it_open_error(DISP_OPEN_OPEN, itp))
546 ll_release_openhandle(de, itp);
550 if (it_disposition(itp, DISP_LOOKUP_NEG))
551 GOTO(out, rc = -ENOENT);
553 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
554 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
555 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
559 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
561 if (!rc && itp->it_lock_mode) {
562 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
563 struct ldlm_lock *lock;
564 bool has_dom_bit = false;
566 /* If we got a lock back and it has a LOOKUP bit set,
567 * make sure the dentry is marked as valid so we can find it.
568 * We don't need to care about actual hashing since other bits
569 * of kernel will deal with that later.
571 lock = ldlm_handle2lock(&handle);
573 has_dom_bit = ldlm_has_dom(lock);
574 if (lock->l_policy_data.l_inodebits.bits &
575 MDS_INODELOCK_LOOKUP)
576 d_lustre_revalidate(de);
580 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
582 ll_dom_finish_open(de->d_inode, req, itp);
586 ptlrpc_req_finished(req);
587 ll_intent_drop_lock(itp);
589 /* We did open by fid, but by the time we got to the server,
590 * the object disappeared. If this is a create, we cannot really
591 * tell the userspace that the file it was trying to create
592 * does not exist. Instead let's return -ESTALE, and the VFS will
593 * retry the create with LOOKUP_REVAL that we are going to catch
594 * in ll_revalidate_dentry() and use lookup then.
596 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Initialize an obd_client_handle from the mdt_body in the intent's open
 * reply and register it for open replay on recovery. */
602 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
603 struct obd_client_handle *och)
605 struct mdt_body *body;
607 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
608 och->och_open_handle = body->mbo_open_handle;
609 och->och_fid = body->mbo_fid1;
/* The lease handle reuses the intent's lock handle cookie. */
610 och->och_lease_handle.cookie = it->it_lock_handle;
611 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
612 och->och_flags = it->it_flags;
614 return md_set_open_replay_data(md_exp, och, it);
/* Complete the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the file and initialize its readahead
 * state, open mode and cl_context bookkeeping. */
617 static int ll_local_open(struct file *file, struct lookup_intent *it,
618 struct ll_file_data *fd, struct obd_client_handle *och)
620 struct inode *inode = file_inode(file);
623 LASSERT(!LUSTRE_FPRIVATE(file));
630 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
635 LUSTRE_FPRIVATE(file) = fd;
636 ll_readahead_init(inode, &fd->fd_ras);
637 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
639 /* ll_cl_context initialize */
640 rwlock_init(&fd->fd_lock);
641 INIT_LIST_HEAD(&fd->fd_lccs);
646 /* Open a file, and (for the very first open) create objects on the OSTs at
647 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
648 * creation or open until ll_lov_setstripe() ioctl is called.
650 * If we already have the stripe MD locally then we don't request it in
651 * md_open(), by passing a lmm_size = 0.
653 * It is up to the application to ensure no other processes open this file
654 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
655 * used. We might be able to avoid races of that sort by getting lli_open_sem
656 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
657 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
659 int ll_file_open(struct inode *inode, struct file *file)
661 struct ll_inode_info *lli = ll_i2info(inode);
662 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
663 .it_flags = file->f_flags };
664 struct obd_client_handle **och_p = NULL;
665 __u64 *och_usecount = NULL;
666 struct ll_file_data *fd;
670 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
671 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
673 it = file->private_data; /* XXX: compat macro */
674 file->private_data = NULL; /* prevent ll_local_open assertion */
676 fd = ll_file_data_get();
678 GOTO(out_nofiledata, rc = -ENOMEM);
681 if (S_ISDIR(inode->i_mode))
682 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, attach fd and we are done. */
684 if (inode->i_sb->s_root == file_dentry(file)) {
685 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent (e.g. NFS export path): build our own IT_OPEN. */
689 if (!it || !it->it_disposition) {
690 /* Convert f_flags into access mode. We cannot use file->f_mode,
691 * because everything but O_ACCMODE mask was stripped from
693 if ((oit.it_flags + 1) & O_ACCMODE)
695 if (file->f_flags & O_TRUNC)
696 oit.it_flags |= FMODE_WRITE;
698 /* kernel only call f_op->open in dentry_open. filp_open calls
699 * dentry_open after call to open_namei that checks permissions.
700 * Only nfsd_open call dentry_open directly without checking
701 * permissions and because of that this code below is safe.
703 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
704 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
706 /* We do not want O_EXCL here, presumably we opened the file
707 * already? XXX - NFS implications? */
708 oit.it_flags &= ~O_EXCL;
710 /* bug20584, if "it_flags" contains O_CREAT, the file will be
711 * created if necessary, then "IT_CREAT" should be set to keep
712 * consistent with it */
713 if (oit.it_flags & O_CREAT)
714 oit.it_op |= IT_CREAT;
720 /* Let's see if we have file open on MDS already. */
721 if (it->it_flags & FMODE_WRITE) {
722 och_p = &lli->lli_mds_write_och;
723 och_usecount = &lli->lli_open_fd_write_count;
724 } else if (it->it_flags & FMODE_EXEC) {
725 och_p = &lli->lli_mds_exec_och;
726 och_usecount = &lli->lli_open_fd_exec_count;
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
732 mutex_lock(&lli->lli_och_mutex);
733 if (*och_p) { /* Open handle is present */
734 if (it_disposition(it, DISP_OPEN_OPEN)) {
735 /* Well, there's extra open request that we do not need,
736 let's close it somehow. This will decref request. */
737 rc = it_open_error(DISP_OPEN_OPEN, it);
739 mutex_unlock(&lli->lli_och_mutex);
740 GOTO(out_openerr, rc);
743 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS open handle for this fd. */
747 rc = ll_local_open(file, it, fd, NULL);
750 mutex_unlock(&lli->lli_och_mutex);
751 GOTO(out_openerr, rc);
754 LASSERT(*och_usecount == 0);
755 if (!it->it_disposition) {
756 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
757 /* We cannot just request lock handle now, new ELC code
758 means that one of other OPEN locks for this file
759 could be cancelled, and since blocking ast handler
760 would attempt to grab och_mutex as well, that would
761 result in a deadlock */
762 mutex_unlock(&lli->lli_och_mutex);
764 * Normally called under two situations:
766 * 2. A race/condition on MDS resulting in no open
767 * handle to be returned from LOOKUP|OPEN request,
768 * for example if the target entry was a symlink.
770 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
771 * marked by a bit set in ll_iget_for_nfs. Clear the
772 * bit so that it's not confusing later callers.
774 * NB; when ldd is NULL, it must have come via normal
775 * lookup path only, since ll_iget_for_nfs always calls
778 if (ldd && ldd->lld_nfs_dentry) {
779 ldd->lld_nfs_dentry = 0;
780 it->it_flags |= MDS_OPEN_LOCK;
784 * Always specify MDS_OPEN_BY_FID because we don't want
785 * to get file with different fid.
787 it->it_flags |= MDS_OPEN_BY_FID;
788 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
791 GOTO(out_openerr, rc);
795 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
797 GOTO(out_och_free, rc = -ENOMEM);
801 /* md_intent_lock() didn't get a request ref if there was an
802 * open error, so don't do cleanup on the request here
804 /* XXX (green): Should not we bail out on any error here, not
805 * just open error? */
806 rc = it_open_error(DISP_OPEN_OPEN, it);
808 GOTO(out_och_free, rc);
810 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
811 "inode %p: disposition %x, status %d\n", inode,
812 it_disposition(it, ~0), it->it_status);
814 rc = ll_local_open(file, it, fd, *och_p);
816 GOTO(out_och_free, rc);
818 mutex_unlock(&lli->lli_och_mutex);
821 /* Must do this outside lli_och_mutex lock to prevent deadlock where
822 different kind of OPEN lock for this same inode gets cancelled
823 by ldlm_cancel_lru */
824 if (!S_ISREG(inode->i_mode))
825 GOTO(out_och_free, rc);
827 cl_lov_delay_create_clear(&file->f_flags);
828 GOTO(out_och_free, rc);
/* Error/cleanup paths below; labels themselves are elided in this view. */
832 if (och_p && *och_p) {
833 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
834 *och_p = NULL; /* OBD_FREE writes some magic there */
837 mutex_unlock(&lli->lli_och_mutex);
840 if (lli->lli_opendir_key == fd)
841 ll_deauthorize_statahead(inode, fd);
843 ll_file_data_put(fd);
845 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken for DISP_ENQ_OPEN_REF. */
849 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
850 ptlrpc_req_finished(it->it_request);
851 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback simply cancel the
 * lease lock asynchronously (the application detects the broken lease
 * later); the cancel case needs no openhandle processing here. */
857 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
858 struct ldlm_lock_desc *desc, void *data, int flag)
861 struct lustre_handle lockh;
865 case LDLM_CB_BLOCKING:
866 ldlm_lock2handle(lock, &lockh);
867 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
869 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
873 case LDLM_CB_CANCELING:
881 * When setting a lease on a file, we take ownership of the lli_mds_*_och
882 * and save it as fd->fd_och so as to force client to reopen the file even
883 * if it has an open lock in cache already.
885 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
886 struct lustre_handle *old_open_handle)
888 struct ll_inode_info *lli = ll_i2info(inode);
889 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
890 struct obd_client_handle **och_p;
895 /* Get the openhandle of the file */
896 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
897 if (fd->fd_lease_och != NULL)
898 GOTO(out_unlock, rc = -EBUSY);
900 if (fd->fd_och == NULL) {
901 if (file->f_mode & FMODE_WRITE) {
902 LASSERT(lli->lli_mds_write_och != NULL);
903 och_p = &lli->lli_mds_write_och;
904 och_usecount = &lli->lli_open_fd_write_count;
906 LASSERT(lli->lli_mds_read_och != NULL);
907 och_p = &lli->lli_mds_read_och;
908 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot take ownership while other fds still share the handle. */
911 if (*och_usecount > 1)
912 GOTO(out_unlock, rc = -EBUSY);
/* Report the old handle so the MDT can match the lease open to it. */
919 *old_open_handle = fd->fd_och->och_open_handle;
923 mutex_unlock(&lli->lli_och_mutex);
928 * Release ownership on lli_mds_*_och when putting back a file lease.
930 static int ll_lease_och_release(struct inode *inode, struct file *file)
932 struct ll_inode_info *lli = ll_i2info(inode);
933 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
934 struct obd_client_handle **och_p;
935 struct obd_client_handle *old_och = NULL;
940 mutex_lock(&lli->lli_och_mutex);
941 if (file->f_mode & FMODE_WRITE) {
942 och_p = &lli->lli_mds_write_och;
943 och_usecount = &lli->lli_open_fd_write_count;
945 och_p = &lli->lli_mds_read_och;
946 och_usecount = &lli->lli_open_fd_read_count;
949 /* The file may have been open by another process (broken lease) so
950 * *och_p is not NULL. In this case we should simply increase usecount
953 if (*och_p != NULL) {
954 old_och = fd->fd_och;
961 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex. */
964 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
970 * Acquire a lease and open the file.
972 static struct obd_client_handle *
973 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
976 struct lookup_intent it = { .it_op = IT_OPEN };
977 struct ll_sb_info *sbi = ll_i2sbi(inode);
978 struct md_op_data *op_data;
979 struct ptlrpc_request *req = NULL;
980 struct lustre_handle old_open_handle = { 0 };
981 struct obd_client_handle *och = NULL;
/* A lease must be exactly read or exactly write. */
986 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
987 RETURN(ERR_PTR(-EINVAL));
/* The fd must already be open in a compatible, non-exec mode. */
990 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
991 RETURN(ERR_PTR(-EPERM));
993 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1000 RETURN(ERR_PTR(-ENOMEM));
1002 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1003 LUSTRE_OPC_ANY, NULL);
1004 if (IS_ERR(op_data))
1005 GOTO(out, rc = PTR_ERR(op_data));
1007 /* To tell the MDT this openhandle is from the same owner */
1008 op_data->op_open_handle = old_open_handle;
1010 it.it_flags = fmode | open_flags;
1011 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1012 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1013 &ll_md_blocking_lease_ast,
1014 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1015 * it can be cancelled which may mislead applications that the lease is
1017 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1018 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1019 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1020 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1021 ll_finish_md_op_data(op_data);
1022 ptlrpc_req_finished(req);
1024 GOTO(out_release_it, rc);
1026 if (it_disposition(&it, DISP_LOOKUP_NEG))
1027 GOTO(out_release_it, rc = -ENOENT);
1029 rc = it_open_error(DISP_OPEN_OPEN, &it);
1031 GOTO(out_release_it, rc);
1033 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1034 ll_och_fill(sbi->ll_md_exp, &it, och);
1036 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1037 GOTO(out_close, rc = -EOPNOTSUPP);
1039 /* already get lease, handle lease lock */
1040 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1041 if (it.it_lock_mode == 0 ||
1042 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1043 /* open lock must return for lease */
1044 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1045 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1047 GOTO(out_close, rc = -EPROTO);
1050 ll_intent_release(&it);
/* Error path: undo the lock/open we partially acquired. */
1054 /* Cancel open lock */
1055 if (it.it_lock_mode != 0) {
1056 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1058 it.it_lock_mode = 0;
1059 och->och_lease_handle.cookie = 0ULL;
1061 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1063 CERROR("%s: error closing file "DFID": %d\n",
1064 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1065 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1067 ll_intent_release(&it);
1071 RETURN(ERR_PTR(rc));
1075 * Check whether a layout swap can be done between two inodes.
1077 * \param[in] inode1 First inode to check
1078 * \param[in] inode2 Second inode to check
1080 * \retval 0 on success, layout swap can be performed between both inodes
1081 * \retval negative error code if requirements are not met
1083 static int ll_check_swap_layouts_validity(struct inode *inode1,
1084 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the same sb. */
1086 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1089 if (inode_permission(inode1, MAY_WRITE) ||
1090 inode_permission(inode2, MAY_WRITE))
1093 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically swaps
 * layouts between @inode and @inode2 as part of the close. */
1099 static int ll_swap_layouts_close(struct obd_client_handle *och,
1100 struct inode *inode, struct inode *inode2)
1102 const struct lu_fid *fid1 = ll_inode2fid(inode);
1103 const struct lu_fid *fid2;
1107 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1108 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1110 rc = ll_check_swap_layouts_validity(inode, inode2);
1112 GOTO(out_free_och, rc);
1114 /* We now know that inode2 is a lustre inode */
1115 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
1117 rc = lu_fid_cmp(fid1, fid2);
1119 GOTO(out_free_och, rc = -EINVAL);
1121 /* Close the file and {swap,merge} layouts between inode & inode2.
1122 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1123 * because we still need it to pack l_remote_handle to MDT. */
1124 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1127 och = NULL; /* freed in ll_close_inode_openhandle() */
1137 * Release lease and close the file.
1138 * It will check if the lease has ever broken.
1140 static int ll_lease_close_intent(struct obd_client_handle *och,
1141 struct inode *inode,
1142 bool *lease_broken, enum mds_op_bias bias,
1145 struct ldlm_lock *lock;
1146 bool cancelled = true;
/* A missing lock (handle no longer resolves) counts as a broken lease. */
1150 lock = ldlm_handle2lock(&och->och_lease_handle);
1152 lock_res_and_lock(lock);
1153 cancelled = ldlm_is_cancel(lock);
1154 unlock_res_and_lock(lock);
1155 LDLM_LOCK_PUT(lock);
1158 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1159 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1161 if (lease_broken != NULL)
1162 *lease_broken = cancelled;
/* Lease still intact and no intent to execute: cancel it ourselves. */
1164 if (!cancelled && !bias)
1165 ldlm_cli_cancel(&och->och_lease_handle, 0);
1167 if (cancelled) { /* no need to excute intent */
1172 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no bias and no intent data. */
1176 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1179 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1183 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1185 static int ll_lease_file_resync(struct obd_client_handle *och,
1186 struct inode *inode, unsigned long arg)
1188 struct ll_sb_info *sbi = ll_i2sbi(inode);
1189 struct md_op_data *op_data;
1190 struct ll_ioc_lease_id ioc;
1191 __u64 data_version_unused;
1195 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1196 LUSTRE_OPC_ANY, NULL);
1197 if (IS_ERR(op_data))
1198 RETURN(PTR_ERR(op_data));
/* @arg is a userspace pointer to a ll_ioc_lease_id. */
1200 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1204 /* before starting file resync, it's necessary to clean up page cache
1205 * in client memory, otherwise once the layout version is increased,
1206 * writing back cached data will be denied the OSTs. */
1207 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1211 op_data->op_lease_handle = och->och_lease_handle;
1212 op_data->op_mirror_id = ioc.lil_mirror_id;
1213 rc = md_file_resync(sbi->ll_md_exp, op_data);
1219 ll_finish_md_op_data(op_data);
/* Merge MDS-sourced inode attributes with the per-stripe attributes held
 * by the cl_object (OST view): take the newest of each timestamp and adopt
 * the object's size/blocks, all under the inode size lock. */
1223 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1225 struct ll_inode_info *lli = ll_i2info(inode);
1226 struct cl_object *obj = lli->lli_clob;
1227 struct cl_attr *attr = vvp_env_thread_attr(env);
1235 ll_inode_size_lock(inode);
1237 /* Merge timestamps the most recently obtained from MDS with
1238 * timestamps obtained from OSTs.
1240 * Do not overwrite atime of inode because it may be refreshed
1241 * by file_accessed() function. If the read was served by cache
1242 * data, there is no RPC to be sent so that atime may not be
1243 * transferred to OSTs at all. MDT only updates atime at close time
1244 * if it's at least 'mdd.*.atime_diff' older.
1245 * All in all, the atime in Lustre does not strictly comply with
1246 * POSIX. Solving this problem needs to send an RPC to MDT for each
1247 * read, this will hurt performance.
1249 if (inode->i_atime.tv_sec < lli->lli_atime ||
1250 lli->lli_update_atime) {
1251 inode->i_atime.tv_sec = lli->lli_atime;
1252 lli->lli_update_atime = 0;
1254 inode->i_mtime.tv_sec = lli->lli_mtime;
1255 inode->i_ctime.tv_sec = lli->lli_ctime;
1257 mtime = inode->i_mtime.tv_sec;
1258 atime = inode->i_atime.tv_sec;
1259 ctime = inode->i_ctime.tv_sec;
1261 cl_object_attr_lock(obj);
1262 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1265 rc = cl_object_attr_get(env, obj, attr);
1266 cl_object_attr_unlock(obj);
/* -ENODATA (e.g. no OST objects yet) is not an error for the merge. */
1269 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1271 if (atime < attr->cat_atime)
1272 atime = attr->cat_atime;
1274 if (ctime < attr->cat_ctime)
1275 ctime = attr->cat_ctime;
1277 if (mtime < attr->cat_mtime)
1278 mtime = attr->cat_mtime;
1280 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1281 PFID(&lli->lli_fid), attr->cat_size);
1283 i_size_write(inode, attr->cat_size);
1284 inode->i_blocks = attr->cat_blocks;
1286 inode->i_mtime.tv_sec = mtime;
1287 inode->i_atime.tv_sec = atime;
1288 inode->i_ctime.tv_sec = ctime;
1291 ll_inode_size_unlock(inode);
1297 * Set designated mirror for I/O.
1299 * So far only read, write, and truncated can support to issue I/O to
1300 * designated mirror.
/*
 * Set the designated FLR mirror on a cl_io, taken from the per-file-
 * descriptor state (fd_designated_mirror / fd_layout_version).
 *
 * For generic (non-resync) I/O the layout version is cleared; only when a
 * mirror has been designated is the fd's layout version carried into the io.
 * NOTE(review): "desiginated" typo lives in the runtime CDEBUG format
 * string and is left untouched here.
 */
1302 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1304 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1306 /* clear layout version for generic(non-resync) I/O in case it carries
1307 * stale layout version due to I/O restart */
1308 io->ci_layout_version = 0;
1310 /* FLR: disable non-delay for designated mirror I/O because obviously
1311 * only one mirror is available */
1312 if (fd->fd_designated_mirror > 0) {
1314 io->ci_designated_mirror = fd->fd_designated_mirror;
1315 io->ci_layout_version = fd->fd_layout_version;
1318 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1319 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether atime updates should be suppressed for this open file.
 *
 * Mirrors the checks done by the kernel's file_accessed()/touch_atime():
 * O_NOATIME on the file, S_NOATIME/IS_NOATIME on the inode, noatime or
 * read-only mount flags, and nodiratime (mount or superblock) for
 * directories. NOTE(review): the "return true/false" lines between the
 * conditions are missing from this extract.
 */
1322 static bool file_is_noatime(const struct file *file)
1324 const struct vfsmount *mnt = file->f_path.mnt;
1325 const struct inode *inode = file_inode((struct file *)file);
1327 /* Adapted from file_accessed() and touch_atime().*/
1328 if (file->f_flags & O_NOATIME)
1331 if (inode->i_flags & S_NOATIME)
1334 if (IS_NOATIME(inode))
1337 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1340 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1343 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write on 'file'.
 *
 * Sets nonblock/sync/append flags from f_flags, chooses the DLM lock
 * request mode (never for nolock files, mandatory for O_APPEND, maybe
 * otherwise), propagates noatime, enables non-delay I/O for reads only
 * (FLR), and applies any designated mirror via ll_io_set_mirror().
 */
1349 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1351 struct inode *inode = file_inode(file);
1352 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1354 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1355 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1357 if (iot == CIT_WRITE) {
1358 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1359 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1360 file->f_flags & O_DIRECT ||
1363 io->ci_obj = ll_i2info(inode)->lli_clob;
1364 io->ci_lockreq = CILR_MAYBE;
1365 if (ll_file_nolock(file)) {
1366 io->ci_lockreq = CILR_NEVER;
1367 io->ci_no_srvlock = 1;
1368 } else if (file->f_flags & O_APPEND) {
1369 io->ci_lockreq = CILR_MANDATORY;
1371 io->ci_noatime = file_is_noatime(file);
1373 /* FLR: only use non-delay I/O for read as there is only one
1374 * available mirror for write. */
1375 io->ci_ndelay = !(iot == CIT_WRITE);
1377 ll_io_set_mirror(io, file);
/*
 * Record file-heat samples for a completed read or write.
 *
 * Adds one access sample plus the byte count to the inode's heat instances
 * under lli_heat_lock, using the superblock's decay weight and period.
 * No-op when file heat is disabled globally or per-inode
 * (LU_HEAT_FLAG_OFF); I/O types other than read/write are ignored
 * (presumably via an early return in lines elided from this extract).
 */
1380 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1383 struct ll_inode_info *lli = ll_i2info(inode);
1384 struct ll_sb_info *sbi = ll_i2sbi(inode);
1385 enum obd_heat_type sample_type;
1386 enum obd_heat_type iobyte_type;
1387 __u64 now = ktime_get_real_seconds();
1389 if (!ll_sbi_has_file_heat(sbi) ||
1390 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1393 if (iot == CIT_READ) {
1394 sample_type = OBD_HEAT_READSAMPLE;
1395 iobyte_type = OBD_HEAT_READBYTE;
1396 } else if (iot == CIT_WRITE) {
1397 sample_type = OBD_HEAT_WRITESAMPLE;
1398 iobyte_type = OBD_HEAT_WRITEBYTE;
1403 spin_lock(&lli->lli_heat_lock);
1404 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1405 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1406 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1407 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1408 spin_unlock(&lli->lli_heat_lock);
/*
 * Common read/write engine for all llite I/O entry points.
 *
 * Builds and runs a cl_io loop for the request at *ppos/count, taking the
 * per-inode range lock for writes and for O_DIRECT reads (LU-6227), and
 * restarting the I/O when the CLIO layer asks for it (layout change, FLR
 * mirror retry). On success advances *ppos and accumulates the byte count
 * into 'result'; updates read/write stats, fd_write_failed state and file
 * heat before returning bytes transferred or a negative errno.
 *
 * NOTE(review): several declarations (io, rc, result) and the restart
 * loop's surrounding braces are missing from this extract; the visible
 * "retried"/"restarted" variables imply a while(restart) loop around the
 * cl_io section.
 */
1412 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1413 struct file *file, enum cl_io_type iot,
1414 loff_t *ppos, size_t count)
1416 struct vvp_io *vio = vvp_env_io(env);
1417 struct inode *inode = file_inode(file);
1418 struct ll_inode_info *lli = ll_i2info(inode);
1419 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1420 struct range_lock range;
1424 unsigned retried = 0;
1425 bool restarted = false;
1429 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1430 file_dentry(file)->d_name.name,
1431 iot == CIT_READ ? "read" : "write", *ppos, count);
1434 io = vvp_env_thread_io(env);
1435 ll_io_init(io, file, iot);
/* preserve the FLR non-delay retry count across restarts */
1436 io->ci_ndelay_tried = retried;
1438 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1439 bool range_locked = false;
/* O_APPEND writes may land anywhere: lock the whole file range */
1441 if (file->f_flags & O_APPEND)
1442 range_lock_init(&range, 0, LUSTRE_EOF);
1444 range_lock_init(&range, *ppos, *ppos + count - 1);
1446 vio->vui_fd = LUSTRE_FPRIVATE(file);
1447 vio->vui_io_subtype = args->via_io_subtype;
1449 switch (vio->vui_io_subtype) {
1451 vio->vui_iter = args->u.normal.via_iter;
1452 vio->vui_iocb = args->u.normal.via_iocb;
1453 /* Direct IO reads must also take range lock,
1454 * or multiple reads will try to work on the same pages
1455 * See LU-6227 for details. */
1456 if (((iot == CIT_WRITE) ||
1457 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1458 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1459 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1461 rc = range_lock(&lli->lli_write_tree, &range);
1465 range_locked = true;
1469 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1470 vio->u.splice.vui_flags = args->u.splice.via_flags;
1473 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1477 ll_cl_add(file, env, io, LCC_RW);
1478 rc = cl_io_loop(env, io);
1479 ll_cl_remove(file, env);
1482 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1484 range_unlock(&lli->lli_write_tree, &range);
1487 /* cl_io_rw_init() handled IO */
1491 if (io->ci_nob > 0) {
1492 result += io->ci_nob;
1493 count -= io->ci_nob;
1494 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1496 /* prepare IO restart */
1497 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1498 args->u.normal.via_iter = vio->vui_iter;
1501 cl_io_fini(env, io);
1504 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1505 file->f_path.dentry->d_name.name,
1506 iot, rc, result, io->ci_need_restart);
1508 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1510 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1511 file_dentry(file)->d_name.name,
1512 iot == CIT_READ ? "read" : "write",
1513 *ppos, count, result, rc);
1514 /* preserve the tried count for FLR */
1515 retried = io->ci_ndelay_tried;
1520 if (iot == CIT_READ) {
1522 ll_stats_ops_tally(ll_i2sbi(inode),
1523 LPROC_LL_READ_BYTES, result);
1524 } else if (iot == CIT_WRITE) {
1526 ll_stats_ops_tally(ll_i2sbi(inode),
1527 LPROC_LL_WRITE_BYTES, result);
1528 fd->fd_write_failed = false;
1529 } else if (result == 0 && rc == 0) {
1532 fd->fd_write_failed = true;
1534 fd->fd_write_failed = false;
1535 } else if (rc != -ERESTARTSYS) {
1536 fd->fd_write_failed = true;
1540 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1542 ll_heat_add(inode, iot, result);
1544 RETURN(result > 0 ? result : rc);
1548 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1549 * especially for small I/O.
1551 * To serve a read request, CLIO has to create and initialize a cl_io and
1552 * then request DLM lock. This has turned out to have significant overhead
1553 * and affects the performance of small I/O dramatically.
1555 * It's not necessary to create a cl_io for each I/O. Under the help of read
1556 * ahead, most of the pages being read are already in memory cache and we can
1557 * read those pages directly because if the pages exist, the corresponding DLM
1558 * lock must exist so that page content must be valid.
1560 * In fast read implementation, the llite speculatively finds and reads pages
1561 * in memory cache. There are three scenarios for fast read:
1562 * - If the page exists and is uptodate, kernel VM will provide the data and
1563 * CLIO won't be intervened;
1564 * - If the page was brought into memory by read ahead, it will be exported
1565 * and read ahead parameters will be updated;
1566 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1567 * it will go back and invoke normal read, i.e., a cl_io will be created
1568 * and DLM lock will be requested.
1570 * POSIX compliance: posix standard states that read is intended to be atomic.
1571 * Lustre read implementation is in line with Linux kernel read implementation
1572 * and neither of them complies with POSIX standard in this matter. Fast read
1573 * doesn't make the situation worse on single node but it may interleave write
1574 * results from multiple nodes due to short read handling in ll_file_aio_read().
1576 * \param env - lu_env
1577 * \param iocb - kiocb from kernel
1578 * \param iter - user space buffers where the data will be copied
1580 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Attempt a "fast read" directly from the page cache, bypassing CLIO.
 *
 * Only used when the fast_read mount feature is enabled and the file is
 * not O_DIRECT. Delegates to generic_file_read_iter(); a -ENODATA return
 * (set by ll_readpage() when the first page is not cached) means the
 * caller must fall back to the normal CLIO read path. On success, file
 * heat and read-byte stats are updated.
 */
1583 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1587 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1590 /* NB: we can't do direct IO for fast read because it will need a lock
1591 * to make IO engine happy. */
1592 if (iocb->ki_filp->f_flags & O_DIRECT)
1595 result = generic_file_read_iter(iocb, iter);
1597 /* If the first page is not in cache, generic_file_aio_read() will be
1598 * returned with -ENODATA.
1599 * See corresponding code in ll_readpage(). */
1600 if (result == -ENODATA)
1604 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1605 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1606 LPROC_LL_READ_BYTES, result);
1613 * Read from a file (through the page cache).
/*
 * iov_iter-based read entry point (->read_iter).
 *
 * Tries the fast-read path first; if it errored or consumed the whole
 * iterator, that result stands. Otherwise falls back to the full CLIO
 * path via ll_file_io_generic() for the remaining bytes, combining the
 * two results (combination lines are elided in this extract).
 */
1615 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1618 struct vvp_io_args *args;
1623 ll_ras_enter(iocb->ki_filp);
1625 result = ll_do_fast_read(iocb, to);
1626 if (result < 0 || iov_iter_count(to) == 0)
1629 env = cl_env_get(&refcheck);
1631 return PTR_ERR(env);
1633 args = ll_env_args(env, IO_NORMAL);
1634 args->u.normal.via_iter = to;
1635 args->u.normal.via_iocb = iocb;
1637 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1638 &iocb->ki_pos, iov_iter_count(to));
1641 else if (result == 0)
1644 cl_env_put(env, &refcheck);
1650 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1651 * If a page is already in the page cache and dirty (and some other things -
1652 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1653 * write to it without doing a full I/O, because Lustre already knows about it
1654 * and will write it out. This saves a lot of processing time.
1656 * All writes here are within one page, so exclusion is handled by the page
1657 * lock on the vm page. We do not do tiny writes for writes which touch
1658 * multiple pages because it's very unlikely multiple sequential pages are
1659 * already dirty.
1661 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1662 * and are unlikely to be to already dirty pages.
1664 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * Attempt a "tiny write": a sub-page write served straight from the page
 * cache via __generic_file_write_iter(), skipping the full CLIO path.
 *
 * Restricted to writes that fit entirely inside one page; exclusion is
 * provided by the vm page lock. -ENODATA from ll_tiny_write_begin means
 * the page was not already dirty and the caller must do a normal write.
 * On success updates file heat, write-byte stats and marks the inode
 * data-modified.
 * NOTE(review): CDEBUG below prints ssize_t 'result' with %zu — %zd would
 * be the matching specifier; harmless for non-negative values.
 */
1666 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1668 ssize_t count = iov_iter_count(iter);
1669 struct file *file = iocb->ki_filp;
1670 struct inode *inode = file_inode(file);
/* SUID/SGID-stripping paths require i_mutex unless the inode is NOSEC */
1671 bool lock_inode = !IS_NOSEC(inode);
1676 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1677 * of function for why.
1679 if (count >= PAGE_SIZE ||
1680 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1683 if (unlikely(lock_inode))
1685 result = __generic_file_write_iter(iocb, iter);
1687 if (unlikely(lock_inode))
1688 inode_unlock(inode);
1690 /* If the page is not already dirty, ll_tiny_write_begin returns
1691 * -ENODATA. We continue on to normal write.
1693 if (result == -ENODATA)
1697 ll_heat_add(inode, CIT_WRITE, result);
1698 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1700 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1703 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1709 * Write to a file (through the page cache).
/*
 * iov_iter-based write entry point (->write_iter).
 *
 * Tries the tiny-write fast path first (skipped for O_DIRECT, O_SYNC and
 * O_APPEND, which it cannot honor), then runs the normal CLIO write for
 * whatever the tiny write did not consume. Byte counts from both paths
 * are combined on success; on normal-path error, tiny-write progress is
 * still reported if any bytes were written.
 */
1711 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1713 struct vvp_io_args *args;
1715 ssize_t rc_tiny = 0, rc_normal;
1720 /* NB: we can't do direct IO for tiny writes because they use the page
1721 * cache, we can't do sync writes because tiny writes can't flush
1722 * pages, and we can't do append writes because we can't guarantee the
1723 * required DLM locks are held to protect file size.
1725 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1726 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1727 rc_tiny = ll_do_tiny_write(iocb, from);
1729 /* In case of error, go on and try normal write - Only stop if tiny
1730 * write completed I/O.
1732 if (iov_iter_count(from) == 0)
1733 GOTO(out, rc_normal = rc_tiny);
1735 env = cl_env_get(&refcheck);
1737 return PTR_ERR(env);
1739 args = ll_env_args(env, IO_NORMAL);
1740 args->u.normal.via_iter = from;
1741 args->u.normal.via_iocb = iocb;
1743 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1744 &iocb->ki_pos, iov_iter_count(from));
1746 /* On success, combine bytes written. */
1747 if (rc_tiny >= 0 && rc_normal > 0)
1748 rc_normal += rc_tiny;
1749 /* On error, only return error from normal write if tiny write did not
1750 * write any bytes. Otherwise return bytes written by tiny write.
1752 else if (rc_tiny > 0)
1753 rc_normal = rc_tiny;
1755 cl_env_put(env, &refcheck);
1760 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1762 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute its cumulative byte count
 * (copy of the kernel's __generic_file_aio_write_nolock helper, used
 * only when the kernel lacks read_iter/write_iter file operations).
 *
 * Rejects any segment with a negative length or a cumulative length that
 * wraps negative; on an inaccessible segment, *nr_segs is presumably
 * truncated (the lines between the visible statements are elided here).
 */
1764 static int ll_file_get_iov_count(const struct iovec *iov,
1765 unsigned long *nr_segs, size_t *count)
1770 for (seg = 0; seg < *nr_segs; seg++) {
1771 const struct iovec *iv = &iov[seg];
1774 * If any segment has a negative length, or the cumulative
1775 * length ever wraps negative then return -EINVAL.
1778 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1780 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1785 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry point: validates the iovec array, wraps it in an
 * iov_iter (handling both iov_iter_init() calling conventions) and
 * delegates to ll_file_read_iter().
 */
1792 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1793 unsigned long nr_segs, loff_t pos)
1800 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1804 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1805 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1806 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1807 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1808 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1810 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy ->read entry point: build a single-segment iovec and a
 * synchronous kiocb, then delegate to ll_file_aio_read(), propagating the
 * updated file position back through *ppos.
 *
 * Fix: the HAVE_KI_NBYTES branch assigned "kiocb.i_nbytes", which is not
 * a field of struct kiocb — the field is "ki_nbytes" (compare the
 * identical branch in ll_file_write below). Under HAVE_KI_NBYTES kernels
 * this would not compile.
 */
1815 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1818 struct iovec iov = { .iov_base = buf, .iov_len = count };
1823 init_sync_kiocb(&kiocb, file);
1824 kiocb.ki_pos = *ppos;
1825 #ifdef HAVE_KIOCB_KI_LEFT
1826 kiocb.ki_left = count;
1827 #elif defined(HAVE_KI_NBYTES)
1828 kiocb.ki_nbytes = count;
1831 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1832 *ppos = kiocb.ki_pos;
1838 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry point: validates the iovec array, wraps it in an
 * iov_iter (handling both iov_iter_init() calling conventions) and
 * delegates to ll_file_write_iter().
 */
1841 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1842 unsigned long nr_segs, loff_t pos)
1844 struct iov_iter from;
1849 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1853 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1854 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1855 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1856 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1857 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1859 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy ->write entry point: build a single-segment iovec and a
 * synchronous kiocb, delegate to ll_file_aio_write(), and propagate the
 * updated file position back through *ppos.
 */
1864 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1865 size_t count, loff_t *ppos)
1867 struct iovec iov = { .iov_base = (void __user *)buf,
1874 init_sync_kiocb(&kiocb, file);
1875 kiocb.ki_pos = *ppos;
1876 #ifdef HAVE_KIOCB_KI_LEFT
1877 kiocb.ki_left = count;
1878 #elif defined(HAVE_KI_NBYTES)
1879 kiocb.ki_nbytes = count;
1882 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1883 *ppos = kiocb.ki_pos;
1887 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1890 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: send file content from the page cache into a
 * pipe by running a CIT_READ through ll_file_io_generic() with the
 * IO_SPLICE argument subtype.
 */
1892 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1893 struct pipe_inode_info *pipe, size_t count,
1897 struct vvp_io_args *args;
1902 ll_ras_enter(in_file);
1904 env = cl_env_get(&refcheck);
1906 RETURN(PTR_ERR(env));
1908 args = ll_env_args(env, IO_SPLICE);
1909 args->u.splice.via_pipe = pipe;
1910 args->u.splice.via_flags = flags;
1912 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1913 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file by performing an intent open-by-FID with
 * the user-supplied lov_user_md attached, then releasing the open handle.
 * The inode size lock is held across the intent to serialize with
 * attribute merging.
 */
1917 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1918 __u64 flags, struct lov_user_md *lum, int lum_size)
1920 struct lookup_intent oit = {
1922 .it_flags = flags | MDS_OPEN_BY_FID,
1927 ll_inode_size_lock(inode);
1928 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1930 GOTO(out_unlock, rc);
1932 ll_release_openhandle(dentry, &oit);
1935 ll_inode_size_unlock(inode);
1936 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping) for 'filename' (a child of 'inode') from the
 * MDS via md_getattr_name().
 *
 * On success *lmmp points into the reply buffer (the caller must keep
 * *request alive and eventually release it) and *lmm_size is set.
 * Returns -ENODATA when no striping EA exists. The EA arrives in
 * little-endian wire order and is swabbed to host order on big-endian
 * hosts, for every supported magic (V1, V3, COMP_V1, FOREIGN).
 */
1941 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1942 struct lov_mds_md **lmmp, int *lmm_size,
1943 struct ptlrpc_request **request)
1945 struct ll_sb_info *sbi = ll_i2sbi(inode);
1946 struct mdt_body *body;
1947 struct lov_mds_md *lmm = NULL;
1948 struct ptlrpc_request *req = NULL;
1949 struct md_op_data *op_data;
1952 rc = ll_get_default_mdsize(sbi, &lmmsize);
1956 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1957 strlen(filename), lmmsize,
1958 LUSTRE_OPC_ANY, NULL);
1959 if (IS_ERR(op_data))
1960 RETURN(PTR_ERR(op_data));
1962 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1963 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1964 ll_finish_md_op_data(op_data);
1966 CDEBUG(D_INFO, "md_getattr_name failed "
1967 "on %s: rc %d\n", filename, rc);
1971 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1972 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1974 lmmsize = body->mbo_eadatasize;
1976 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1978 GOTO(out, rc = -ENODATA);
1981 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1982 LASSERT(lmm != NULL);
1984 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1985 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1986 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
1987 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
1988 GOTO(out, rc = -EPROTO);
1991 * This is coming from the MDS, so is probably in
1992 * little endian. We convert it to host endian before
1993 * passing it to userspace.
/* no-op on little-endian hosts: wire order already matches host order */
1995 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1998 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1999 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2000 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2001 if (le32_to_cpu(lmm->lmm_pattern) &
2002 LOV_PATTERN_F_RELEASED)
2006 /* if function called for directory - we should
2007 * avoid swabbing nonexistent lsm objects */
2008 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2009 lustre_swab_lov_user_md_v1(
2010 (struct lov_user_md_v1 *)lmm);
2011 if (S_ISREG(body->mbo_mode))
2012 lustre_swab_lov_user_md_objects(
2013 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2015 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2016 lustre_swab_lov_user_md_v3(
2017 (struct lov_user_md_v3 *)lmm);
2018 if (S_ISREG(body->mbo_mode))
2019 lustre_swab_lov_user_md_objects(
2020 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2022 } else if (lmm->lmm_magic ==
2023 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2024 lustre_swab_lov_comp_md_v1(
2025 (struct lov_comp_md_v1 *)lmm);
2026 } else if (lmm->lmm_magic ==
2027 cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2028 struct lov_foreign_md *lfm;
2030 lfm = (struct lov_foreign_md *)lmm;
2031 __swab32s(&lfm->lfm_magic);
2032 __swab32s(&lfm->lfm_length);
2033 __swab32s(&lfm->lfm_type);
2034 __swab32s(&lfm->lfm_flags);
2040 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a raw lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS. Restricted to CAP_SYS_ADMIN since it specifies
 * pre-existing objects.
 */
2045 static int ll_lov_setea(struct inode *inode, struct file *file,
2048 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2049 struct lov_user_md *lump;
2050 int lum_size = sizeof(struct lov_user_md) +
2051 sizeof(struct lov_user_ost_data);
2055 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2058 OBD_ALLOC_LARGE(lump, lum_size);
2062 if (copy_from_user(lump, arg, lum_size))
2063 GOTO(out_lump, rc = -EFAULT);
2065 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* drop O_LOV_DELAY_CREATE now that the layout has been set */
2067 cl_lov_delay_create_clear(&file->f_flags);
2070 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer 'lum'
 * (at most 'size' bytes) via cl_object_getstripe().
 */
2074 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2081 env = cl_env_get(&refcheck);
2083 RETURN(PTR_ERR(env));
2085 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2086 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply it via
 * ll_lov_setstripe_ea_info(), refresh the layout generation, and echo the
 * resulting striping back to userspace. On an unsupported-layout reply,
 * lmm_stripe_count is zeroed in the user buffer (put_user below).
 */
2090 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2093 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2094 struct lov_user_md *klum;
2096 __u64 flags = FMODE_WRITE;
2099 rc = ll_copy_user_md(lum, &klum);
2104 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2109 rc = put_user(0, &lum->lmm_stripe_count);
2113 rc = ll_layout_refresh(inode, &gen);
2117 rc = ll_file_getstripe(inode, arg, lum_size);
2119 cl_lov_delay_create_clear(&file->f_flags);
2122 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-based) DLM lock on the file.
 *
 * Rejects gid 0 and nolock files; refuses a second group lock on the same
 * fd. For PFL layouts all OST objects are instantiated first (via a
 * LAYOUT_INTENT_WRITE over the whole file) because a group lock must cover
 * every object and PFL can otherwise add objects mid-I/O. The race between
 * two threads acquiring the lock concurrently is resolved by re-checking
 * fd_flags under lli_lock after the (sleeping) cl_get_grouplock() call.
 */
2127 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2129 struct ll_inode_info *lli = ll_i2info(inode);
2130 struct cl_object *obj = lli->lli_clob;
2131 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2132 struct ll_grouplock grouplock;
2137 CWARN("group id for group lock must not be 0\n");
2141 if (ll_file_nolock(file))
2142 RETURN(-EOPNOTSUPP);
2144 spin_lock(&lli->lli_lock);
2145 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2146 CWARN("group lock already existed with gid %lu\n",
2147 fd->fd_grouplock.lg_gid);
2148 spin_unlock(&lli->lli_lock);
2151 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2152 spin_unlock(&lli->lli_lock);
2155 * XXX: group lock needs to protect all OST objects while PFL
2156 * can add new OST objects during the IO, so we'd instantiate
2157 * all OST objects before getting its group lock.
2162 struct cl_layout cl = {
2163 .cl_is_composite = false,
2165 struct lu_extent ext = {
2167 .e_end = OBD_OBJECT_EOF,
2170 env = cl_env_get(&refcheck);
2172 RETURN(PTR_ERR(env));
2174 rc = cl_object_layout_get(env, obj, &cl);
2175 if (!rc && cl.cl_is_composite)
2176 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2179 cl_env_put(env, &refcheck);
2184 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2185 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2189 spin_lock(&lli->lli_lock);
2190 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2191 spin_unlock(&lli->lli_lock);
2192 CERROR("another thread just won the race\n");
2193 cl_put_grouplock(&grouplock);
2197 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2198 fd->fd_grouplock = grouplock;
2199 spin_unlock(&lli->lli_lock);
2201 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this fd.
 *
 * Fails when no group lock is held or when 'arg' does not match the held
 * gid. The fd's grouplock state is cleared under lli_lock before the DLM
 * lock itself is dropped outside the spinlock.
 */
2205 static int ll_put_grouplock(struct inode *inode, struct file *file,
2208 struct ll_inode_info *lli = ll_i2info(inode);
2209 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2210 struct ll_grouplock grouplock;
2213 spin_lock(&lli->lli_lock);
2214 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2215 spin_unlock(&lli->lli_lock);
2216 CWARN("no group lock held\n");
2220 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2222 if (fd->fd_grouplock.lg_gid != arg) {
2223 CWARN("group lock %lu doesn't match current id %lu\n",
2224 arg, fd->fd_grouplock.lg_gid);
2225 spin_unlock(&lli->lli_lock);
2229 grouplock = fd->fd_grouplock;
2230 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2231 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2232 spin_unlock(&lli->lli_lock);
2234 cl_put_grouplock(&grouplock);
2235 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2240 * Close inode open handle
2242 * \param dentry [in] dentry which contains the inode
2243 * \param it [in,out] intent which contains open info and result
2246 * \retval <0 failure
/*
 * Close the MDS open handle carried by an intent.
 *
 * No-op for the filesystem root and for intents with no DISP_OPEN_OPEN
 * disposition. Otherwise fills an obd_client_handle from the intent and
 * closes it via ll_close_inode_openhandle(); the intent's enqueue-open
 * reference (request) is dropped in place of a matching ll_file_open().
 */
2248 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2250 struct inode *inode = dentry->d_inode;
2251 struct obd_client_handle *och;
2257 /* Root ? Do nothing. */
2258 if (dentry->d_inode->i_sb->s_root == dentry)
2261 /* No open handle to close? Move away */
2262 if (!it_disposition(it, DISP_OPEN_OPEN))
2265 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2267 OBD_ALLOC(och, sizeof(*och));
2269 GOTO(out, rc = -ENOMEM);
2271 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2273 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2275 /* this one is in place of ll_file_open */
2276 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2277 ptlrpc_req_finished(it->it_request);
2278 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2284 * Get size for inode for which FIEMAP mapping is requested.
2285 * Make the FIEMAP get_info call and returns the result.
2286 * \param fiemap kernel buffer to hold extens
2287 * \param num_bytes kernel buffer size
/*
 * Perform a FIEMAP extent-mapping call against the file's OST objects.
 *
 * Rejects unsupported fiemap flags (writing the compat mask back into
 * fm_flags so the caller can report it), honors FIEMAP_FLAG_SYNC by
 * flushing dirty pages first, glimpses the size when the cached size is 0,
 * and short-circuits with zero extents for empty files.
 */
2289 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2295 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2298 /* Checks for fiemap flags */
2299 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2300 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2304 /* Check for FIEMAP_FLAG_SYNC */
2305 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2306 rc = filemap_fdatawrite(inode->i_mapping);
2311 env = cl_env_get(&refcheck);
2313 RETURN(PTR_ERR(env));
2315 if (i_size_read(inode) == 0) {
2316 rc = ll_glimpse_size(inode);
2321 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2322 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2323 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2325 /* If filesize is 0, then there would be no objects for mapping */
2326 if (fmkey.lfik_oa.o_size == 0) {
2327 fiemap->fm_mapped_extents = 0;
2331 fmkey.lfik_fiemap = *fiemap;
2333 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2334 &fmkey, fiemap, &num_bytes);
2336 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH handler: resolve a FID to a path via OBD_IOC_FID2PATH
 * on the MDC export.
 *
 * Requires CAP_DAC_READ_SEARCH unless the user_fid2path mount flag is set.
 * The user-supplied pathlen is bounded by PATH_MAX before sizing the
 * kernel buffer; the root FID is appended after gfout so fileset-aware
 * servers can resolve relative to the mount's fileset.
 */
2340 int ll_fid2path(struct inode *inode, void __user *arg)
2342 struct obd_export *exp = ll_i2mdexp(inode);
2343 const struct getinfo_fid2path __user *gfin = arg;
2345 struct getinfo_fid2path *gfout;
2351 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2352 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2355 /* Only need to get the buflen */
2356 if (get_user(pathlen, &gfin->gf_pathlen))
2359 if (pathlen > PATH_MAX)
2362 outsize = sizeof(*gfout) + pathlen;
2363 OBD_ALLOC(gfout, outsize);
2367 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2368 GOTO(gf_free, rc = -EFAULT);
2369 /* append root FID after gfout to let MDT know the root FID so that it
2370 * can lookup the correct path, this is mainly for fileset.
2371 * old server without fileset mount support will ignore this. */
2372 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2374 /* Call mdc_iocontrol */
2375 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2379 if (copy_to_user(arg, gfout, outsize))
2383 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to obtain the file's data version and
 * layout version, storing them into 'ioc'.
 *
 * Defaults are version 0 / layout UINT_MAX when no file object has been
 * initialized. The io is retried when the CLIO layer reports
 * ci_need_restart (the loop construct is elided in this extract).
 */
2388 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2390 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2398 ioc->idv_version = 0;
2399 ioc->idv_layout_version = UINT_MAX;
2401 /* If no file object initialized, we consider its version is 0. */
2405 env = cl_env_get(&refcheck);
2407 RETURN(PTR_ERR(env));
2409 io = vvp_env_thread_io(env);
2411 io->u.ci_data_version.dv_data_version = 0;
2412 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2413 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2416 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2417 result = cl_io_loop(env, io);
2419 result = io->ci_result;
2421 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2422 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2424 cl_io_fini(env, io);
2426 if (unlikely(io->ci_need_restart))
2429 cl_env_put(env, &refcheck);
2435 * Read the data_version for inode.
2437 * This value is computed using stripe object version on OST.
2438 * Version is computed using server side locking.
2440 * @param flags if do sync on the OST side;
2442 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2443 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the data version for an inode (computed from stripe object versions
 * on the OSTs, under server-side locking).
 *
 * \param flags	OST-side sync behavior: LL_DV_RD_FLUSH flushes dirty pages
 *		(LCK_PR on OSTs), LL_DV_WR_FLUSH drops all cached pages
 *		(LCK_PW on OSTs).
 */
2445 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2447 struct ioc_data_version ioc = { .idv_flags = flags };
2450 rc = ll_ioc_data_version(inode, &ioc);
2452 *data_version = ioc.idv_version;
2458 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for 'inode': take a write lease with
 * MDS_OPEN_RELEASE, flush and grab the latest data version, merge
 * attributes (so the MDS records correct size/times), then close the
 * handle with the MDS_HSM_RELEASE bias. The lease lock handle is released
 * inside mdc_hsm_release_pack(), which still needs it to pack
 * l_remote_handle. On any failure the lease is closed on the way out.
 */
2460 int ll_hsm_release(struct inode *inode)
2463 struct obd_client_handle *och = NULL;
2464 __u64 data_version = 0;
2469 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2470 ll_i2sbi(inode)->ll_fsname,
2471 PFID(&ll_i2info(inode)->lli_fid));
2473 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2475 GOTO(out, rc = PTR_ERR(och));
2477 /* Grab latest data_version and [am]time values */
2478 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2482 env = cl_env_get(&refcheck);
2484 GOTO(out, rc = PTR_ERR(env));
2486 rc = ll_merge_attr(env, inode);
2487 cl_env_put(env, &refcheck);
2489 /* If error happen, we have the wrong size for a file.
2495 /* Release the file.
2496 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2497 * we still need it to pack l_remote_handle to MDT. */
2498 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2504 if (och != NULL && !IS_ERR(och)) /* close the file */
2505 ll_lease_close(och, inode, NULL);
/*
 * Working state for ll_swap_layouts(): the two inodes being swapped plus
 * (in lines elided from this extract) their data versions and check flags,
 * kept swappable so the pair can be ordered by FID.
 */
2510 struct ll_swap_stack {
2513 struct inode *inode1;
2514 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically swap the layouts of two
 * files on the MDT.
 *
 * The two inodes are ordered by FID (swapping all per-file state along
 * with them) to get a canonical locking order; same-FID is a no-op.
 * When a group id is supplied, group locks are taken on both files to
 * flush dirty cache, and the caller-provided data versions are
 * re-checked (-EAGAIN if either changed) before issuing the swap RPC.
 */
2519 static int ll_swap_layouts(struct file *file1, struct file *file2,
2520 struct lustre_swap_layouts *lsl)
2522 struct mdc_swap_layouts msl;
2523 struct md_op_data *op_data;
2526 struct ll_swap_stack *llss = NULL;
2529 OBD_ALLOC_PTR(llss);
2533 llss->inode1 = file_inode(file1);
2534 llss->inode2 = file_inode(file2);
2536 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2540 /* we use 2 bool because it is easier to swap than 2 bits */
2541 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2542 llss->check_dv1 = true;
2544 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2545 llss->check_dv2 = true;
2547 /* we cannot use lsl->sl_dvX directly because we may swap them */
2548 llss->dv1 = lsl->sl_dv1;
2549 llss->dv2 = lsl->sl_dv2;
2551 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2552 if (rc == 0) /* same file, done! */
2555 if (rc < 0) { /* sequentialize it */
2556 swap(llss->inode1, llss->inode2);
2558 swap(llss->dv1, llss->dv2);
2559 swap(llss->check_dv1, llss->check_dv2);
2563 if (gid != 0) { /* application asks to flush dirty cache */
2564 rc = ll_get_grouplock(llss->inode1, file1, gid);
2568 rc = ll_get_grouplock(llss->inode2, file2, gid);
2570 ll_put_grouplock(llss->inode1, file1, gid);
2575 /* ultimate check, before swapping the layouts we check if
2576 * dataversion has changed (if requested) */
2577 if (llss->check_dv1) {
2578 rc = ll_data_version(llss->inode1, &dv, 0);
2581 if (dv != llss->dv1)
2582 GOTO(putgl, rc = -EAGAIN);
2585 if (llss->check_dv2) {
2586 rc = ll_data_version(llss->inode2, &dv, 0);
2589 if (dv != llss->dv2)
2590 GOTO(putgl, rc = -EAGAIN);
2593 /* struct md_op_data is used to send the swap args to the mdt
2594 * only flags is missing, so we use struct mdc_swap_layouts
2595 * through the md_op_data->op_data */
2596 /* flags from user space have to be converted before they are sent to
2597 * the server; no flag is sent today, they are only used on the client */
2600 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2601 0, LUSTRE_OPC_ANY, &msl);
2602 if (IS_ERR(op_data))
2603 GOTO(free, rc = PTR_ERR(op_data));
2605 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2606 sizeof(*op_data), op_data, NULL);
2607 ll_finish_md_op_data(op_data);
2614 ll_put_grouplock(llss->inode2, file2, gid);
2615 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode through the MD export.
 * Rejects masks outside HSM_FLAGS_MASK, restricts bits outside
 * HSM_USER_MASK to CAP_SYS_ADMIN, and range-checks the archive id
 * when the server does not support archive-id arrays. */
2625 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2627 struct obd_export *exp = ll_i2mdexp(inode);
2628 struct md_op_data *op_data;
2632 /* Detect out-of-range masks */
2633 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2636 /* Non-root users are forbidden to set or clear flags which are
2637 * NOT defined in HSM_USER_MASK. */
2638 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2639 !cfs_capable(CFS_CAP_SYS_ADMIN))
2642 if (!exp_connect_archive_id_array(exp)) {
2643 /* Detect out-of-range archive id */
2644 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2645 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2649 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2650 LUSTRE_OPC_ANY, hss);
2651 if (IS_ERR(op_data))
2652 RETURN(PTR_ERR(op_data));
2654 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2657 ll_finish_md_op_data(op_data);
/* Import an HSM-archived object as a released Lustre file: first mark
 * the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED with the requested archive
 * id, then apply the user-supplied mode/uid/gid/size/times via
 * ll_setattr_raw(). Only valid on regular files. */
2662 static int ll_hsm_import(struct inode *inode, struct file *file,
2663 struct hsm_user_import *hui)
2665 struct hsm_state_set *hss = NULL;
2666 struct iattr *attr = NULL;
2670 if (!S_ISREG(inode->i_mode))
2676 GOTO(out, rc = -ENOMEM);
2678 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2679 hss->hss_archive_id = hui->hui_archive_id;
2680 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2681 rc = ll_hsm_state_set(inode, hss);
2685 OBD_ALLOC_PTR(attr);
2687 GOTO(out, rc = -ENOMEM);
/* Rebuild attributes from the import request; force a regular-file
 * mode and set both timestamps explicitly (ATTR_*_SET below). */
2689 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2690 attr->ia_mode |= S_IFREG;
2691 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2692 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2693 attr->ia_size = hui->hui_size;
2694 attr->ia_mtime.tv_sec = hui->hui_mtime;
2695 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2696 attr->ia_atime.tv_sec = hui->hui_atime;
2697 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2699 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2700 ATTR_UID | ATTR_GID |
2701 ATTR_MTIME | ATTR_MTIME_SET |
2702 ATTR_ATIME | ATTR_ATIME_SET;
2706 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2710 inode_unlock(inode);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bits reported to user space. */
2722 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2724 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2725 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime (which plain
 * utimes() cannot set) on a regular file. Requires CAP_SYS_ADMIN;
 * applied through ll_setattr_raw() with OP_XVALID_CTIME_SET. */
2728 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2730 struct inode *inode = file_inode(file);
2732 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2733 ATTR_MTIME | ATTR_MTIME_SET |
2736 .tv_sec = lfu->lfu_atime_sec,
2737 .tv_nsec = lfu->lfu_atime_nsec,
2740 .tv_sec = lfu->lfu_mtime_sec,
2741 .tv_nsec = lfu->lfu_mtime_nsec,
2744 .tv_sec = lfu->lfu_ctime_sec,
2745 .tv_nsec = lfu->lfu_ctime_nsec,
2751 if (!capable(CAP_SYS_ADMIN))
2754 if (!S_ISREG(inode->i_mode))
2758 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2760 inode_unlock(inode);
/* Translate a userspace lockahead mode (MODE_*_USER) to the kernel
 * cl_lock_mode enum. */
2765 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2768 case MODE_READ_USER:
2770 case MODE_WRITE_USER:
/* Printable names for userspace lock modes; used in debug messages below. */
2777 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2779 /* Used to allow the upper layers of the client to request an LDLM lock
2780 * without doing an actual read or write.
2782 * Used for ladvise lockahead to manually request specific locks.
2784 * \param[in] file file this ladvise lock request is on
2785 * \param[in] ladvise ladvise struct describing this lock request
2787 * \retval 0 success, no detailed result available (sync requests
2788 * and requests sent to the server [not handled locally]
2789 * cannot return detailed results)
2790 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2791 * see definitions for details.
2792 * \retval negative negative errno on error
2794 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2796 struct lu_env *env = NULL;
2797 struct cl_io *io = NULL;
2798 struct cl_lock *lock = NULL;
2799 struct cl_lock_descr *descr = NULL;
2800 struct dentry *dentry = file->f_path.dentry;
2801 struct inode *inode = dentry->d_inode;
2802 enum cl_lock_mode cl_mode;
2803 off_t start = ladvise->lla_start;
2804 off_t end = ladvise->lla_end;
2810 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2811 "start=%llu, end=%llu\n", dentry->d_name.len,
2812 dentry->d_name.name, dentry->d_inode,
2813 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2816 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2818 GOTO(out, result = cl_mode);
2820 /* Get IO environment */
2821 result = cl_io_get(inode, &env, &io, &refcheck);
2825 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2828 * nothing to do for this io. This currently happens when
2829 * stripe sub-object's are not yet created.
2831 result = io->ci_result;
2832 } else if (result == 0) {
2833 lock = vvp_env_lock(env);
2834 descr = &lock->cll_descr;
/* Describe the requested extent lock on the clio object. */
2836 descr->cld_obj = io->ci_obj;
2837 /* Convert byte offsets to pages */
2838 descr->cld_start = cl_index(io->ci_obj, start);
2839 descr->cld_end = cl_index(io->ci_obj, end);
2840 descr->cld_mode = cl_mode;
2841 /* CEF_MUST is used because we do not want to convert a
2842 * lockahead request to a lockless lock */
2843 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2846 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2847 descr->cld_enq_flags |= CEF_SPECULATIVE;
2849 result = cl_lock_request(env, io, lock);
2851 /* On success, we need to release the lock */
2853 cl_lock_release(env, lock);
2855 cl_io_fini(env, io);
2856 cl_env_put(env, &refcheck);
2858 /* -ECANCELED indicates a matching lock with a different extent
2859 * was already present, and -EEXIST indicates a matching lock
2860 * on exactly the same extent was already present.
2861 * We convert them to positive values for userspace to make
2862 * recognizing true errors easier.
2863 * Note we can only return these detailed results on async requests,
2864 * as sync requests look the same as i/o requests for locking. */
2865 if (result == -ECANCELED)
2866 result = LLA_RESULT_DIFFERENT;
2867 else if (result == -EEXIST)
2868 result = LLA_RESULT_SAME;
/* Printable names for ladvise advices; used in sanity-check messages. */
2873 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate a single ladvise entry before it is acted on: checks the
 * advice value, the per-advice flags, the lockahead mode, and (for all
 * advices except LOCKNOEXPAND) that the byte range is non-empty.
 * Emits a D_VFSTRACE message for each rejection. */
2875 static int ll_ladvise_sanity(struct inode *inode,
2876 struct llapi_lu_ladvise *ladvise)
2878 struct ll_sb_info *sbi = ll_i2sbi(inode);
2879 enum lu_ladvise_type advice = ladvise->lla_advice;
2880 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2881 * be in the first 32 bits of enum ladvise_flags */
2882 __u32 flags = ladvise->lla_peradvice_flags;
2883 /* 3 lines at 80 characters per line, should be plenty */
2886 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2888 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2889 "last supported advice is %s (value '%d'): rc = %d\n",
2890 sbi->ll_fsname, advice,
2891 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2895 /* Per-advice checks */
2897 case LU_LADVISE_LOCKNOEXPAND:
2898 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2900 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2901 "rc = %d\n", sbi->ll_fsname, flags,
2902 ladvise_names[advice], rc);
2906 case LU_LADVISE_LOCKAHEAD:
2907 /* Currently only READ and WRITE modes can be requested */
2908 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2909 ladvise->lla_lockahead_mode == 0) {
2911 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2912 "rc = %d\n", sbi->ll_fsname,
2913 ladvise->lla_lockahead_mode,
2914 ladvise_names[advice], rc);
2917 case LU_LADVISE_WILLREAD:
2918 case LU_LADVISE_DONTNEED:
2920 /* Note fall through above - These checks apply to all advices
2921 * except LOCKNOEXPAND */
2922 if (flags & ~LF_DEFAULT_MASK) {
2924 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2925 "rc = %d\n", sbi->ll_fsname, flags,
2926 ladvise_names[advice], rc);
2929 if (ladvise->lla_start >= ladvise->lla_end) {
2931 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2932 "for %s: rc = %d\n", sbi->ll_fsname,
2933 ladvise->lla_start, ladvise->lla_end,
2934 ladvise_names[advice], rc);
2946 * Give file access advices
2948 * The ladvise interface is similar to Linux fadvise() system call, except it
2949 * forwards the advices directly from Lustre client to server. The server side
2950 * codes will apply appropriate read-ahead and caching techniques for the
2951 * corresponding files.
2953 * A typical workload for ladvise is e.g. a bunch of different clients are
2954 * doing small random reads of a file, so prefetching pages into OSS cache
2955 * with big linear reads before the random IO is a net benefit. Fetching
2956 * all that data into each client cache with fadvise() may not be, due to
2957 * much more data being sent to the client.
2959 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2960 struct llapi_lu_ladvise *ladvise)
2964 struct cl_ladvise_io *lio;
2969 env = cl_env_get(&refcheck);
2971 RETURN(PTR_ERR(env));
2973 io = vvp_env_thread_io(env);
2974 io->ci_obj = ll_i2info(inode)->lli_clob;
2976 /* initialize parameters for ladvise */
2977 lio = &io->u.ci_ladvise;
2978 lio->li_start = ladvise->lla_start;
2979 lio->li_end = ladvise->lla_end;
2980 lio->li_fid = ll_inode2fid(inode);
2981 lio->li_advice = ladvise->lla_advice;
2982 lio->li_flags = flags;
/* Run the CIT_LADVISE io through the cl_io state machine. */
2984 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2985 rc = cl_io_loop(env, io);
2989 cl_io_fini(env, io);
2990 cl_env_put(env, &refcheck);
/* LU_LADVISE_LOCKNOEXPAND: enable/disable lock expansion for this file
 * descriptor; LF_UNSET in @flags clears the no-expand flag. */
2994 static int ll_lock_noexpand(struct file *file, int flags)
2996 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2998 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* LL_IOC_FSGETXATTR: return the inode flags translated to fsx_xflags
 * (adding FS_XFLAG_PROJINHERIT from LLIF_PROJECT_INHERIT) and the
 * project quota id to user space. */
3003 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3006 struct fsxattr fsxattr;
3008 if (copy_from_user(&fsxattr,
3009 (const struct fsxattr __user *)arg,
3013 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3014 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3015 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3016 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3017 if (copy_to_user((struct fsxattr __user *)arg,
3018 &fsxattr, sizeof(fsxattr)))
/* Permission check for FSSETXATTR: only allow changing the project id
 * or the PROJINHERIT flag from inside the init user namespace. */
3024 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3027 * Project Quota ID state is only allowed to change from within the init
3028 * namespace. Enforce that restriction only if we are trying to change
3029 * the quota ID state. Everything else is allowed in user namespaces.
3031 if (current_user_ns() == &init_user_ns)
3034 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
/* Reject any toggle of PROJINHERIT relative to the current state. */
3037 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3038 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3041 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/* LL_IOC_FSSETXATTR: apply fsx_xflags and the project id from user
 * space — first on the MDT via md_setattr(), then mirror the flags to
 * the OSTs via cl_setattr_ost() when the file has a clio object. */
3048 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3052 struct md_op_data *op_data;
3053 struct ptlrpc_request *req = NULL;
3055 struct fsxattr fsxattr;
3056 struct cl_object *obj;
3060 if (copy_from_user(&fsxattr,
3061 (const struct fsxattr __user *)arg,
3065 rc = ll_ioctl_check_project(inode, &fsxattr);
3069 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3070 LUSTRE_OPC_ANY, NULL);
3071 if (IS_ERR(op_data))
3072 RETURN(PTR_ERR(op_data));
/* Convert xflags to inode flags, then to the on-wire ext flags. */
3074 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3075 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3076 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3077 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3078 op_data->op_projid = fsxattr.fsx_projid;
3079 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3080 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3082 ptlrpc_req_finished(req);
3084 GOTO(out_fsxattr, rc);
3085 ll_update_inode_flags(inode, op_data->op_attr_flags);
3086 obj = ll_i2info(inode)->lli_clob;
3088 GOTO(out_fsxattr, rc);
3090 OBD_ALLOC_PTR(attr);
3092 GOTO(out_fsxattr, rc = -ENOMEM);
3094 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3095 fsxattr.fsx_xflags);
3098 ll_finish_md_op_data(op_data);
/* Release the lease held on @file (LL_LEASE_UNLCK path of
 * ll_file_set_lease). Depending on ioc->lil_flags the close may carry
 * an intent: resync-done id list, layout merge with another fd, or
 * layout split to a mirror. Returns the lease type bits that were held
 * (or an error). */
3102 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3105 struct inode *inode = file_inode(file);
3106 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3107 struct ll_inode_info *lli = ll_i2info(inode);
3108 struct obd_client_handle *och = NULL;
3109 struct split_param sp;
3112 enum mds_op_bias bias = 0;
3113 struct file *layout_file = NULL;
3115 size_t data_size = 0;
/* Detach the lease handle from the fd under lli_och_mutex. */
3119 mutex_lock(&lli->lli_och_mutex);
3120 if (fd->fd_lease_och != NULL) {
3121 och = fd->fd_lease_och;
3122 fd->fd_lease_och = NULL;
3124 mutex_unlock(&lli->lli_och_mutex);
3127 GOTO(out, rc = -ENOLCK);
3129 fmode = och->och_flags;
3131 switch (ioc->lil_flags) {
3132 case LL_LEASE_RESYNC_DONE:
/* Copy in the trailing lil_ids[] array for resync-done close. */
3133 if (ioc->lil_count > IOC_IDS_MAX)
3134 GOTO(out, rc = -EINVAL);
3136 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3137 OBD_ALLOC(data, data_size);
3139 GOTO(out, rc = -ENOMEM);
3141 if (copy_from_user(data, (void __user *)arg, data_size))
3142 GOTO(out, rc = -EFAULT);
3144 bias = MDS_CLOSE_RESYNC_DONE;
3146 case LL_LEASE_LAYOUT_MERGE: {
3149 if (ioc->lil_count != 1)
3150 GOTO(out, rc = -EINVAL);
3152 arg += sizeof(*ioc);
/* NOTE(review): `fd` here is a local __u32 file descriptor whose
 * declaration is elided in this listing; it shadows the outer
 * ll_file_data pointer `fd` — confirm against the full source. */
3153 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3154 GOTO(out, rc = -EFAULT);
3156 layout_file = fget(fd);
3158 GOTO(out, rc = -EBADF);
3160 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3161 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3162 GOTO(out, rc = -EPERM);
3164 data = file_inode(layout_file);
3165 bias = MDS_CLOSE_LAYOUT_MERGE;
3168 case LL_LEASE_LAYOUT_SPLIT: {
/* Split expects two __u32 args after the ioc: victim fd and mirror id. */
3172 if (ioc->lil_count != 2)
3173 GOTO(out, rc = -EINVAL);
3175 arg += sizeof(*ioc);
3176 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3177 GOTO(out, rc = -EFAULT);
3179 arg += sizeof(__u32);
3180 if (copy_from_user(&mirror_id, (void __user *)arg,
3182 GOTO(out, rc = -EFAULT);
3184 layout_file = fget(fdv);
3186 GOTO(out, rc = -EBADF);
3188 sp.sp_inode = file_inode(layout_file);
3189 sp.sp_mirror_id = (__u16)mirror_id;
3191 bias = MDS_CLOSE_LAYOUT_SPLIT;
3195 /* without close intent */
3199 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3203 rc = ll_lease_och_release(inode, file);
/* Per-flag cleanup of the intent payload / layout file reference. */
3212 switch (ioc->lil_flags) {
3213 case LL_LEASE_RESYNC_DONE:
3215 OBD_FREE(data, data_size);
3217 case LL_LEASE_LAYOUT_MERGE:
3218 case LL_LEASE_LAYOUT_SPLIT:
3225 rc = ll_lease_type_from_fmode(fmode);
/* LL_IOC_SET_LEASE handler: acquire a read or write lease on @file
 * (or delegate to ll_file_unlock_lease() for LL_LEASE_UNLCK).
 * Optionally starts a layout resync (LL_LEASE_RESYNC) before stashing
 * the open handle in fd->fd_lease_och. */
3229 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3232 struct inode *inode = file_inode(file);
3233 struct ll_inode_info *lli = ll_i2info(inode);
3234 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3235 struct obd_client_handle *och = NULL;
3236 __u64 open_flags = 0;
/* The requested lease mode must be compatible with the file's open mode. */
3242 switch (ioc->lil_mode) {
3243 case LL_LEASE_WRLCK:
3244 if (!(file->f_mode & FMODE_WRITE))
3246 fmode = FMODE_WRITE;
3248 case LL_LEASE_RDLCK:
3249 if (!(file->f_mode & FMODE_READ))
3253 case LL_LEASE_UNLCK:
3254 RETURN(ll_file_unlock_lease(file, ioc, arg));
3259 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3261 /* apply for lease */
3262 if (ioc->lil_flags & LL_LEASE_RESYNC)
3263 open_flags = MDS_OPEN_RESYNC;
3264 och = ll_lease_open(inode, file, fmode, open_flags);
3266 RETURN(PTR_ERR(och));
3268 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3269 rc = ll_lease_file_resync(och, inode, arg);
/* Resync setup failed: give the lease back before returning. */
3271 ll_lease_close(och, inode, NULL);
3274 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3276 ll_lease_close(och, inode, NULL);
/* Publish the lease handle on the fd unless one is already set. */
3282 mutex_lock(&lli->lli_och_mutex);
3283 if (fd->fd_lease_och == NULL) {
3284 fd->fd_lease_och = och;
3287 mutex_unlock(&lli->lli_och_mutex);
3289 /* impossible now that only excl is supported for now */
3290 ll_lease_close(och, inode, &lease_broken);
/* Fill @heat with the inode's current heat flags and up to
 * heat->lh_count decayed heat values, sampled under lli_heat_lock. */
3296 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3298 struct ll_inode_info *lli = ll_i2info(inode);
3299 struct ll_sb_info *sbi = ll_i2sbi(inode);
3300 __u64 now = ktime_get_real_seconds();
3303 spin_lock(&lli->lli_heat_lock);
3304 heat->lh_flags = lli->lli_heat_flags;
3305 for (i = 0; i < heat->lh_count; i++)
3306 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3307 now, sbi->ll_heat_decay_weight,
3308 sbi->ll_heat_period_second);
3309 spin_unlock(&lli->lli_heat_lock);
/* Update per-inode heat control flags: LU_HEAT_FLAG_CLEAR resets all
 * heat instances; LU_HEAT_FLAG_OFF toggles heat accounting off/on. */
3312 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3314 struct ll_inode_info *lli = ll_i2info(inode);
3317 spin_lock(&lli->lli_heat_lock);
3318 if (flags & LU_HEAT_FLAG_CLEAR)
3319 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3321 if (flags & LU_HEAT_FLAG_OFF)
3322 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3324 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3326 spin_unlock(&lli->lli_heat_lock);
/* Main ioctl dispatcher for regular files: decodes the command and
 * either handles it locally (flags, leases, heat), forwards it to the
 * MDT/OST via obd_iocontrol(), or calls the dedicated helper
 * (setstripe, swap layouts, HSM, ladvise, fsxattr, ...).
 * NOTE(review): this listing is elided; some declarations and error
 * checks between the numbered lines are not visible here. */
3332 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3334 struct inode *inode = file_inode(file);
3335 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3339 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3340 PFID(ll_inode2fid(inode)), inode, cmd);
3341 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3343 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3344 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3348 case LL_IOC_GETFLAGS:
3349 /* Get the current value of the file flags */
3350 return put_user(fd->fd_flags, (int __user *)arg);
3351 case LL_IOC_SETFLAGS:
3352 case LL_IOC_CLRFLAGS:
3353 /* Set or clear specific file flags */
3354 /* XXX This probably needs checks to ensure the flags are
3355 * not abused, and to handle any flag side effects.
3357 if (get_user(flags, (int __user *) arg))
3360 if (cmd == LL_IOC_SETFLAGS) {
3361 if ((flags & LL_FILE_IGNORE_LOCK) &&
3362 !(file->f_flags & O_DIRECT)) {
3363 CERROR("%s: unable to disable locking on "
3364 "non-O_DIRECT file\n", current->comm);
3368 fd->fd_flags |= flags;
3370 fd->fd_flags &= ~flags;
3373 case LL_IOC_LOV_SETSTRIPE:
3374 case LL_IOC_LOV_SETSTRIPE_NEW:
3375 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3376 case LL_IOC_LOV_SETEA:
3377 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3378 case LL_IOC_LOV_SWAP_LAYOUTS: {
3380 struct lustre_swap_layouts lsl;
3382 if (copy_from_user(&lsl, (char __user *)arg,
3383 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable before their layouts may be swapped. */
3386 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3389 file2 = fget(lsl.sl_fd);
3393 /* O_WRONLY or O_RDWR */
3394 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3395 GOTO(out, rc = -EPERM);
3397 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3398 struct inode *inode2;
3399 struct ll_inode_info *lli;
3400 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE: the swap is done as part of a lease close,
 * so a lease handle must be attached to this fd. */
3402 lli = ll_i2info(inode);
3403 mutex_lock(&lli->lli_och_mutex);
3404 if (fd->fd_lease_och != NULL) {
3405 och = fd->fd_lease_och;
3406 fd->fd_lease_och = NULL;
3408 mutex_unlock(&lli->lli_och_mutex);
3410 GOTO(out, rc = -ENOLCK);
3411 inode2 = file_inode(file2);
3412 rc = ll_swap_layouts_close(och, inode, inode2);
3414 rc = ll_swap_layouts(file, file2, &lsl);
3420 case LL_IOC_LOV_GETSTRIPE:
3421 case LL_IOC_LOV_GETSTRIPE_NEW:
3422 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3423 case FS_IOC_GETFLAGS:
3424 case FS_IOC_SETFLAGS:
3425 RETURN(ll_iocontrol(inode, file, cmd, arg));
3426 case FSFILT_IOC_GETVERSION:
3427 case FS_IOC_GETVERSION:
3428 RETURN(put_user(inode->i_generation, (int __user *)arg));
3429 /* We need to special case any other ioctls we want to handle,
3430 * to send them to the MDS/OST as appropriate and to properly
3431 * network encode the arg field. */
3432 case FS_IOC_SETVERSION:
3435 case LL_IOC_GROUP_LOCK:
3436 RETURN(ll_get_grouplock(inode, file, arg));
3437 case LL_IOC_GROUP_UNLOCK:
3438 RETURN(ll_put_grouplock(inode, file, arg));
3439 case IOC_OBD_STATFS:
3440 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3442 case LL_IOC_FLUSHCTX:
3443 RETURN(ll_flush_ctx(inode));
3444 case LL_IOC_PATH2FID: {
3445 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3446 sizeof(struct lu_fid)))
3451 case LL_IOC_GETPARENT:
3452 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3454 case OBD_IOC_FID2PATH:
3455 RETURN(ll_fid2path(inode, (void __user *)arg));
3456 case LL_IOC_DATA_VERSION: {
3457 struct ioc_data_version idv;
3460 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the documented flush flags may be passed through. */
3463 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3464 rc = ll_ioc_data_version(inode, &idv);
3467 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3473 case LL_IOC_GET_MDTIDX: {
3476 mdtidx = ll_get_mdt_idx(inode);
3480 if (put_user((int)mdtidx, (int __user *)arg))
3485 case OBD_IOC_GETDTNAME:
3486 case OBD_IOC_GETMDNAME:
3487 RETURN(ll_get_obd_name(inode, cmd, arg));
3488 case LL_IOC_HSM_STATE_GET: {
3489 struct md_op_data *op_data;
3490 struct hsm_user_state *hus;
3497 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3498 LUSTRE_OPC_ANY, hus);
3499 if (IS_ERR(op_data)) {
3501 RETURN(PTR_ERR(op_data));
3504 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3507 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3510 ll_finish_md_op_data(op_data);
3514 case LL_IOC_HSM_STATE_SET: {
3515 struct hsm_state_set *hss;
3522 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3527 rc = ll_hsm_state_set(inode, hss);
3532 case LL_IOC_HSM_ACTION: {
3533 struct md_op_data *op_data;
3534 struct hsm_current_action *hca;
3541 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3542 LUSTRE_OPC_ANY, hca);
3543 if (IS_ERR(op_data)) {
3545 RETURN(PTR_ERR(op_data));
3548 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3551 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3554 ll_finish_md_op_data(op_data);
/* Legacy lease interface: mode is passed directly in arg. */
3558 case LL_IOC_SET_LEASE_OLD: {
3559 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3561 RETURN(ll_file_set_lease(file, &ioc, 0));
3563 case LL_IOC_SET_LEASE: {
3564 struct ll_ioc_lease ioc;
3566 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3569 RETURN(ll_file_set_lease(file, &ioc, arg));
3571 case LL_IOC_GET_LEASE: {
3572 struct ll_inode_info *lli = ll_i2info(inode);
3573 struct ldlm_lock *lock = NULL;
/* Report the held lease type, but only if the underlying DLM lock
 * has not been cancelled behind our back. */
3576 mutex_lock(&lli->lli_och_mutex);
3577 if (fd->fd_lease_och != NULL) {
3578 struct obd_client_handle *och = fd->fd_lease_och;
3580 lock = ldlm_handle2lock(&och->och_lease_handle);
3582 lock_res_and_lock(lock);
3583 if (!ldlm_is_cancel(lock))
3584 fmode = och->och_flags;
3586 unlock_res_and_lock(lock);
3587 LDLM_LOCK_PUT(lock);
3590 mutex_unlock(&lli->lli_och_mutex);
3592 RETURN(ll_lease_type_from_fmode(fmode));
3594 case LL_IOC_HSM_IMPORT: {
3595 struct hsm_user_import *hui;
3601 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3606 rc = ll_hsm_import(inode, file, hui);
3611 case LL_IOC_FUTIMES_3: {
3612 struct ll_futimes_3 lfu;
3614 if (copy_from_user(&lfu,
3615 (const struct ll_futimes_3 __user *)arg,
3619 RETURN(ll_file_futimes_3(file, &lfu));
3621 case LL_IOC_LADVISE: {
3622 struct llapi_ladvise_hdr *k_ladvise_hdr;
3623 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3626 int alloc_size = sizeof(*k_ladvise_hdr);
/* Two-pass copy: read the fixed header first to learn lah_count,
 * then reallocate and copy the header plus all advice entries. */
3629 u_ladvise_hdr = (void __user *)arg;
3630 OBD_ALLOC_PTR(k_ladvise_hdr);
3631 if (k_ladvise_hdr == NULL)
3634 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3635 GOTO(out_ladvise, rc = -EFAULT);
3637 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3638 k_ladvise_hdr->lah_count < 1)
3639 GOTO(out_ladvise, rc = -EINVAL);
3641 num_advise = k_ladvise_hdr->lah_count;
3642 if (num_advise >= LAH_COUNT_MAX)
3643 GOTO(out_ladvise, rc = -EFBIG);
3645 OBD_FREE_PTR(k_ladvise_hdr);
3646 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3647 lah_advise[num_advise]);
3648 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3649 if (k_ladvise_hdr == NULL)
3653 * TODO: submit multiple advices to one server in a single RPC
3655 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3656 GOTO(out_ladvise, rc = -EFAULT);
3658 for (i = 0; i < num_advise; i++) {
3659 struct llapi_lu_ladvise *k_ladvise =
3660 &k_ladvise_hdr->lah_advise[i];
3661 struct llapi_lu_ladvise __user *u_ladvise =
3662 &u_ladvise_hdr->lah_advise[i];
3664 rc = ll_ladvise_sanity(inode, k_ladvise);
3666 GOTO(out_ladvise, rc);
3668 switch (k_ladvise->lla_advice) {
3669 case LU_LADVISE_LOCKNOEXPAND:
3670 rc = ll_lock_noexpand(file,
3671 k_ladvise->lla_peradvice_flags);
3672 GOTO(out_ladvise, rc);
3673 case LU_LADVISE_LOCKAHEAD:
3675 rc = ll_file_lock_ahead(file, k_ladvise);
3678 GOTO(out_ladvise, rc);
/* Lockahead can report a per-advice result back to user space. */
3681 &u_ladvise->lla_lockahead_result))
3682 GOTO(out_ladvise, rc = -EFAULT);
3685 rc = ll_ladvise(inode, file,
3686 k_ladvise_hdr->lah_flags,
3689 GOTO(out_ladvise, rc);
3696 OBD_FREE(k_ladvise_hdr, alloc_size);
3699 case LL_IOC_FLR_SET_MIRROR: {
3700 /* mirror I/O must be direct to avoid polluting page cache
3702 if (!(file->f_flags & O_DIRECT))
3705 fd->fd_designated_mirror = (__u32)arg;
3708 case LL_IOC_FSGETXATTR:
3709 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3710 case LL_IOC_FSSETXATTR:
3711 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3713 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3714 case LL_IOC_HEAT_GET: {
3715 struct lu_heat uheat;
3716 struct lu_heat *heat;
3719 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
/* Clamp the requested count to what the client tracks. */
3722 if (uheat.lh_count > OBD_HEAT_COUNT)
3723 uheat.lh_count = OBD_HEAT_COUNT;
3725 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3726 OBD_ALLOC(heat, size);
3730 heat->lh_count = uheat.lh_count;
3731 ll_heat_get(inode, heat);
3732 rc = copy_to_user((char __user *)arg, heat, size);
3733 OBD_FREE(heat, size);
3734 RETURN(rc ? -EFAULT : 0);
3736 case LL_IOC_HEAT_SET: {
3739 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3742 rc = ll_heat_set(inode, flags);
/* Default: forward any unrecognized command to the data export. */
3746 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3747 (void __user *)arg));
3751 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat helper for kernels without generic_file_llseek_size():
 * validate @offset against sign/maxsize limits and commit it to
 * f_pos, resetting f_version on an actual move. */
3752 static inline loff_t
3753 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3755 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3757 if (offset > maxsize)
3760 if (offset != file->f_pos) {
3761 file->f_pos = offset;
3762 file->f_version = 0;
/* Compat implementation of generic_file_llseek_size() for older
 * kernels: handles SEEK_CUR/SEEK_DATA/SEEK_HOLE relative to @eof and
 * clamps the result to @maxsize via llseek_execute(). */
3768 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3769 loff_t maxsize, loff_t eof)
3771 struct inode *inode = file_inode(file);
3779 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3780 * position-querying operation. Avoid rewriting the "same"
3781 * f_pos value back to the file because a concurrent read(),
3782 * write() or lseek() might have altered it
3787 * f_lock protects against read/modify/write race with other
3788 * SEEK_CURs. Note that parallel writes and reads behave
3792 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3793 inode_unlock(inode);
3797 * In the generic case the entire file is data, so as long as
3798 * offset isn't at the end of the file then the offset is data.
3805 * There is a virtual hole at the end of the file, so as long as
3806 * offset isn't i_size or larger, return i_size.
3814 return llseek_execute(file, offset, maxsize);
/* llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse
 * the size from the OSTs so i_size is current, then delegate to the
 * generic llseek-size logic bounded by ll_file_maxbytes(). */
3818 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3820 struct inode *inode = file_inode(file);
3821 loff_t retval, eof = 0;
3824 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3825 (origin == SEEK_CUR) ? file->f_pos : 0);
3826 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3827 PFID(ll_inode2fid(inode)), inode, retval, retval,
3829 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3831 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3832 retval = ll_glimpse_size(inode);
3835 eof = i_size_read(inode);
3838 retval = ll_generic_file_llseek_size(file, offset, origin,
3839 ll_file_maxbytes(inode), eof);
/* flush() entry point (called on close(2)): report any asynchronous
 * writeback error recorded on the inode or the LOV sub-objects as
 * -EIO, unless the application has already been told about it
 * (fd->fd_write_failed). */
3843 static int ll_flush(struct file *file, fl_owner_t id)
3845 struct inode *inode = file_inode(file);
3846 struct ll_inode_info *lli = ll_i2info(inode);
3847 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3850 LASSERT(!S_ISDIR(inode->i_mode));
3852 /* catch async errors that were recorded back when async writeback
3853 * failed for pages in this mapping. */
3854 rc = lli->lli_async_rc;
3855 lli->lli_async_rc = 0;
3856 if (lli->lli_clob != NULL) {
3857 err = lov_read_and_clear_async_rc(lli->lli_clob);
3862 /* The application has been told write failure already.
3863 * Do not report failure again. */
3864 if (fd->fd_write_failed)
3866 return rc ? -EIO : 0;
3870 * Called to make sure a portion of file has been written out.
3871 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3873 * Return how many pages have been written.
3875 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3876 enum cl_fsync_mode mode, int ignore_layout)
3880 struct cl_fsync_io *fio;
/* Reject any mode outside the known CL_FSYNC_* set. */
3885 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3886 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3889 env = cl_env_get(&refcheck);
3891 RETURN(PTR_ERR(env));
3893 io = vvp_env_thread_io(env);
3894 io->ci_obj = ll_i2info(inode)->lli_clob;
3895 io->ci_ignore_layout = ignore_layout;
3897 /* initialize parameters for sync */
3898 fio = &io->u.ci_fsync;
3899 fio->fi_start = start;
3901 fio->fi_fid = ll_inode2fid(inode);
3902 fio->fi_mode = mode;
3903 fio->fi_nr_written = 0;
3905 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3906 result = cl_io_loop(env, io);
3908 result = io->ci_result;
/* On success report the page count accumulated by the fsync io. */
3910 result = fio->fi_nr_written;
3911 cl_io_fini(env, io);
3912 cl_env_put(env, &refcheck);
3918 * When dentry is provided (the 'else' case), file_dentry() may be
3919 * null and dentry must be used directly rather than pulled from
3920 * file_dentry() as is done otherwise.
3923 #ifdef HAVE_FILE_FSYNC_4ARGS
3924 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3926 struct dentry *dentry = file_dentry(file);
3927 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3928 int ll_fsync(struct file *file, int datasync)
3930 struct dentry *dentry = file_dentry(file);
3932 loff_t end = LLONG_MAX;
3934 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3937 loff_t end = LLONG_MAX;
/* fsync() entry point (signature varies by kernel, see #ifdefs above):
 * wait for dirty pages, surface recorded async writeback errors, then
 * sync metadata on the MDT (md_fsync) and data on the OSTs
 * (cl_sync_file_range with CL_FSYNC_ALL for regular files). */
3939 struct inode *inode = dentry->d_inode;
3940 struct ll_inode_info *lli = ll_i2info(inode);
3941 struct ptlrpc_request *req;
3945 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3946 PFID(ll_inode2fid(inode)), inode);
3947 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3949 #ifdef HAVE_FILE_FSYNC_4ARGS
3950 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3953 /* fsync's caller has already called _fdata{sync,write}, we want
3954 * that IO to finish before calling the osc and mdc sync methods */
3955 rc = filemap_fdatawait(inode->i_mapping);
3958 /* catch async errors that were recorded back when async writeback
3959 * failed for pages in this mapping. */
3960 if (!S_ISDIR(inode->i_mode)) {
3961 err = lli->lli_async_rc;
3962 lli->lli_async_rc = 0;
3965 if (lli->lli_clob != NULL) {
3966 err = lov_read_and_clear_async_rc(lli->lli_clob);
3972 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3976 ptlrpc_req_finished(req);
3978 if (S_ISREG(inode->i_mode)) {
3979 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3981 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3982 if (rc == 0 && err < 0)
/* Remember the data-sync outcome so ll_flush() does not report
 * the same failure twice. */
3985 fd->fd_write_failed = true;
3987 fd->fd_write_failed = false;
3990 #ifdef HAVE_FILE_FSYNC_4ARGS
3991 inode_unlock(inode);
/*
 * ll_file_flock(): ->flock()/->lock() handler.
 *
 * Translates BSD flock and POSIX fcntl lock requests into LDLM_FLOCK
 * enqueues on the MDT, then mirrors the result into the local VFS lock
 * state (locks_lock_file_wait() or its older split variants).
 */
3997 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3999 struct inode *inode = file_inode(file);
4000 struct ll_sb_info *sbi = ll_i2sbi(inode);
4001 struct ldlm_enqueue_info einfo = {
4002 .ei_type = LDLM_FLOCK,
4003 .ei_cb_cp = ldlm_flock_completion_ast,
4004 .ei_cbdata = file_lock,
4006 struct md_op_data *op_data;
4007 struct lustre_handle lockh = { 0 };
4008 union ldlm_policy_data flock = { { 0 } };
/* save caller's type: einfo.ei_mode is written into fl_type below */
4009 int fl_type = file_lock->fl_type;
4015 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4016 PFID(ll_inode2fid(inode)), file_lock);
4018 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4020 if (file_lock->fl_flags & FL_FLOCK) {
4021 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4022 /* flocks are whole-file locks */
4023 flock.l_flock.end = OFFSET_MAX;
4024 /* For flocks owner is determined by the local file descriptor */
4025 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4026 } else if (file_lock->fl_flags & FL_POSIX) {
4027 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4028 flock.l_flock.start = file_lock->fl_start;
4029 flock.l_flock.end = file_lock->fl_end;
4033 flock.l_flock.pid = file_lock->fl_pid;
4035 /* Somewhat ugly workaround for svc lockd.
4036 * lockd installs custom fl_lmops->lm_compare_owner that checks
4037 * for the fl_owner to be the same (which it always is on local node
4038 * I guess between lockd processes) and then compares pid.
4039 * As such we assign pid to the owner field to make it all work,
4040 * conflict with normal locks is unlikely since pid space and
4041 * pointer space for current->files are not intersecting */
4042 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4043 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map VFS lock type to an LDLM mode: read -> PR */
4047 einfo.ei_mode = LCK_PR;
4050 /* An unlock request may or may not have any relation to
4051 * existing locks so we may not be able to pass a lock handle
4052 * via a normal ldlm_lock_cancel() request. The request may even
4053 * unlock a byte range in the middle of an existing lock. In
4054 * order to process an unlock request we need all of the same
4055 * information that is given with a normal read or write record
4056 * lock request. To avoid creating another ldlm unlock (cancel)
4057 * message we'll treat a LCK_NL flock request as an unlock. */
4058 einfo.ei_mode = LCK_NL;
4061 einfo.ei_mode = LCK_PW;
4064 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set: fail rather than wait for a conflicting lock */
4079 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, do not take the lock */
4085 flags = LDLM_FL_TEST_LOCK;
4088 CERROR("unknown fcntl lock command: %d\n", cmd);
4092 /* Save the old mode so that if the mode in the lock changes we
4093 * can decrement the appropriate reader or writer refcount. */
4094 file_lock->fl_type = einfo.ei_mode;
4096 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4097 LUSTRE_OPC_ANY, NULL);
4098 if (IS_ERR(op_data))
4099 RETURN(PTR_ERR(op_data));
4101 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4102 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4103 flock.l_flock.pid, flags, einfo.ei_mode,
4104 flock.l_flock.start, flock.l_flock.end);
4106 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4109 /* Restore the file lock type if not TEST lock. */
4110 if (!(flags & LDLM_FL_TEST_LOCK))
4111 file_lock->fl_type = fl_type;
4113 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4114 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4115 !(flags & LDLM_FL_TEST_LOCK))
4116 rc2 = locks_lock_file_wait(file, file_lock);
4118 if ((file_lock->fl_flags & FL_FLOCK) &&
4119 (rc == 0 || file_lock->fl_type == F_UNLCK))
4120 rc2 = flock_lock_file_wait(file, file_lock);
4121 if ((file_lock->fl_flags & FL_POSIX) &&
4122 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4123 !(flags & LDLM_FL_TEST_LOCK))
4124 rc2 = posix_lock_file_wait(file, file_lock);
4125 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local VFS bookkeeping failed: drop the server lock again (LCK_NL) */
4127 if (rc2 && file_lock->fl_type != F_UNLCK) {
4128 einfo.ei_mode = LCK_NL;
4129 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4134 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name(): look up the FID of @name under directory @parent
 * via an MDS getattr-by-name RPC.
 *
 * \param[in]  parent   parent directory inode
 * \param[in]  name     entry name (need not be NUL-terminated; @namelen)
 * \param[in]  namelen  length of @name
 * \param[out] fid      if non-NULL, receives the child FID
 * \param[out] inode    if non-NULL, receives a new inode for the child
 *                      (built from the RPC reply by ll_prep_inode())
 */
4139 int ll_get_fid_by_name(struct inode *parent, const char *name,
4140 int namelen, struct lu_fid *fid,
4141 struct inode **inode)
4143 struct md_op_data *op_data = NULL;
4144 struct mdt_body *body;
4145 struct ptlrpc_request *req;
4149 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4150 LUSTRE_OPC_ANY, NULL);
4151 if (IS_ERR(op_data))
4152 RETURN(PTR_ERR(op_data));
/* only the FID and type are needed from the reply */
4154 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4155 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4156 ll_finish_md_op_data(op_data);
4160 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4162 GOTO(out_req, rc = -EFAULT);
4164 *fid = body->mbo_fid1;
4167 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4169 ptlrpc_req_finished(req);
/*
 * ll_migrate(): migrate directory entry @name under @parent to the MDT
 * described by @lum, implemented as an md_rename() with CLI_MIGRATE set.
 *
 * For regular files a write lease is taken first so the data version can
 * be carried in the close intent; the operation is retried if the lease
 * is cancelled (-EAGAIN).  Migrating the filesystem root is rejected.
 */
4173 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4176 struct dentry *dchild = NULL;
4177 struct inode *child_inode = NULL;
4178 struct md_op_data *op_data;
4179 struct ptlrpc_request *request = NULL;
4180 struct obd_client_handle *och = NULL;
4182 struct mdt_body *body;
4183 __u64 data_version = 0;
4184 size_t namelen = strlen(name);
4185 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4189 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4190 PFID(ll_inode2fid(parent)), name,
4191 lum->lum_stripe_offset, lum->lum_stripe_count);
/* normalize byte order of the user-supplied LMV descriptor if needed */
4193 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4194 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4195 lustre_swab_lmv_user_md(lum);
4197 /* Get child FID first */
4198 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4201 dchild = d_lookup(file_dentry(file), &qstr);
4203 if (dchild->d_inode)
4204 child_inode = igrab(dchild->d_inode);
/* not in dcache: resolve the child via an MDS lookup instead */
4209 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4218 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4219 OBD_CONNECT2_DIR_MIGRATE)) {
4220 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4221 ll_dir_striped(child_inode)) {
4222 CERROR("%s: MDT doesn't support stripe directory "
4223 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4224 GOTO(out_iput, rc = -EOPNOTSUPP);
4229 * lfs migrate command needs to be blocked on the client
4230 * by checking the migrate FID against the FID of the
4233 if (child_inode == parent->i_sb->s_root->d_inode)
4234 GOTO(out_iput, rc = -EINVAL);
4236 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4237 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4238 if (IS_ERR(op_data))
4239 GOTO(out_iput, rc = PTR_ERR(op_data));
4241 inode_lock(child_inode);
4242 op_data->op_fid3 = *ll_inode2fid(child_inode);
4243 if (!fid_is_sane(&op_data->op_fid3)) {
4244 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4245 ll_i2sbi(parent)->ll_fsname, name,
4246 PFID(&op_data->op_fid3));
4247 GOTO(out_unlock, rc = -EINVAL);
4250 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4251 op_data->op_data = lum;
4252 op_data->op_data_size = lumlen;
/* regular file: take a write lease and snapshot the data version */
4255 if (S_ISREG(child_inode->i_mode)) {
4256 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4260 GOTO(out_unlock, rc);
4263 rc = ll_data_version(child_inode, &data_version,
4266 GOTO(out_close, rc);
4268 op_data->op_open_handle = och->och_open_handle;
4269 op_data->op_data_version = data_version;
4270 op_data->op_lease_handle = och->och_lease_handle;
4271 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* the open request must not be replayed once migration starts */
4273 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4274 och->och_mod->mod_open_req->rq_replay = 0;
4275 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* migration is a rename of @name onto itself with CLI_MIGRATE set */
4278 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4279 name, namelen, &request);
4281 LASSERT(request != NULL);
4282 ll_update_times(request, parent);
4285 if (rc == 0 || rc == -EAGAIN) {
4286 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4287 LASSERT(body != NULL);
4289 /* If the server does release layout lock, then we cleanup
4290 * the client och here, otherwise release it in out_close: */
4291 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4292 obd_mod_put(och->och_mod);
4293 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4295 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4301 if (request != NULL) {
4302 ptlrpc_req_finished(request);
4306 /* Try again if the lease has cancelled. */
4307 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4312 ll_lease_close(och, child_inode, NULL);
4314 clear_nlink(child_inode);
4316 inode_unlock(child_inode);
4317 ll_finish_md_op_data(op_data);
4324 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * \param bits [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
 */
4341 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4343 struct lustre_handle lockh;
4344 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four standard modes */
4345 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4346 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4355 fid = &ll_i2info(inode)->lli_fid;
4356 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4357 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
4359 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4360 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4361 policy.l_inodebits.bits = *bits & (1 << i);
4362 if (policy.l_inodebits.bits == 0)
4365 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4366 &policy, mode, &lockh)) {
4367 struct ldlm_lock *lock;
4369 lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock covers, not just the probed one */
4372 ~(lock->l_policy_data.l_inodebits.bits);
4373 LDLM_LOCK_PUT(lock);
4375 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): try to match a cached MDS inodebits lock covering
 * @bits in one of the modes in @mode; on success the matched handle is
 * returned in @lockh (a reference is held, unlike ll_have_md_lock()).
 * Returns the matched mode, or 0 if no lock was found.
 */
4382 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4383 struct lustre_handle *lockh, __u64 flags,
4384 enum ldlm_mode mode)
4386 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4391 fid = &ll_i2info(inode)->lli_fid;
4392 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4394 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4395 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini(): post-process the result of an inode
 * revalidation RPC, translating expected errors and logging the rest.
 */
4400 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4402 /* Already unlinked. Just update nlink and return success */
4403 if (rc == -ENOENT) {
4405 /* If it is striped directory, and there is bad stripe
4406 * Let's revalidate the dentry again, instead of returning
4408 if (ll_dir_striped(inode))
4411 /* This path cannot be hit for regular files unless in
4412 * case of obscure races, so no need to validate
4414 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4416 } else if (rc != 0) {
/* EACCES/EIDRM are expected under permission/identity races: log softly */
4417 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4418 "%s: revalidate FID "DFID" error: rc = %d\n",
4419 ll_i2sbi(inode)->ll_fsname,
4420 PFID(ll_inode2fid(inode)), rc);
/*
 * ll_inode_revalidate(): refresh inode attributes from the MDS via an
 * intent lock (getattr-by-FID), and invalidate the dentry if the file
 * was unlinked server-side.
 */
4426 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4428 struct inode *inode = dentry->d_inode;
4429 struct obd_export *exp = ll_i2mdexp(inode);
4430 struct lookup_intent oit = {
4433 struct ptlrpc_request *req = NULL;
4434 struct md_op_data *op_data;
4438 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4439 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4441 /* Call getattr by fid, so do not provide name at all. */
4442 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4443 LUSTRE_OPC_ANY, NULL);
4444 if (IS_ERR(op_data))
4445 RETURN(PTR_ERR(op_data));
4447 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4448 ll_finish_md_op_data(op_data);
4450 rc = ll_inode_revalidate_fini(inode, rc);
4454 rc = ll_revalidate_it_finish(req, &oit, dentry);
4456 ll_intent_release(&oit);
4460 /* Unlinked? Unhash dentry, so it is not picked up later by
4461 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4462 * here to preserve get_cwd functionality on 2.6.
4464 if (!dentry->d_inode->i_nlink) {
4465 ll_lock_dcache(inode);
4466 d_lustre_invalidate(dentry, 0);
4467 ll_unlock_dcache(inode);
4470 ll_lookup_finish_locks(&oit, dentry);
4472 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr(): for a striped directory, aggregate attributes
 * (nlink, blocks, size, a/m/ctime) across all stripes via md_merge_attr()
 * and store the merged values into the inode / ll_inode_info.
 */
4477 static int ll_merge_md_attr(struct inode *inode)
4479 struct ll_inode_info *lli = ll_i2info(inode);
4480 struct cl_attr attr = { 0 };
4483 LASSERT(lli->lli_lsm_md != NULL);
/* plain (non-striped) directory: nothing to merge */
4485 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem guards the stripe layout while attributes are gathered */
4488 down_read(&lli->lli_lsm_sem);
4489 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4490 &attr, ll_md_blocking_ast);
4491 up_read(&lli->lli_lsm_sem);
4495 set_nlink(inode, attr.cat_nlink);
4496 inode->i_blocks = attr.cat_blocks;
4497 i_size_write(inode, attr.cat_size);
4499 ll_i2info(inode)->lli_atime = attr.cat_atime;
4500 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4501 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4506 static inline dev_t ll_compat_encode_dev(dev_t dev)
4508 /* The compat_sys_*stat*() syscalls will fail unless the
4509 * device majors and minors are both less than 256. Note that
4510 * the value returned here will be passed through
4511 * old_encode_dev() in cp_compat_stat(). And so we are not
4512 * trying to return a valid compat (u16) device number, just
4513 * one that will pass the old_valid_dev() check. */
4515 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ll_getattr(): ->getattr() handler.
 *
 * Revalidates the inode from the MDS, glimpses file size from the OSTs
 * for regular files (unless an HSM restore is in progress), merges
 * striped-directory attributes, then fills *stat from the inode.  Two
 * prototypes are compiled depending on kernel version.
 */
4518 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4519 int ll_getattr(const struct path *path, struct kstat *stat,
4520 u32 request_mask, unsigned int flags)
4522 struct dentry *de = path->dentry;
4524 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4527 struct inode *inode = de->d_inode;
4528 struct ll_sb_info *sbi = ll_i2sbi(inode);
4529 struct ll_inode_info *lli = ll_i2info(inode);
4532 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4534 rc = ll_inode_revalidate(de, IT_GETATTR);
4538 if (S_ISREG(inode->i_mode)) {
4539 /* In case of restore, the MDT has the right size and has
4540 * already send it back without granting the layout lock,
4541 * inode is up-to-date so glimpse is useless.
4542 * Also to glimpse we need the layout, in case of a running
4543 * restore the MDT holds the layout lock so the glimpse will
4544 * block up to the end of restore (getattr will block)
4546 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4547 rc = ll_glimpse_size(inode);
4552 /* If object isn't regular a file then don't validate size. */
4553 if (ll_dir_striped(inode)) {
4554 rc = ll_merge_md_attr(inode);
/* propagate cached llite times into the VFS inode */
4559 inode->i_atime.tv_sec = lli->lli_atime;
4560 inode->i_mtime.tv_sec = lli->lli_mtime;
4561 inode->i_ctime.tv_sec = lli->lli_ctime;
4564 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs a squashed inode number and device */
4566 if (ll_need_32bit_api(sbi)) {
4567 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4568 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4569 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4571 stat->ino = inode->i_ino;
4572 stat->dev = inode->i_sb->s_dev;
4573 stat->rdev = inode->i_rdev;
4576 stat->mode = inode->i_mode;
4577 stat->uid = inode->i_uid;
4578 stat->gid = inode->i_gid;
4579 stat->atime = inode->i_atime;
4580 stat->mtime = inode->i_mtime;
4581 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize; fall back to the inode's */
4582 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4584 stat->nlink = inode->i_nlink;
4585 stat->size = i_size_read(inode);
4586 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap(): ->fiemap() handler.
 *
 * Marshals the kernel's fiemap_extent_info into a struct fiemap buffer
 * (copying any user-provided first extent in), runs ll_do_fiemap(), and
 * copies the mapped extents back out to userspace.
 */
4591 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4592 __u64 start, __u64 len)
4596 struct fiemap *fiemap;
4597 unsigned int extent_count = fieinfo->fi_extents_max;
4599 num_bytes = sizeof(*fiemap) + (extent_count *
4600 sizeof(struct fiemap_extent));
4601 OBD_ALLOC_LARGE(fiemap, num_bytes);
4606 fiemap->fm_flags = fieinfo->fi_flags;
4607 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4608 fiemap->fm_start = start;
4609 fiemap->fm_length = len;
/* seed with the caller's first extent, if one was supplied */
4610 if (extent_count > 0 &&
4611 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4612 sizeof(struct fiemap_extent)) != 0)
4613 GOTO(out, rc = -EFAULT);
4615 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4617 fieinfo->fi_flags = fiemap->fm_flags;
4618 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4619 if (extent_count > 0 &&
4620 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4621 fiemap->fm_mapped_extents *
4622 sizeof(struct fiemap_extent)) != 0)
4623 GOTO(out, rc = -EFAULT);
4625 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): ->get_acl() handler; returns a referenced copy of the
 * cached POSIX ACL stored in ll_inode_info.
 */
4629 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4631 struct ll_inode_info *lli = ll_i2info(inode);
4632 struct posix_acl *acl = NULL;
4635 spin_lock(&lli->lli_lock);
4636 /* VFS' acl_permission_check->check_acl will release the refcount */
4637 acl = posix_acl_dup(lli->lli_posix_acl);
4638 spin_unlock(&lli->lli_lock);
4643 #ifdef HAVE_IOP_SET_ACL
4644 #ifdef CONFIG_FS_POSIX_ACL
/*
 * ll_set_acl(): ->set_acl() handler; serializes @acl to its xattr
 * representation and stores (or removes, when @acl is NULL) it on the
 * MDT via md_setxattr(), then updates the local ACL cache.
 */
4645 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4647 struct ll_sb_info *sbi = ll_i2sbi(inode);
4648 struct ptlrpc_request *req = NULL;
4649 const char *name = NULL;
4651 size_t value_size = 0;
4656 case ACL_TYPE_ACCESS:
4657 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* access ACL can fold into the file mode */
4659 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4662 case ACL_TYPE_DEFAULT:
4663 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs only make sense on directories */
4664 if (!S_ISDIR(inode->i_mode))
4665 rc = acl ? -EACCES : 0;
4676 value_size = posix_acl_xattr_size(acl->a_count);
4677 value = kmalloc(value_size, GFP_NOFS);
4679 GOTO(out, rc = -ENOMEM);
4681 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4683 GOTO(out_value, rc);
/* NULL value means remove the xattr rather than set it */
4686 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4687 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4688 name, value, value_size, 0, 0, &req);
4690 ptlrpc_req_finished(req);
4695 forget_cached_acl(inode, type);
4697 set_cached_acl(inode, type, acl);
4700 #endif /* CONFIG_FS_POSIX_ACL */
4701 #endif /* HAVE_IOP_SET_ACL */
4703 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ll_check_acl(): ACL callback for older generic_permission() variants;
 * checks @mask against the inode's cached access ACL.  Compiled only
 * when the kernel lacks the 2-argument generic_permission().
 */
4705 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4706 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4708 ll_check_acl(struct inode *inode, int mask)
4711 # ifdef CONFIG_FS_POSIX_ACL
4712 struct posix_acl *acl;
4716 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* ACL lookup may block; bail out under RCU walk */
4717 if (flags & IPERM_FLAG_RCU)
4720 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4725 rc = posix_acl_permission(inode, acl, mask);
4726 posix_acl_release(acl);
4729 # else /* !CONFIG_FS_POSIX_ACL */
4731 # endif /* CONFIG_FS_POSIX_ACL */
4733 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission(): ->permission() handler.
 *
 * Revalidates the root inode when needed, applies root-squash by
 * temporarily overriding the task credentials (fsuid/fsgid and FS
 * capabilities), then defers to generic permission checking.  Multiple
 * prototypes are compiled depending on kernel version.
 */
4735 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4736 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4738 # ifdef HAVE_INODE_PERMISION_2ARGS
4739 int ll_inode_permission(struct inode *inode, int mask)
4741 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4746 struct ll_sb_info *sbi;
4747 struct root_squash_info *squash;
4748 struct cred *cred = NULL;
4749 const struct cred *old_cred = NULL;
4751 bool squash_id = false;
/* revalidation below may block; refuse RCU-walk lookups */
4754 #ifdef MAY_NOT_BLOCK
4755 if (mask & MAY_NOT_BLOCK)
4757 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4758 if (flags & IPERM_FLAG_RCU)
4762 /* as root inode are NOT getting validated in lookup operation,
4763 * need to do it before permission check. */
4765 if (inode == inode->i_sb->s_root->d_inode) {
4766 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4771 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4772 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4774 /* squash fsuid/fsgid if needed */
4775 sbi = ll_i2sbi(inode);
4776 squash = &sbi->ll_squash;
4777 if (unlikely(squash->rsi_uid != 0 &&
4778 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4779 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4783 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4784 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4785 squash->rsi_uid, squash->rsi_gid);
4787 /* update current process's credentials
4788 * and FS capability */
4789 cred = prepare_creds();
4793 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4794 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities for the squashed identity */
4795 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4796 if ((1 << cap) & CFS_CAP_FS_MASK)
4797 cap_lower(cred->cap_effective, cap);
4799 old_cred = override_creds(cred);
4802 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4803 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4804 /* restore current process's credentials and FS capability */
4806 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks
 * (no .flock/.lock methods, so the VFS falls back to local locking) */
4814 struct file_operations ll_file_operations = {
4815 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4816 # ifdef HAVE_SYNC_READ_WRITE
4817 .read = new_sync_read,
4818 .write = new_sync_write,
4820 .read_iter = ll_file_read_iter,
4821 .write_iter = ll_file_write_iter,
4822 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4823 .read = ll_file_read,
4824 .aio_read = ll_file_aio_read,
4825 .write = ll_file_write,
4826 .aio_write = ll_file_aio_write,
4827 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4828 .unlocked_ioctl = ll_file_ioctl,
4829 .open = ll_file_open,
4830 .release = ll_file_release,
4831 .mmap = ll_file_mmap,
4832 .llseek = ll_file_seek,
4833 .splice_read = ll_file_splice_read,
/* default (-o flock): cluster-coherent flock/POSIX locks via the MDT
 * (ll_file_flock handles both .flock and .lock) */
4838 struct file_operations ll_file_operations_flock = {
4839 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4840 # ifdef HAVE_SYNC_READ_WRITE
4841 .read = new_sync_read,
4842 .write = new_sync_write,
4843 # endif /* HAVE_SYNC_READ_WRITE */
4844 .read_iter = ll_file_read_iter,
4845 .write_iter = ll_file_write_iter,
4846 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4847 .read = ll_file_read,
4848 .aio_read = ll_file_aio_read,
4849 .write = ll_file_write,
4850 .aio_write = ll_file_aio_write,
4851 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4852 .unlocked_ioctl = ll_file_ioctl,
4853 .open = ll_file_open,
4854 .release = ll_file_release,
4855 .mmap = ll_file_mmap,
4856 .llseek = ll_file_seek,
4857 .splice_read = ll_file_splice_read,
4860 .flock = ll_file_flock,
4861 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
4865 struct file_operations ll_file_operations_noflock = {
4866 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4867 # ifdef HAVE_SYNC_READ_WRITE
4868 .read = new_sync_read,
4869 .write = new_sync_write,
4870 # endif /* HAVE_SYNC_READ_WRITE */
4871 .read_iter = ll_file_read_iter,
4872 .write_iter = ll_file_write_iter,
4873 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4874 .read = ll_file_read,
4875 .aio_read = ll_file_aio_read,
4876 .write = ll_file_write,
4877 .aio_write = ll_file_aio_write,
4878 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4879 .unlocked_ioctl = ll_file_ioctl,
4880 .open = ll_file_open,
4881 .release = ll_file_release,
4882 .mmap = ll_file_mmap,
4883 .llseek = ll_file_seek,
4884 .splice_read = ll_file_splice_read,
4887 .flock = ll_file_noflock,
4888 .lock = ll_file_noflock
/* inode operations for regular Lustre files */
4891 struct inode_operations ll_file_inode_operations = {
4892 .setattr = ll_setattr,
4893 .getattr = ll_getattr,
4894 .permission = ll_inode_permission,
4895 #ifdef HAVE_IOP_XATTR
4896 .setxattr = ll_setxattr,
4897 .getxattr = ll_getxattr,
4898 .removexattr = ll_removexattr,
4900 .listxattr = ll_listxattr,
4901 .fiemap = ll_fiemap,
4902 #ifdef HAVE_IOP_GET_ACL
4903 .get_acl = ll_get_acl,
4905 #ifdef HAVE_IOP_SET_ACL
4906 .set_acl = ll_set_acl,
/*
 * ll_layout_conf(): apply a layout configuration to the inode's cl_object
 * via cl_conf_set().  For OBJECT_CONF_SET, the layout lock is allowed to
 * match only after the layout is applied, and the cached layout version
 * is refreshed from the object.
 */
4910 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4912 struct ll_inode_info *lli = ll_i2info(inode);
4913 struct cl_object *obj = lli->lli_clob;
4922 env = cl_env_get(&refcheck);
4924 RETURN(PTR_ERR(env));
4926 rc = cl_conf_set(env, lli->lli_clob, conf);
4930 if (conf->coc_opc == OBJECT_CONF_SET) {
4931 struct ldlm_lock *lock = conf->coc_lock;
4932 struct cl_layout cl = {
4936 LASSERT(lock != NULL);
4937 LASSERT(ldlm_has_layout(lock));
4939 /* it can only be allowed to match after layout is
4940 * applied to inode otherwise false layout would be
4941 * seen. Applying layout should happen before dropping
4942 * the intent lock. */
4943 ldlm_lock_allow_match(lock);
4945 rc = cl_object_layout_get(env, obj, &cl);
4950 DFID": layout version change: %u -> %u\n",
4951 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4953 ll_layout_version_set(lli, cl.cl_layout_gen);
4957 cl_env_put(env, &refcheck);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
4963 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4966 struct ll_sb_info *sbi = ll_i2sbi(inode);
4967 struct ptlrpc_request *req;
4974 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4975 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4976 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated: nothing to fetch */
4978 if (lock->l_lvb_data != NULL)
4981 /* if layout lock was granted right away, the layout is returned
4982 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4983 * blocked and then granted via completion ast, we have to fetch
4984 * layout here. Please note that we can't use the LVB buffer in
4985 * completion AST because it doesn't have a large enough buffer */
4986 rc = ll_get_default_mdsize(sbi, &lmmsize);
4990 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4991 XATTR_NAME_LOV, lmmsize, &req);
4994 GOTO(out, rc = 0); /* empty layout */
5001 if (lmmsize == 0) /* empty layout */
5004 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5006 GOTO(out, rc = -EFAULT);
5008 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5009 if (lvbdata == NULL)
5010 GOTO(out, rc = -ENOMEM);
5012 memcpy(lvbdata, lmm, lmmsize);
/* install the layout into the lock's LVB unless someone beat us to it */
5013 lock_res_and_lock(lock);
5014 if (unlikely(lock->l_lvb_data == NULL)) {
5015 lock->l_lvb_type = LVB_T_LAYOUT;
5016 lock->l_lvb_data = lvbdata;
5017 lock->l_lvb_len = lmmsize;
5020 unlock_res_and_lock(lock);
/* lost the race: free our copy */
5023 OBD_FREE_LARGE(lvbdata, lmmsize);
5028 ptlrpc_req_finished(req);
/*
 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns; if the layout changed while IO was in
 * flight, wait (OBJECT_CONF_WAIT) for that IO to drain.
 */
5036 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5037 struct inode *inode)
5039 struct ll_inode_info *lli = ll_i2info(inode);
5040 struct ll_sb_info *sbi = ll_i2sbi(inode);
5041 struct ldlm_lock *lock;
5042 struct cl_object_conf conf;
5045 bool wait_layout = false;
5048 LASSERT(lustre_handle_is_used(lockh));
5050 lock = ldlm_handle2lock(lockh);
5051 LASSERT(lock != NULL);
5052 LASSERT(ldlm_has_layout(lock));
5054 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5055 PFID(&lli->lli_fid), inode);
5057 /* in case this is a caching lock and reinstate with new inode */
5058 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5060 lock_res_and_lock(lock);
5061 lvb_ready = ldlm_is_lvb_ready(lock);
5062 unlock_res_and_lock(lock);
5064 /* checking lvb_ready is racy but this is okay. The worst case is
5065 * that multi processes may configure the file on the same time. */
5069 rc = ll_layout_fetch(inode, lock);
5073 /* for layout lock, lmm is stored in lock's lvb.
5074 * lvb_data is immutable if the lock is held so it's safe to access it
5077 * set layout to file. Unlikely this will fail as old layout was
5078 * surely eliminated */
5079 memset(&conf, 0, sizeof conf);
5080 conf.coc_opc = OBJECT_CONF_SET;
5081 conf.coc_inode = inode;
5082 conf.coc_lock = lock;
5083 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5084 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5085 rc = ll_layout_conf(inode, &conf);
5087 /* refresh layout failed, need to wait */
5088 wait_layout = rc == -EBUSY;
5091 LDLM_LOCK_PUT(lock);
5092 ldlm_lock_decref(lockh, mode);
5094 /* wait for IO to complete if it's still being used. */
5096 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5097 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5099 memset(&conf, 0, sizeof conf);
5100 conf.coc_opc = OBJECT_CONF_WAIT;
5101 conf.coc_inode = inode;
5102 rc = ll_layout_conf(inode, &conf);
5106 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5107 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
/**
 * Issue layout intent RPC to MDS.
 * \param inode [in] file inode
 * \param intent [in] layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
5120 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5122 struct ll_inode_info *lli = ll_i2info(inode);
5123 struct ll_sb_info *sbi = ll_i2sbi(inode);
5124 struct md_op_data *op_data;
5125 struct lookup_intent it;
5126 struct ptlrpc_request *req;
5130 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5131 0, 0, LUSTRE_OPC_ANY, NULL);
5132 if (IS_ERR(op_data))
5133 RETURN(PTR_ERR(op_data));
5135 op_data->op_data = intent;
5136 op_data->op_data_size = sizeof(*intent);
5138 memset(&it, 0, sizeof(it));
5139 it.it_op = IT_LAYOUT;
/* write/truncate intents need a write-mode layout lock */
5140 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5141 intent->li_opc == LAYOUT_INTENT_TRUNC)
5142 it.it_flags = FMODE_WRITE;
5144 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5145 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5147 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5148 &ll_md_blocking_ast, 0);
5149 if (it.it_request != NULL)
5150 ptlrpc_req_finished(it.it_request);
5151 it.it_request = NULL;
5153 ll_finish_md_op_data(op_data);
5155 /* set lock data in case this is a new lock */
5157 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5159 ll_intent_drop_lock(&it);
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 */
5177 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5179 struct ll_inode_info *lli = ll_i2info(inode);
5180 struct ll_sb_info *sbi = ll_i2sbi(inode);
5181 struct lustre_handle lockh;
5182 struct layout_intent intent = {
5183 .li_opc = LAYOUT_INTENT_ACCESS,
5185 enum ldlm_mode mode;
5189 *gen = ll_layout_version_get(lli);
/* no layout-lock support, or version already known: nothing to do */
5190 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5194 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5195 LASSERT(S_ISREG(inode->i_mode));
5197 /* take layout lock mutex to enqueue layout lock exclusively. */
5198 mutex_lock(&lli->lli_layout_mutex);
5201 /* mostly layout lock is caching on the local side, so try to
5202 * match it before grabbing layout lock mutex. */
5203 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5204 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5205 if (mode != 0) { /* hit cached lock */
5206 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue a new one via a layout intent RPC */
5212 rc = ll_layout_intent(inode, &intent);
5218 *gen = ll_layout_version_get(lli);
5219 mutex_unlock(&lli->lli_layout_mutex);
/**
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 *
 * \param[in] inode file inode.
 * \param[in] ext write range with start offset of file in bytes where
 * an IO is about to write, and exclusive end offset in
 * bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
5235 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5236 struct lu_extent *ext)
5238 struct layout_intent intent = {
5240 .li_extent.e_start = ext->e_start,
5241 .li_extent.e_end = ext->e_end,
5246 rc = ll_layout_intent(inode, &intent);
5252 * This function send a restore request to the MDT
5254 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5256 struct hsm_user_request *hur;
5260 len = sizeof(struct hsm_user_request) +
5261 sizeof(struct hsm_user_item);
5262 OBD_ALLOC(hur, len);
5266 hur->hur_request.hr_action = HUA_RESTORE;
5267 hur->hur_request.hr_archive_id = 0;
5268 hur->hur_request.hr_flags = 0;
5269 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5270 sizeof(hur->hur_user_item[0].hui_fid));
5271 hur->hur_user_item[0].hui_extent.offset = offset;
5272 hur->hur_user_item[0].hui_extent.length = length;
5273 hur->hur_request.hr_itemcount = 1;
5274 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,