4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
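/* Allocate per-open file data from ll_file_data_slab; returns NULL if the
 * slab allocation fails. */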
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
108 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
109 op_data->op_handle = och->och_fh;
111 if (och->och_flags & FMODE_WRITE &&
112 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
113 /* For HSM: if inode data has been modified, pack it so that
114 * MDT can set data dirty flag in the archive. */
115 op_data->op_bias |= MDS_DATA_MODIFIED;
121 * Perform a close, possibly with a bias.
122 * The meaning of "data" depends on the value of "bias".
124 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
125 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
128 static int ll_close_inode_openhandle(struct inode *inode,
129 struct obd_client_handle *och,
130 enum mds_op_bias bias, void *data)
132 struct obd_export *md_exp = ll_i2mdexp(inode);
133 const struct ll_inode_info *lli = ll_i2info(inode);
134 struct md_op_data *op_data;
135 struct ptlrpc_request *req = NULL;
139 if (class_exp2obd(md_exp) == NULL) {
140 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
141 ll_get_fsname(inode->i_sb, NULL, 0),
142 PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
147	/* We leak openhandle and request here on error, but there is not much to
148	 * be done in the OOM case since the app won't retry the close on error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
158 case MDS_CLOSE_LAYOUT_SPLIT:
159 case MDS_CLOSE_LAYOUT_SWAP: {
160 struct split_param *sp = data;
162 LASSERT(data != NULL);
163 op_data->op_bias |= bias;
164 op_data->op_data_version = 0;
165 op_data->op_lease_handle = och->och_lease_handle;
166 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
167 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
168 op_data->op_mirror_id = sp->sp_mirror_id;
170 op_data->op_fid2 = *ll_inode2fid(data);
175 case MDS_CLOSE_RESYNC_DONE: {
176 struct ll_ioc_lease *ioc = data;
178 LASSERT(data != NULL);
179 op_data->op_attr_blocks +=
180 ioc->lil_count * op_data->op_attr_blocks;
181 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
182 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
184 op_data->op_lease_handle = och->och_lease_handle;
185 op_data->op_data = &ioc->lil_ids[0];
186 op_data->op_data_size =
187 ioc->lil_count * sizeof(ioc->lil_ids[0]);
191 case MDS_HSM_RELEASE:
192 LASSERT(data != NULL);
193 op_data->op_bias |= MDS_HSM_RELEASE;
194 op_data->op_data_version = *(__u64 *)data;
195 op_data->op_lease_handle = och->och_lease_handle;
196 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
200 LASSERT(data == NULL);
204 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
205 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
206 if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
207 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
209 rc = md_close(md_exp, op_data, och->och_mod, &req);
210 if (rc != 0 && rc != -EINTR)
211 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
212 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
214 if (rc == 0 && op_data->op_bias & bias) {
215 struct mdt_body *body;
217 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
218 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
222 ll_finish_md_op_data(op_data);
226 md_clear_open_replay_data(md_exp, och);
227 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
230 ptlrpc_req_finished(req); /* This is close request */
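/* Drop the MDS open handle that matches @fmode (read, write or exec) once
 * the last local user of that handle is gone. */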
234 int ll_md_real_close(struct inode *inode, fmode_t fmode)
236 struct ll_inode_info *lli = ll_i2info(inode);
237 struct obd_client_handle **och_p;
238 struct obd_client_handle *och;
243 if (fmode & FMODE_WRITE) {
244 och_p = &lli->lli_mds_write_och;
245 och_usecount = &lli->lli_open_fd_write_count;
246 } else if (fmode & FMODE_EXEC) {
247 och_p = &lli->lli_mds_exec_och;
248 och_usecount = &lli->lli_open_fd_exec_count;
250 LASSERT(fmode & FMODE_READ);
251 och_p = &lli->lli_mds_read_och;
252 och_usecount = &lli->lli_open_fd_read_count;
255 mutex_lock(&lli->lli_och_mutex);
256 if (*och_usecount > 0) {
257 /* There are still users of this handle, so skip
259 mutex_unlock(&lli->lli_och_mutex);
265 mutex_unlock(&lli->lli_och_mutex);
268 /* There might be a race and this handle may already
270 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
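/* Per-descriptor close: release any group lock or lease still attached to
 * this file descriptor, update the open-by-mode counts, and call
 * ll_md_real_close() unless a cached OPEN lock lets us skip the MDS RPC. */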
276 static int ll_md_close(struct inode *inode, struct file *file)
278 union ldlm_policy_data policy = {
279 .l_inodebits = { MDS_INODELOCK_OPEN },
281 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
282 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
283 struct ll_inode_info *lli = ll_i2info(inode);
284 struct lustre_handle lockh;
285 enum ldlm_mode lockmode;
289 /* clear group lock, if present */
290 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
291 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
293 if (fd->fd_lease_och != NULL) {
296		/* Usually the lease is not released when the
297		 * application crashes, so we need to release it here. */
298 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
299 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
300 PFID(&lli->lli_fid), rc, lease_broken);
302 fd->fd_lease_och = NULL;
305 if (fd->fd_och != NULL) {
306 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
311	/* Let's see if we have a good enough OPEN lock on the file and if
312	 * we can skip talking to the MDS */
313 mutex_lock(&lli->lli_och_mutex);
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
329 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode, &lockh))
331 rc = ll_md_real_close(inode, fd->fd_omode);
334 LUSTRE_FPRIVATE(file) = NULL;
335 ll_file_data_put(fd);
340 /* While this returns an error code, the caller (fput()) does not check it,
341  * so we need to make every effort to clean up all of our state here. Also,
342  * applications rarely check close errors, and even if an error is returned
343  * they will not retry the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 if (inode->i_sb->s_root != file_dentry(file))
357 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
358 fd = LUSTRE_FPRIVATE(file);
361	/* The last ref on @file, but maybe not the owner pid of the statahead,
362	 * because parent and child processes can share the same file handle. */
363 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
364 ll_deauthorize_statahead(inode, fd);
366 if (inode->i_sb->s_root == file_dentry(file)) {
367 LUSTRE_FPRIVATE(file) = NULL;
368 ll_file_data_put(fd);
372 if (!S_ISDIR(inode->i_mode)) {
373 if (lli->lli_clob != NULL)
374 lov_read_and_clear_async_rc(lli->lli_clob);
375 lli->lli_async_rc = 0;
378 rc = ll_md_close(inode, file);
380 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
381 libcfs_debug_dumplog();
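/* read_cache_page() callback: copy the inline Data-on-MDT data from the
 * niobuf into the page, zero-fill the rest of the page and mark it uptodate. */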
386 static inline int ll_dom_readpage(void *data, struct page *page)
388 struct niobuf_local *lnb = data;
391 kaddr = ll_kmap_atomic(page, KM_USER0);
392 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
393 if (lnb->lnb_len < PAGE_SIZE)
394 memset(kaddr + lnb->lnb_len, 0,
395 PAGE_SIZE - lnb->lnb_len);
396 flush_dcache_page(page);
397 SetPageUptodate(page);
398 ll_kunmap_atomic(kaddr, KM_USER0);
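/* If the open reply carried inline file data under a Data-on-MDT lock, copy
 * that data into the page cache so the first reads can be served locally. */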
404 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
405 struct lookup_intent *it)
407 struct ll_inode_info *lli = ll_i2info(inode);
408 struct cl_object *obj = lli->lli_clob;
409 struct address_space *mapping = inode->i_mapping;
411 struct niobuf_remote *rnb;
416 struct lustre_handle lockh;
417 struct ldlm_lock *lock;
418 unsigned long index, start;
419 struct niobuf_local lnb;
421 bool dom_lock = false;
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
439 env = cl_env_get(&refcheck);
443 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
445 GOTO(out_env, rc = -ENODATA);
447 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
448 data = (char *)rnb + sizeof(*rnb);
450 if (rnb == NULL || rnb->rnb_len == 0)
451 GOTO(out_env, rc = 0);
453 CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
454 rnb->rnb_len, i_size_read(inode));
456 io = vvp_env_thread_io(env);
458 io->ci_ignore_layout = 1;
459 rc = cl_io_init(env, io, CIT_MISC, obj);
463 lnb.lnb_file_offset = rnb->rnb_offset;
464 start = lnb.lnb_file_offset / PAGE_SIZE;
466 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
467 lnb.lnb_page_offset = 0;
471 lnb.lnb_data = data + (index << PAGE_SHIFT);
472 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
473 if (lnb.lnb_len > PAGE_SIZE)
474 lnb.lnb_len = PAGE_SIZE;
476 vmpage = read_cache_page(mapping, index + start,
477 ll_dom_readpage, &lnb);
478 if (IS_ERR(vmpage)) {
479 CWARN("%s: cannot fill page %lu for "DFID
480 " with data: rc = %li\n",
481 ll_get_fsname(inode->i_sb, NULL, 0),
482 index + start, PFID(lu_object_fid(&obj->co_lu)),
487 if (vmpage->mapping == NULL) {
490 /* page was truncated */
491 GOTO(out_io, rc = -ENODATA);
493 clp = cl_page_find(env, obj, vmpage->index, vmpage,
498 GOTO(out_io, rc = PTR_ERR(clp));
502 cl_page_export(env, clp, 1);
503 cl_page_put(env, clp);
507 } while (rnb->rnb_len > (index << PAGE_SHIFT));
513 cl_env_put(env, &refcheck);
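/* Send the IT_OPEN intent to the MDS for @de and set up the inode, the lock
 * data and any inline DoM data from the reply. */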
516 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
517 struct lookup_intent *itp)
519 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
520 struct dentry *parent = de->d_parent;
521 const char *name = NULL;
523 struct md_op_data *op_data;
524 struct ptlrpc_request *req = NULL;
528 LASSERT(parent != NULL);
529 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
531	/* If the server supports open-by-fid, or the file name is invalid, don't
532	 * pack the name in the open request */
533 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
534 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
535 name = de->d_name.name;
536 len = de->d_name.len;
539 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
540 name, len, 0, LUSTRE_OPC_ANY, NULL);
542 RETURN(PTR_ERR(op_data));
543 op_data->op_data = lmm;
544 op_data->op_data_size = lmmsize;
546 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
547 &ll_md_blocking_ast, 0);
548 ll_finish_md_op_data(op_data);
550	/* The reason for keeping our own exit path is to avoid flooding the log
551	 * with -ESTALE error messages.
553 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
554 it_open_error(DISP_OPEN_OPEN, itp))
556 ll_release_openhandle(de, itp);
560 if (it_disposition(itp, DISP_LOOKUP_NEG))
561 GOTO(out, rc = -ENOENT);
563 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
564 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
565 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
569 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
571 if (!rc && itp->it_lock_mode) {
572 ll_dom_finish_open(de->d_inode, req, itp);
573 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
577 ptlrpc_req_finished(req);
578 ll_intent_drop_lock(itp);
580 /* We did open by fid, but by the time we got to the server,
581 * the object disappeared. If this is a create, we cannot really
582 * tell the userspace that the file it was trying to create
583 * does not exist. Instead let's return -ESTALE, and the VFS will
584 * retry the create with LOOKUP_REVAL that we are going to catch
585 * in ll_revalidate_dentry() and use lookup then.
587 if (rc == -ENOENT && itp->it_op & IT_CREAT)
593 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
594 struct obd_client_handle *och)
596 struct mdt_body *body;
598 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
599 och->och_fh = body->mbo_handle;
600 och->och_fid = body->mbo_fid1;
601 och->och_lease_handle.cookie = it->it_lock_handle;
602 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
603 och->och_flags = it->it_flags;
605 return md_set_open_replay_data(md_exp, och, it);
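/* Finish the client-side part of an open: fill the open handle from the
 * intent reply (when one is provided) and attach @fd to the struct file. */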
608 static int ll_local_open(struct file *file, struct lookup_intent *it,
609 struct ll_file_data *fd, struct obd_client_handle *och)
611 struct inode *inode = file_inode(file);
614 LASSERT(!LUSTRE_FPRIVATE(file));
621 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
626 LUSTRE_FPRIVATE(file) = fd;
627 ll_readahead_init(inode, &fd->fd_ras);
628 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
630	/* initialize the ll_cl_context */
631 rwlock_init(&fd->fd_lock);
632 INIT_LIST_HEAD(&fd->fd_lccs);
637 /* Open a file, and (for the very first open) create objects on the OSTs at
638 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
639 * creation or open until ll_lov_setstripe() ioctl is called.
641 * If we already have the stripe MD locally then we don't request it in
642 * md_open() by passing lmm_size = 0.
644 * It is up to the application to ensure no other processes open this file
645 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
646 * used. We might be able to avoid races of that sort by getting lli_open_sem
647 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
648 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
650 int ll_file_open(struct inode *inode, struct file *file)
652 struct ll_inode_info *lli = ll_i2info(inode);
653 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
654 .it_flags = file->f_flags };
655 struct obd_client_handle **och_p = NULL;
656 __u64 *och_usecount = NULL;
657 struct ll_file_data *fd;
661 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
662 PFID(ll_inode2fid(inode)), inode, file->f_flags);
664 it = file->private_data; /* XXX: compat macro */
665 file->private_data = NULL; /* prevent ll_local_open assertion */
667 fd = ll_file_data_get();
669 GOTO(out_nofiledata, rc = -ENOMEM);
672 if (S_ISDIR(inode->i_mode))
673 ll_authorize_statahead(inode, fd);
675 if (inode->i_sb->s_root == file_dentry(file)) {
676 LUSTRE_FPRIVATE(file) = fd;
680 if (!it || !it->it_disposition) {
681 /* Convert f_flags into access mode. We cannot use file->f_mode,
682 * because everything but O_ACCMODE mask was stripped from
684 if ((oit.it_flags + 1) & O_ACCMODE)
686 if (file->f_flags & O_TRUNC)
687 oit.it_flags |= FMODE_WRITE;
689		/* The kernel only calls f_op->open in dentry_open. filp_open calls
690		 * dentry_open after open_namei has checked permissions. Only
691		 * nfsd_open calls dentry_open directly without checking permissions,
692		 * and because of that the code below is safe. */
693 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
694 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
696 /* We do not want O_EXCL here, presumably we opened the file
697 * already? XXX - NFS implications? */
698 oit.it_flags &= ~O_EXCL;
700		/* bug20584: if "it_flags" contains O_CREAT, the file will be
701		 * created if necessary, so "IT_CREAT" should be set to stay
702		 * consistent with it */
703 if (oit.it_flags & O_CREAT)
704 oit.it_op |= IT_CREAT;
710 /* Let's see if we have file open on MDS already. */
711 if (it->it_flags & FMODE_WRITE) {
712 och_p = &lli->lli_mds_write_och;
713 och_usecount = &lli->lli_open_fd_write_count;
714 } else if (it->it_flags & FMODE_EXEC) {
715 och_p = &lli->lli_mds_exec_och;
716 och_usecount = &lli->lli_open_fd_exec_count;
718 och_p = &lli->lli_mds_read_och;
719 och_usecount = &lli->lli_open_fd_read_count;
722 mutex_lock(&lli->lli_och_mutex);
723 if (*och_p) { /* Open handle is present */
724 if (it_disposition(it, DISP_OPEN_OPEN)) {
725			/* Well, there's an extra open request that we do not need;
726			 * let's close it somehow. This will decref the request. */
727 rc = it_open_error(DISP_OPEN_OPEN, it);
729 mutex_unlock(&lli->lli_och_mutex);
730 GOTO(out_openerr, rc);
733 ll_release_openhandle(file_dentry(file), it);
737 rc = ll_local_open(file, it, fd, NULL);
740 mutex_unlock(&lli->lli_och_mutex);
741 GOTO(out_openerr, rc);
744 LASSERT(*och_usecount == 0);
745 if (!it->it_disposition) {
746 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
747		/* We cannot just request a lock handle now; the new ELC code
748		 * means that one of the other OPEN locks for this file
749		 * could be cancelled, and since the blocking AST handler
750		 * would attempt to grab och_mutex as well, that would
751		 * result in a deadlock */
752 mutex_unlock(&lli->lli_och_mutex);
754 * Normally called under two situations:
756 * 2. A race/condition on MDS resulting in no open
757 * handle to be returned from LOOKUP|OPEN request,
758 * for example if the target entry was a symlink.
760 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
761 * marked by a bit set in ll_iget_for_nfs. Clear the
762 * bit so that it's not confusing later callers.
764		 * NB: when ldd is NULL, it must have come via the normal
765 * lookup path only, since ll_iget_for_nfs always calls
768 if (ldd && ldd->lld_nfs_dentry) {
769 ldd->lld_nfs_dentry = 0;
770 it->it_flags |= MDS_OPEN_LOCK;
774		 * Always specify MDS_OPEN_BY_FID because we don't want
775		 * to get a file with a different fid.
777 it->it_flags |= MDS_OPEN_BY_FID;
778 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
781 GOTO(out_openerr, rc);
785 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
787 GOTO(out_och_free, rc = -ENOMEM);
791 /* md_intent_lock() didn't get a request ref if there was an
792 * open error, so don't do cleanup on the request here
794	/* XXX (green): Shouldn't we bail out on any error here, not
795	 * just an open error? */
796 rc = it_open_error(DISP_OPEN_OPEN, it);
798 GOTO(out_och_free, rc);
800 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
801 "inode %p: disposition %x, status %d\n", inode,
802 it_disposition(it, ~0), it->it_status);
804 rc = ll_local_open(file, it, fd, *och_p);
806 GOTO(out_och_free, rc);
808 mutex_unlock(&lli->lli_och_mutex);
811	/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
812	 * a different kind of OPEN lock for this same inode gets cancelled
813	 * by ldlm_cancel_lru */
814 if (!S_ISREG(inode->i_mode))
815 GOTO(out_och_free, rc);
817 cl_lov_delay_create_clear(&file->f_flags);
818 GOTO(out_och_free, rc);
822 if (och_p && *och_p) {
823 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
824 *och_p = NULL; /* OBD_FREE writes some magic there */
827 mutex_unlock(&lli->lli_och_mutex);
830 if (lli->lli_opendir_key == fd)
831 ll_deauthorize_statahead(inode, fd);
833 ll_file_data_put(fd);
835 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
839 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
840 ptlrpc_req_finished(it->it_request);
841 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
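/* Blocking AST for lease locks: cancel the lease lock as soon as it is
 * blocked, which is how a lease gets broken. */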
847 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
848 struct ldlm_lock_desc *desc, void *data, int flag)
851 struct lustre_handle lockh;
855 case LDLM_CB_BLOCKING:
856 ldlm_lock2handle(lock, &lockh);
857 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
859 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
863 case LDLM_CB_CANCELING:
871 * When setting a lease on a file, we take ownership of the lli_mds_*_och
872 * and save it as fd->fd_och so as to force the client to reopen the file
873 * even if it has an open lock in cache already.
875 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
876 struct lustre_handle *old_handle)
878 struct ll_inode_info *lli = ll_i2info(inode);
879 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
880 struct obd_client_handle **och_p;
885 /* Get the openhandle of the file */
886 mutex_lock(&lli->lli_och_mutex);
887 if (fd->fd_lease_och != NULL)
888 GOTO(out_unlock, rc = -EBUSY);
890 if (fd->fd_och == NULL) {
891 if (file->f_mode & FMODE_WRITE) {
892 LASSERT(lli->lli_mds_write_och != NULL);
893 och_p = &lli->lli_mds_write_och;
894 och_usecount = &lli->lli_open_fd_write_count;
896 LASSERT(lli->lli_mds_read_och != NULL);
897 och_p = &lli->lli_mds_read_och;
898 och_usecount = &lli->lli_open_fd_read_count;
901 if (*och_usecount > 1)
902 GOTO(out_unlock, rc = -EBUSY);
909 *old_handle = fd->fd_och->och_fh;
913 mutex_unlock(&lli->lli_och_mutex);
918 * Release ownership on lli_mds_*_och when putting back a file lease.
920 static int ll_lease_och_release(struct inode *inode, struct file *file)
922 struct ll_inode_info *lli = ll_i2info(inode);
923 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
924 struct obd_client_handle **och_p;
925 struct obd_client_handle *old_och = NULL;
930 mutex_lock(&lli->lli_och_mutex);
931 if (file->f_mode & FMODE_WRITE) {
932 och_p = &lli->lli_mds_write_och;
933 och_usecount = &lli->lli_open_fd_write_count;
935 och_p = &lli->lli_mds_read_och;
936 och_usecount = &lli->lli_open_fd_read_count;
939	/* The file may have been opened by another process (broken lease), so
940	 * *och_p is not NULL. In this case we should simply increase the usecount
943 if (*och_p != NULL) {
944 old_och = fd->fd_och;
951 mutex_unlock(&lli->lli_och_mutex);
954 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
960 * Acquire a lease and open the file.
962 static struct obd_client_handle *
963 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
966 struct lookup_intent it = { .it_op = IT_OPEN };
967 struct ll_sb_info *sbi = ll_i2sbi(inode);
968 struct md_op_data *op_data;
969 struct ptlrpc_request *req = NULL;
970 struct lustre_handle old_handle = { 0 };
971 struct obd_client_handle *och = NULL;
976 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
977 RETURN(ERR_PTR(-EINVAL));
980 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
981 RETURN(ERR_PTR(-EPERM));
983 rc = ll_lease_och_acquire(inode, file, &old_handle);
990 RETURN(ERR_PTR(-ENOMEM));
992 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
993 LUSTRE_OPC_ANY, NULL);
995 GOTO(out, rc = PTR_ERR(op_data));
997 /* To tell the MDT this openhandle is from the same owner */
998 op_data->op_handle = old_handle;
1000 it.it_flags = fmode | open_flags;
1001 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1002 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1003 &ll_md_blocking_lease_ast,
1004			/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
1005			 * otherwise it can be cancelled, which may mislead applications that the lease is
1007			 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
1008			 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
1009			 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1010 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1011 ll_finish_md_op_data(op_data);
1012 ptlrpc_req_finished(req);
1014 GOTO(out_release_it, rc);
1016 if (it_disposition(&it, DISP_LOOKUP_NEG))
1017 GOTO(out_release_it, rc = -ENOENT);
1019 rc = it_open_error(DISP_OPEN_OPEN, &it);
1021 GOTO(out_release_it, rc);
1023 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1024 ll_och_fill(sbi->ll_md_exp, &it, och);
1026 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1027 GOTO(out_close, rc = -EOPNOTSUPP);
1029	/* lease already acquired, handle the lease lock */
1030 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1031 if (it.it_lock_mode == 0 ||
1032 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1033		/* an open lock must be returned for a lease */
1034 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1035 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1037 GOTO(out_close, rc = -EPROTO);
1040 ll_intent_release(&it);
1044 /* Cancel open lock */
1045 if (it.it_lock_mode != 0) {
1046 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1048 it.it_lock_mode = 0;
1049 och->och_lease_handle.cookie = 0ULL;
1051 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1053 CERROR("%s: error closing file "DFID": %d\n",
1054 ll_get_fsname(inode->i_sb, NULL, 0),
1055 PFID(&ll_i2info(inode)->lli_fid), rc2);
1056 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1058 ll_intent_release(&it);
1062 RETURN(ERR_PTR(rc));
1066 * Check whether a layout swap can be done between two inodes.
1068 * \param[in] inode1 First inode to check
1069 * \param[in] inode2 Second inode to check
1071 * \retval 0 on success, layout swap can be performed between both inodes
1072 * \retval negative error code if requirements are not met
1074 static int ll_check_swap_layouts_validity(struct inode *inode1,
1075 struct inode *inode2)
1077 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1080 if (inode_permission(inode1, MAY_WRITE) ||
1081 inode_permission(inode2, MAY_WRITE))
1084 if (inode1->i_sb != inode2->i_sb)
1090 static int ll_swap_layouts_close(struct obd_client_handle *och,
1091 struct inode *inode, struct inode *inode2)
1093 const struct lu_fid *fid1 = ll_inode2fid(inode);
1094 const struct lu_fid *fid2;
1098 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1099 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1101 rc = ll_check_swap_layouts_validity(inode, inode2);
1103 GOTO(out_free_och, rc);
1105 /* We now know that inode2 is a lustre inode */
1106 fid2 = ll_inode2fid(inode2);
1108 rc = lu_fid_cmp(fid1, fid2);
1110 GOTO(out_free_och, rc = -EINVAL);
1112 /* Close the file and {swap,merge} layouts between inode & inode2.
1113 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1114 * because we still need it to pack l_remote_handle to MDT. */
1115 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1118 och = NULL; /* freed in ll_close_inode_openhandle() */
1128 * Release the lease and close the file.
1129 * It also checks whether the lease was ever broken.
1131 static int ll_lease_close_intent(struct obd_client_handle *och,
1132 struct inode *inode,
1133 bool *lease_broken, enum mds_op_bias bias,
1136 struct ldlm_lock *lock;
1137 bool cancelled = true;
1141 lock = ldlm_handle2lock(&och->och_lease_handle);
1143 lock_res_and_lock(lock);
1144 cancelled = ldlm_is_cancel(lock);
1145 unlock_res_and_lock(lock);
1146 LDLM_LOCK_PUT(lock);
1149 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1150 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1152 if (lease_broken != NULL)
1153 *lease_broken = cancelled;
1155 if (!cancelled && !bias)
1156 ldlm_cli_cancel(&och->och_lease_handle, 0);
1158	if (cancelled) { /* no need to execute intent */
1163 rc = ll_close_inode_openhandle(inode, och, bias, data);
1167 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1170 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1174 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1176 static int ll_lease_file_resync(struct obd_client_handle *och,
1177 struct inode *inode)
1179 struct ll_sb_info *sbi = ll_i2sbi(inode);
1180 struct md_op_data *op_data;
1181 __u64 data_version_unused;
1185 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1186 LUSTRE_OPC_ANY, NULL);
1187 if (IS_ERR(op_data))
1188 RETURN(PTR_ERR(op_data));
1190	/* Before starting file resync, it's necessary to clean up the page cache
1191	 * in client memory, otherwise once the layout version is increased,
1192	 * writing back cached data will be denied by the OSTs. */
1193 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1197 op_data->op_handle = och->och_lease_handle;
1198 rc = md_file_resync(sbi->ll_md_exp, op_data);
1204 ll_finish_md_op_data(op_data);
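/* Merge the size, block count and timestamps obtained from the OSTs into
 * the inode attributes that came from the MDS. */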
1208 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1210 struct ll_inode_info *lli = ll_i2info(inode);
1211 struct cl_object *obj = lli->lli_clob;
1212 struct cl_attr *attr = vvp_env_thread_attr(env);
1220 ll_inode_size_lock(inode);
1222	/* Merge the timestamps most recently obtained from the MDS with the
1223	 * timestamps obtained from the OSTs.
1225	 * Do not overwrite the inode's atime because it may be refreshed
1226	 * by the file_accessed() function. If the read was served from cached
1227	 * data, there is no RPC to be sent, so atime may not be
1228	 * transferred to the OSTs at all. The MDT only updates atime at close
1229	 * time if it's at least 'mdd.*.atime_diff' older.
1230	 * All in all, atime in Lustre does not strictly comply with
1231	 * POSIX. Solving this problem would require sending an RPC to the MDT
1232	 * for each read, which would hurt performance. */
1233 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1234 LTIME_S(inode->i_atime) = lli->lli_atime;
1235 lli->lli_update_atime = 0;
1237 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1238 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1240 atime = LTIME_S(inode->i_atime);
1241 mtime = LTIME_S(inode->i_mtime);
1242 ctime = LTIME_S(inode->i_ctime);
1244 cl_object_attr_lock(obj);
1245 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1248 rc = cl_object_attr_get(env, obj, attr);
1249 cl_object_attr_unlock(obj);
1252 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1254 if (atime < attr->cat_atime)
1255 atime = attr->cat_atime;
1257 if (ctime < attr->cat_ctime)
1258 ctime = attr->cat_ctime;
1260 if (mtime < attr->cat_mtime)
1261 mtime = attr->cat_mtime;
1263 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1264 PFID(&lli->lli_fid), attr->cat_size);
1266 i_size_write(inode, attr->cat_size);
1267 inode->i_blocks = attr->cat_blocks;
1269 LTIME_S(inode->i_atime) = atime;
1270 LTIME_S(inode->i_mtime) = mtime;
1271 LTIME_S(inode->i_ctime) = ctime;
1274 ll_inode_size_unlock(inode);
1280 * Set the designated mirror for I/O.
1282 * So far only read, write, and truncate support issuing I/O to a
1283 * designated mirror.
1285 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1287 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1289	/* clear the layout version for generic (non-resync) I/O in case it
1290	 * carries a stale layout version due to an I/O restart */
1291 io->ci_layout_version = 0;
1293 /* FLR: disable non-delay for designated mirror I/O because obviously
1294 * only one mirror is available */
1295 if (fd->fd_designated_mirror > 0) {
1297 io->ci_designated_mirror = fd->fd_designated_mirror;
1298 io->ci_layout_version = fd->fd_layout_version;
1299 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1303	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1304 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
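/* Decide whether atime updates should be suppressed for this file,
 * mirroring the checks made by the kernel's file_accessed()/touch_atime(). */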
1307 static bool file_is_noatime(const struct file *file)
1309 const struct vfsmount *mnt = file->f_path.mnt;
1310 const struct inode *inode = file_inode((struct file *)file);
1312 /* Adapted from file_accessed() and touch_atime().*/
1313 if (file->f_flags & O_NOATIME)
1316 if (inode->i_flags & S_NOATIME)
1319 if (IS_NOATIME(inode))
1322 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1325 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1328 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1334 static int ll_file_io_ptask(struct cfs_ptask *ptask);
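/* Initialize a cl_io for a read or write: set up the iocb/iov state, choose
 * the lock mode, and apply noatime, parallel-I/O and FLR mirror settings. */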
1336 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1338 struct inode *inode = file_inode(file);
1339 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1341 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1342 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1343 io->u.ci_rw.rw_file = file;
1344 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1345 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1346 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1348 if (iot == CIT_WRITE) {
1349 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1350 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1351 file->f_flags & O_DIRECT ||
1354 io->ci_obj = ll_i2info(inode)->lli_clob;
1355 io->ci_lockreq = CILR_MAYBE;
1356 if (ll_file_nolock(file)) {
1357 io->ci_lockreq = CILR_NEVER;
1358 io->ci_no_srvlock = 1;
1359 } else if (file->f_flags & O_APPEND) {
1360 io->ci_lockreq = CILR_MANDATORY;
1362 io->ci_noatime = file_is_noatime(file);
1363 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1364 io->ci_pio = !io->u.ci_rw.rw_append;
1368	/* FLR: only use non-delay I/O for read, as there is only one
1369	 * available mirror for write. */
1370 io->ci_ndelay = !(iot == CIT_WRITE);
1372 ll_io_set_mirror(io, file);
1375 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1377 struct cl_io_pt *pt = ptask->pt_cbdata;
1378 struct file *file = pt->cip_file;
1381 loff_t pos = pt->cip_pos;
1386 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1387 file_dentry(file)->d_name.name,
1388 pt->cip_iot == CIT_READ ? "read" : "write",
1389 pos, pos + pt->cip_count);
1391 env = cl_env_get(&refcheck);
1393 RETURN(PTR_ERR(env));
1395 io = vvp_env_thread_io(env);
1396 ll_io_init(io, file, pt->cip_iot);
1397 io->u.ci_rw.rw_iter = pt->cip_iter;
1398 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1399 io->ci_pio = 0; /* It's already in parallel task */
1401 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1402 pt->cip_count - pt->cip_result);
1404 struct vvp_io *vio = vvp_env_io(env);
1406 vio->vui_io_subtype = IO_NORMAL;
1407 vio->vui_fd = LUSTRE_FPRIVATE(file);
1409 ll_cl_add(file, env, io, LCC_RW);
1410 rc = cl_io_loop(env, io);
1411 ll_cl_remove(file, env);
1413 /* cl_io_rw_init() handled IO */
1417 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1423 if (io->ci_nob > 0) {
1424 pt->cip_result += io->ci_nob;
1425 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1427 pt->cip_iocb.ki_pos = pos;
1428 #ifdef HAVE_KIOCB_KI_LEFT
1429 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1430 #elif defined(HAVE_KI_NBYTES)
1431 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1435 cl_io_fini(env, io);
1436 cl_env_put(env, &refcheck);
1438 pt->cip_need_restart = io->ci_need_restart;
1440 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1441 file_dentry(file)->d_name.name,
1442 pt->cip_iot == CIT_READ ? "read" : "write",
1443 pt->cip_result, rc);
1445 RETURN(pt->cip_result > 0 ? 0 : rc);
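/* Common back end for reads and writes: build the cl_io, take the range
 * lock when needed, run cl_io_loop(), and restart the I/O when it asks for
 * a restart (e.g. layout change or FLR mirror retry). */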
1449 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1450 struct file *file, enum cl_io_type iot,
1451 loff_t *ppos, size_t count)
1453 struct range_lock range;
1454 struct vvp_io *vio = vvp_env_io(env);
1455 struct inode *inode = file_inode(file);
1456 struct ll_inode_info *lli = ll_i2info(inode);
1457 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1462 unsigned retried = 0;
1463 bool restarted = false;
1467 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1468 file_dentry(file)->d_name.name,
1469 iot == CIT_READ ? "read" : "write", pos, pos + count);
1472 io = vvp_env_thread_io(env);
1473 ll_io_init(io, file, iot);
1474 if (args->via_io_subtype == IO_NORMAL) {
1475 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1476 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1478 if (args->via_io_subtype != IO_NORMAL || restarted)
1480 io->ci_ndelay_tried = retried;
1482 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1483 bool range_locked = false;
1485 if (file->f_flags & O_APPEND)
1486 range_lock_init(&range, 0, LUSTRE_EOF);
1488 range_lock_init(&range, pos, pos + count - 1);
1490 vio->vui_fd = LUSTRE_FPRIVATE(file);
1491 vio->vui_io_subtype = args->via_io_subtype;
1493 switch (vio->vui_io_subtype) {
1495			/* Direct IO reads must also take the range lock,
1496			 * or multiple reads will try to work on the same pages.
1497			 * See LU-6227 for details. */
1498 if (((iot == CIT_WRITE) ||
1499 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1500 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1501 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1503 rc = range_lock(&lli->lli_write_tree, &range);
1507 range_locked = true;
1511 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1512 vio->u.splice.vui_flags = args->u.splice.via_flags;
1515 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1519 ll_cl_add(file, env, io, LCC_RW);
1520 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1521 !lli->lli_inode_locked) {
1523 lli->lli_inode_locked = 1;
1525 rc = cl_io_loop(env, io);
1526 if (lli->lli_inode_locked) {
1527 lli->lli_inode_locked = 0;
1528 inode_unlock(inode);
1530 ll_cl_remove(file, env);
1533 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1535 range_unlock(&lli->lli_write_tree, &range);
1538 /* cl_io_rw_init() handled IO */
1542 if (io->ci_nob > 0) {
1543 result += io->ci_nob;
1544 count -= io->ci_nob;
1546 if (args->via_io_subtype == IO_NORMAL) {
1547 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1549 /* CLIO is too complicated. See LU-11069. */
1550 if (cl_io_is_append(io))
1551 pos = io->u.ci_rw.rw_iocb.ki_pos;
1555 args->u.normal.via_iocb->ki_pos = pos;
1556 #ifdef HAVE_KIOCB_KI_LEFT
1557 args->u.normal.via_iocb->ki_left = count;
1558 #elif defined(HAVE_KI_NBYTES)
1559 args->u.normal.via_iocb->ki_nbytes = count;
1563 pos = io->u.ci_rw.rw_range.cir_pos;
1567 cl_io_fini(env, io);
1570 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1571 file->f_path.dentry->d_name.name,
1572 iot, rc, result, io->ci_need_restart);
1574 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1576 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1577 file_dentry(file)->d_name.name,
1578 iot == CIT_READ ? "read" : "write",
1579 pos, pos + count, result, rc);
1580 /* preserve the tried count for FLR */
1581 retried = io->ci_ndelay_tried;
1586 if (iot == CIT_READ) {
1588 ll_stats_ops_tally(ll_i2sbi(inode),
1589 LPROC_LL_READ_BYTES, result);
1590 } else if (iot == CIT_WRITE) {
1592 ll_stats_ops_tally(ll_i2sbi(inode),
1593 LPROC_LL_WRITE_BYTES, result);
1594 fd->fd_write_failed = false;
1595 } else if (result == 0 && rc == 0) {
1598 fd->fd_write_failed = true;
1600 fd->fd_write_failed = false;
1601 } else if (rc != -ERESTARTSYS) {
1602 fd->fd_write_failed = true;
1606 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1607 file_dentry(file)->d_name.name,
1608 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1612 RETURN(result > 0 ? result : rc);
1616 * The purpose of fast read is to overcome the per-I/O overhead and improve
1617 * IOPS, especially for small I/O.
1619 * To serve a read request, CLIO has to create and initialize a cl_io and
1620 * then request a DLM lock. This has turned out to have significant overhead
1621 * and affects the performance of small I/O dramatically.
1623 * It's not necessary to create a cl_io for each I/O. With the help of read
1624 * ahead, most of the pages being read are already in the memory cache and
1625 * we can read those pages directly: if the pages exist, the corresponding
1626 * DLM lock must exist, so the page content must be valid.
1628 * In the fast read implementation, llite speculatively finds and reads pages
1629 * in the memory cache. There are three scenarios for fast read:
1630 *   - If the page exists and is uptodate, the kernel VM will provide the
1631 *     data and CLIO won't be involved;
1632 *   - If the page was brought into memory by read ahead, it will be exported
1633 *     and the read ahead parameters will be updated;
1634 *   - Otherwise the page is not in memory and we can't do a fast read.
1635 *     Therefore it will fall back to a normal read, i.e., a cl_io will be
1636 *     created and a DLM lock will be requested.
1638 * POSIX compliance: the POSIX standard states that read is intended to be
1639 * atomic. The Lustre read implementation is in line with the Linux kernel
1640 * read implementation, and neither complies with the POSIX standard in this
1641 * matter. Fast read doesn't make the situation worse on a single node, but it
1642 * may interleave write results from multiple nodes due to the short read handling in ll_file_aio_read().
1644 * \param env - lu_env
1645 * \param iocb - kiocb from kernel
1646 * \param iter - user space buffers where the data will be copied
1648 * \retval - number of bytes read, or an error code if an error occurred.
1651 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1655 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1658	/* NB: we can't do direct IO for fast read because it would need a lock
1659	 * to make the IO engine happy. */
1660 if (iocb->ki_filp->f_flags & O_DIRECT)
1663 result = generic_file_read_iter(iocb, iter);
1665	/* If the first page is not in the cache, generic_file_aio_read() will
1666	 * return -ENODATA.
1667	 * See the corresponding code in ll_readpage(). */
1668 if (result == -ENODATA)
1672 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1673 LPROC_LL_READ_BYTES, result);
1679 * Read from a file (through the page cache).
1681 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1684 struct vvp_io_args *args;
1689 result = ll_do_fast_read(iocb, to);
1690 if (result < 0 || iov_iter_count(to) == 0)
1693 env = cl_env_get(&refcheck);
1695 return PTR_ERR(env);
1697 args = ll_env_args(env, IO_NORMAL);
1698 args->u.normal.via_iter = to;
1699 args->u.normal.via_iocb = iocb;
1701 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1702 &iocb->ki_pos, iov_iter_count(to));
1705 else if (result == 0)
1708 cl_env_put(env, &refcheck);
1714 * A trick similar to ll_do_fast_read; this improves write speed for tiny writes.
1715 * If a page is already in the page cache and dirty (and some other things;
1716 * see ll_tiny_write_begin for these rules), then we can
1717 * write to it without doing a full I/O, because Lustre already knows about it
1718 * and will write it out. This saves a lot of processing time.
1720 * All writes here are within one page, so exclusion is handled by the page
1721 * lock on the vm page. We do not do tiny writes for writes which touch
1722 * multiple pages because it's very unlikely that multiple sequential pages
1723 * are already dirty.
1725 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
1726 * common and are unlikely to be to already-dirty pages.
1728 * Attribute updates are important here, we do them in ll_tiny_write_end.
1730 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1732 ssize_t count = iov_iter_count(iter);
1733 struct file *file = iocb->ki_filp;
1734 struct inode *inode = file_inode(file);
1739	/* Restrict writes to a single page and < PAGE_SIZE. See the comment at
1740	 * the top of the function for why.
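	 * For example, with 4096-byte pages a 100-byte write at file offset 4090
	 * crosses a page boundary ((4090 & 4095) + 100 > 4096), so it is rejected
	 * here and handled by the normal write path instead.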
1742 if (count >= PAGE_SIZE ||
1743 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1746 result = __generic_file_write_iter(iocb, iter);
1748 /* If the page is not already dirty, ll_tiny_write_begin returns
1749 * -ENODATA. We continue on to normal write.
1751 if (result == -ENODATA)
1755 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1757 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1760 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1766 * Write to a file (through the page cache).
1768 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1770 struct vvp_io_args *args;
1772 ssize_t rc_tiny = 0, rc_normal;
1777 /* NB: we can't do direct IO for tiny writes because they use the page
1778 * cache, we can't do sync writes because tiny writes can't flush
1779 * pages, and we can't do append writes because we can't guarantee the
1780 * required DLM locks are held to protect file size.
1782 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1783 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1784 rc_tiny = ll_do_tiny_write(iocb, from);
1786	/* In case of error, go on and try the normal write. Only stop if the tiny
1787	 * write completed the I/O.
1789 if (iov_iter_count(from) == 0)
1790 GOTO(out, rc_normal = rc_tiny);
1792 env = cl_env_get(&refcheck);
1794 return PTR_ERR(env);
1796 args = ll_env_args(env, IO_NORMAL);
1797 args->u.normal.via_iter = from;
1798 args->u.normal.via_iocb = iocb;
1800 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1801 &iocb->ki_pos, iov_iter_count(from));
1803 /* On success, combine bytes written. */
1804 if (rc_tiny >= 0 && rc_normal > 0)
1805 rc_normal += rc_tiny;
1806 /* On error, only return error from normal write if tiny write did not
1807 * write any bytes. Otherwise return bytes written by tiny write.
1809 else if (rc_tiny > 0)
1810 rc_normal = rc_tiny;
1812 cl_env_put(env, &refcheck);
1817 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1819 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1821 static int ll_file_get_iov_count(const struct iovec *iov,
1822 unsigned long *nr_segs, size_t *count)
1827 for (seg = 0; seg < *nr_segs; seg++) {
1828 const struct iovec *iv = &iov[seg];
1831 * If any segment has a negative length, or the cumulative
1832 * length ever wraps negative then return -EINVAL.
1835 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1837 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1842 cnt -= iv->iov_len; /* This segment is no good */
1849 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1850 unsigned long nr_segs, loff_t pos)
1857 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1861 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1862 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1863 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1864 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1865 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1867 result = ll_file_read_iter(iocb, &to);
1872 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1875 struct iovec iov = { .iov_base = buf, .iov_len = count };
1880 init_sync_kiocb(&kiocb, file);
1881 kiocb.ki_pos = *ppos;
1882 #ifdef HAVE_KIOCB_KI_LEFT
1883 kiocb.ki_left = count;
1884 #elif defined(HAVE_KI_NBYTES)
1885	kiocb.ki_nbytes = count;
1888 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1889 *ppos = kiocb.ki_pos;
1895 * Write to a file (through the page cache).
1898 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1899 unsigned long nr_segs, loff_t pos)
1901 struct iov_iter from;
1906 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1910 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1911 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1912 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1913 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1914 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1916 result = ll_file_write_iter(iocb, &from);
1921 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1922 size_t count, loff_t *ppos)
1924 struct iovec iov = { .iov_base = (void __user *)buf,
1931 init_sync_kiocb(&kiocb, file);
1932 kiocb.ki_pos = *ppos;
1933 #ifdef HAVE_KIOCB_KI_LEFT
1934 kiocb.ki_left = count;
1935 #elif defined(HAVE_KI_NBYTES)
1936 kiocb.ki_nbytes = count;
1939 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1940 *ppos = kiocb.ki_pos;
1944 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1947 * Send file content (through pagecache) somewhere with helper
1949 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1950 struct pipe_inode_info *pipe, size_t count,
1954 struct vvp_io_args *args;
1959 env = cl_env_get(&refcheck);
1961 RETURN(PTR_ERR(env));
1963 args = ll_env_args(env, IO_SPLICE);
1964 args->u.splice.via_pipe = pipe;
1965 args->u.splice.via_flags = flags;
1967 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1968 cl_env_put(env, &refcheck);
1972 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1973 __u64 flags, struct lov_user_md *lum, int lum_size)
1975 struct lookup_intent oit = {
1977 .it_flags = flags | MDS_OPEN_BY_FID,
1982 ll_inode_size_lock(inode);
1983 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1985 GOTO(out_unlock, rc);
1987 ll_release_openhandle(dentry, &oit);
1990 ll_inode_size_unlock(inode);
1991 ll_intent_release(&oit);
1996 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1997 struct lov_mds_md **lmmp, int *lmm_size,
1998 struct ptlrpc_request **request)
2000 struct ll_sb_info *sbi = ll_i2sbi(inode);
2001 struct mdt_body *body;
2002 struct lov_mds_md *lmm = NULL;
2003 struct ptlrpc_request *req = NULL;
2004 struct md_op_data *op_data;
2007 rc = ll_get_default_mdsize(sbi, &lmmsize);
2011 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2012 strlen(filename), lmmsize,
2013 LUSTRE_OPC_ANY, NULL);
2014 if (IS_ERR(op_data))
2015 RETURN(PTR_ERR(op_data));
2017 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2018 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2019 ll_finish_md_op_data(op_data);
2021 CDEBUG(D_INFO, "md_getattr_name failed "
2022 "on %s: rc %d\n", filename, rc);
2026 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2027 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2029 lmmsize = body->mbo_eadatasize;
2031 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2033 GOTO(out, rc = -ENODATA);
2036 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2037 LASSERT(lmm != NULL);
2039 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2040 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2041 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2042 GOTO(out, rc = -EPROTO);
2045 * This is coming from the MDS, so is probably in
2046 * little endian. We convert it to host endian before
2047 * passing it to userspace.
2049 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2052 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2053 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2054 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2055 if (le32_to_cpu(lmm->lmm_pattern) &
2056 LOV_PATTERN_F_RELEASED)
2060		/* if the function was called for a directory, we should
2061		 * avoid swabbing non-existent lsm objects */
2062 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2063 lustre_swab_lov_user_md_v1(
2064 (struct lov_user_md_v1 *)lmm);
2065 if (S_ISREG(body->mbo_mode))
2066 lustre_swab_lov_user_md_objects(
2067 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2069 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2070 lustre_swab_lov_user_md_v3(
2071 (struct lov_user_md_v3 *)lmm);
2072 if (S_ISREG(body->mbo_mode))
2073 lustre_swab_lov_user_md_objects(
2074 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2076 } else if (lmm->lmm_magic ==
2077 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2078 lustre_swab_lov_comp_md_v1(
2079 (struct lov_comp_md_v1 *)lmm);
2085 *lmm_size = lmmsize;
2090 static int ll_lov_setea(struct inode *inode, struct file *file,
2093 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2094 struct lov_user_md *lump;
2095 int lum_size = sizeof(struct lov_user_md) +
2096 sizeof(struct lov_user_ost_data);
2100 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2103 OBD_ALLOC_LARGE(lump, lum_size);
2107 if (copy_from_user(lump, arg, lum_size))
2108 GOTO(out_lump, rc = -EFAULT);
2110 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2112 cl_lov_delay_create_clear(&file->f_flags);
2115 OBD_FREE_LARGE(lump, lum_size);
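/* Copy the file's striping (LOV) layout out to the user-space buffer via
 * cl_object_getstripe(). */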
2119 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2126 env = cl_env_get(&refcheck);
2128 RETURN(PTR_ERR(env));
2130 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2131 cl_env_put(env, &refcheck);
2135 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2138 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2139 struct lov_user_md *klum;
2141 __u64 flags = FMODE_WRITE;
2144 rc = ll_copy_user_md(lum, &klum);
2149 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2154 rc = put_user(0, &lum->lmm_stripe_count);
2158 rc = ll_layout_refresh(inode, &gen);
2162 rc = ll_file_getstripe(inode, arg, lum_size);
2164 cl_lov_delay_create_clear(&file->f_flags);
2167 OBD_FREE(klum, lum_size);
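/* Take a group lock with group id @arg on behalf of this file descriptor;
 * composite (PFL) layouts are fully instantiated first so the lock covers
 * all OST objects. */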
2172 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2174 struct ll_inode_info *lli = ll_i2info(inode);
2175 struct cl_object *obj = lli->lli_clob;
2176 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2177 struct ll_grouplock grouplock;
2182 CWARN("group id for group lock must not be 0\n");
2186 if (ll_file_nolock(file))
2187 RETURN(-EOPNOTSUPP);
2189 spin_lock(&lli->lli_lock);
2190 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2191 CWARN("group lock already existed with gid %lu\n",
2192 fd->fd_grouplock.lg_gid);
2193 spin_unlock(&lli->lli_lock);
2196 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2197 spin_unlock(&lli->lli_lock);
2200	 * XXX: the group lock needs to protect all OST objects, while PFL
2201	 * can add new OST objects during the IO, so we instantiate
2202	 * all OST objects before taking the group lock.
2207 struct cl_layout cl = {
2208 .cl_is_composite = false,
2210 struct lu_extent ext = {
2212 .e_end = OBD_OBJECT_EOF,
2215 env = cl_env_get(&refcheck);
2217 RETURN(PTR_ERR(env));
2219 rc = cl_object_layout_get(env, obj, &cl);
2220 if (!rc && cl.cl_is_composite)
2221 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2224 cl_env_put(env, &refcheck);
2229 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2230 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2234 spin_lock(&lli->lli_lock);
2235 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2236 spin_unlock(&lli->lli_lock);
2237 CERROR("another thread just won the race\n");
2238 cl_put_grouplock(&grouplock);
2242 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2243 fd->fd_grouplock = grouplock;
2244 spin_unlock(&lli->lli_lock);
2246 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2250 static int ll_put_grouplock(struct inode *inode, struct file *file,
2253 struct ll_inode_info *lli = ll_i2info(inode);
2254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2255 struct ll_grouplock grouplock;
2258 spin_lock(&lli->lli_lock);
2259 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2260 spin_unlock(&lli->lli_lock);
2261 CWARN("no group lock held\n");
2265 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2267 if (fd->fd_grouplock.lg_gid != arg) {
2268 CWARN("group lock %lu doesn't match current id %lu\n",
2269 arg, fd->fd_grouplock.lg_gid);
2270 spin_unlock(&lli->lli_lock);
2274 grouplock = fd->fd_grouplock;
2275 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2276 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2277 spin_unlock(&lli->lli_lock);
2279 cl_put_grouplock(&grouplock);
2280 CDEBUG(D_INFO, "group lock %lu released\n", arg);
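	/*
	 * Usage sketch (illustrative only, not part of the kernel code): an
	 * application takes and drops the group lock around its I/O with the
	 * same non-zero group id, e.g. assuming the Lustre user headers that
	 * define the ioctl numbers are available:
	 *
	 *	unsigned long gid = 1234;
	 *
	 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
	 *		... I/O shared by every process holding the same gid ...
	 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
	 *	}
	 */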
2285 * Close inode open handle
2287 * \param dentry [in] dentry which contains the inode
2288 * \param it [in,out] intent which contains open info and result
2291 * \retval <0 failure
2293 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2295 struct inode *inode = dentry->d_inode;
2296 struct obd_client_handle *och;
2302	/* Root? Do nothing. */
2303 if (dentry->d_inode->i_sb->s_root == dentry)
2306 /* No open handle to close? Move away */
2307 if (!it_disposition(it, DISP_OPEN_OPEN))
2310 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2312 OBD_ALLOC(och, sizeof(*och));
2314 GOTO(out, rc = -ENOMEM);
2316 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2318 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2320 /* this one is in place of ll_file_open */
2321 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2322 ptlrpc_req_finished(it->it_request);
2323 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2329 * Get the size of the inode for which the FIEMAP mapping is requested.
2330 * Make the FIEMAP get_info call and return the result.
2331 * \param fiemap	kernel buffer to hold extents
2332 * \param num_bytes kernel buffer size
2334 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2340 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2343 /* Checks for fiemap flags */
2344 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2345 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2349 /* Check for FIEMAP_FLAG_SYNC */
2350 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2351 rc = filemap_fdatawrite(inode->i_mapping);
2356 env = cl_env_get(&refcheck);
2358 RETURN(PTR_ERR(env));
2360 if (i_size_read(inode) == 0) {
2361 rc = ll_glimpse_size(inode);
2366 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2367 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2368 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2370	/* If the file size is 0, then there are no objects to map */
2371 if (fmkey.lfik_oa.o_size == 0) {
2372 fiemap->fm_mapped_extents = 0;
2376 fmkey.lfik_fiemap = *fiemap;
2378 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2379 &fmkey, fiemap, &num_bytes);
2381 cl_env_put(env, &refcheck);
2385 int ll_fid2path(struct inode *inode, void __user *arg)
2387 struct obd_export *exp = ll_i2mdexp(inode);
2388 const struct getinfo_fid2path __user *gfin = arg;
2390 struct getinfo_fid2path *gfout;
2396 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2397 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2400 /* Only need to get the buflen */
2401 if (get_user(pathlen, &gfin->gf_pathlen))
2404 if (pathlen > PATH_MAX)
2407 outsize = sizeof(*gfout) + pathlen;
2408 OBD_ALLOC(gfout, outsize);
2412 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2413 GOTO(gf_free, rc = -EFAULT);
2414	/* Append the root FID after gfout to let the MDT know the root FID so
2415	 * that it can look up the correct path; this is mainly for fileset.
2416	 * An old server without fileset mount support will ignore this. */
2417 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2419 /* Call mdc_iocontrol */
2420 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2424 if (copy_to_user(arg, gfout, outsize))
2428 OBD_FREE(gfout, outsize);
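/*
 * Usage sketch (illustrative): this ioctl is what "lfs fid2path" ends up
 * calling.  The lustreapi wrapper below and its exact signature are an
 * assumption, shown only to illustrate the FID-to-path direction of the
 * request:
 *
 *	char path[PATH_MAX];
 *	long long recno = -1;
 *	int linkno = 0;
 *
 *	rc = llapi_fid2path("/mnt/lustre", "[0x200000401:0x1:0x0]",
 *			    path, sizeof(path), &recno, &linkno);
 */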
2433 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2435 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2443 ioc->idv_version = 0;
2444 ioc->idv_layout_version = UINT_MAX;
2446	/* If no file object is initialized, we consider its version to be 0. */
2450 env = cl_env_get(&refcheck);
2452 RETURN(PTR_ERR(env));
2454 io = vvp_env_thread_io(env);
2456 io->u.ci_data_version.dv_data_version = 0;
2457 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2458 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2461 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2462 result = cl_io_loop(env, io);
2464 result = io->ci_result;
2466 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2467 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2469 cl_io_fini(env, io);
2471 if (unlikely(io->ci_need_restart))
2474 cl_env_put(env, &refcheck);
2480 * Read the data_version for inode.
2482 * This value is computed using stripe object versions on the OSTs.
2483 * The version is computed using server-side locking.
2485 * @param flags	whether to sync on the OST side;
2487 *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2488 *		LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2490 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2492 struct ioc_data_version ioc = { .idv_flags = flags };
2495 rc = ll_ioc_data_version(inode, &ioc);
2497 *data_version = ioc.idv_version;
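/*
 * Usage sketch (illustrative): LL_IOC_DATA_VERSION backs the lustreapi
 * data-version helper; the wrapper name and signature below are an
 * assumption rather than a definitive reference.  LL_DV_WR_FLUSH drops
 * cached pages so the returned version reflects what the OSTs hold:
 *
 *	__u64 dv = 0;
 *
 *	rc = llapi_get_data_version(fd, &dv, LL_DV_WR_FLUSH);
 */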
2503 * Trigger an HSM release request for the provided inode.
2505 int ll_hsm_release(struct inode *inode)
2508 struct obd_client_handle *och = NULL;
2509 __u64 data_version = 0;
2514 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2515 ll_get_fsname(inode->i_sb, NULL, 0),
2516 PFID(&ll_i2info(inode)->lli_fid));
2518 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2520 GOTO(out, rc = PTR_ERR(och));
2522 /* Grab latest data_version and [am]time values */
2523 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2527 env = cl_env_get(&refcheck);
2529 GOTO(out, rc = PTR_ERR(env));
2531 rc = ll_merge_attr(env, inode);
2532 cl_env_put(env, &refcheck);
2534	/* If an error happens, we have the wrong size for the file.
2540 /* Release the file.
2541 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2542 * we still need it to pack l_remote_handle to MDT. */
2543 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2549 if (och != NULL && !IS_ERR(och)) /* close the file */
2550 ll_lease_close(och, inode, NULL);
2555 struct ll_swap_stack {
2558 struct inode *inode1;
2559 struct inode *inode2;
2564 static int ll_swap_layouts(struct file *file1, struct file *file2,
2565 struct lustre_swap_layouts *lsl)
2567 struct mdc_swap_layouts msl;
2568 struct md_op_data *op_data;
2571 struct ll_swap_stack *llss = NULL;
2574 OBD_ALLOC_PTR(llss);
2578 llss->inode1 = file_inode(file1);
2579 llss->inode2 = file_inode(file2);
2581 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2585	/* we use 2 bools because they are easier to swap than 2 bits */
2586 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2587 llss->check_dv1 = true;
2589 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2590 llss->check_dv2 = true;
2592 /* we cannot use lsl->sl_dvX directly because we may swap them */
2593 llss->dv1 = lsl->sl_dv1;
2594 llss->dv2 = lsl->sl_dv2;
2596 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2597 if (rc == 0) /* same file, done! */
2600 if (rc < 0) { /* sequentialize it */
2601 swap(llss->inode1, llss->inode2);
2603 swap(llss->dv1, llss->dv2);
2604 swap(llss->check_dv1, llss->check_dv2);
2608 if (gid != 0) { /* application asks to flush dirty cache */
2609 rc = ll_get_grouplock(llss->inode1, file1, gid);
2613 rc = ll_get_grouplock(llss->inode2, file2, gid);
2615 ll_put_grouplock(llss->inode1, file1, gid);
2620	/* ultimate check: before swapping the layouts we check whether the
2621	 * data version has changed (if requested) */
2622 if (llss->check_dv1) {
2623 rc = ll_data_version(llss->inode1, &dv, 0);
2626 if (dv != llss->dv1)
2627 GOTO(putgl, rc = -EAGAIN);
2630 if (llss->check_dv2) {
2631 rc = ll_data_version(llss->inode2, &dv, 0);
2634 if (dv != llss->dv2)
2635 GOTO(putgl, rc = -EAGAIN);
2638	/* struct md_op_data is used to send the swap args to the MDT;
2639	 * only the flags are missing, so we pass struct mdc_swap_layouts
2640	 * through md_op_data->op_data */
2641	/* flags from user space have to be converted before they are sent to
2642	 * the server; no flag is sent today, they are only used on the client */
2645 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2646 0, LUSTRE_OPC_ANY, &msl);
2647 if (IS_ERR(op_data))
2648 GOTO(free, rc = PTR_ERR(op_data));
2650 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2651 sizeof(*op_data), op_data, NULL);
2652 ll_finish_md_op_data(op_data);
2659 ll_put_grouplock(llss->inode2, file2, gid);
2660 ll_put_grouplock(llss->inode1, file1, gid);
2670 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2672 struct md_op_data *op_data;
2676	/* Detect out-of-range masks */
2677 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2680 /* Non-root users are forbidden to set or clear flags which are
2681 * NOT defined in HSM_USER_MASK. */
2682 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2683 !cfs_capable(CFS_CAP_SYS_ADMIN))
2686	/* Detect out-of-range archive id */
2687 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2688 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2691 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2692 LUSTRE_OPC_ANY, hss);
2693 if (IS_ERR(op_data))
2694 RETURN(PTR_ERR(op_data));
2696 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2697 sizeof(*op_data), op_data, NULL);
2699 ll_finish_md_op_data(op_data);
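/*
 * Usage sketch (illustrative): this path is normally driven by
 * "lfs hsm_set" or the lustreapi HSM helpers; the wrapper call below is an
 * assumption about that API, shown only to illustrate the
 * setmask/clearmask/archive_id triple validated above:
 *
 *	rc = llapi_hsm_state_set("/mnt/lustre/file", HS_DIRTY, 0, 1);
 */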
2704 static int ll_hsm_import(struct inode *inode, struct file *file,
2705 struct hsm_user_import *hui)
2707 struct hsm_state_set *hss = NULL;
2708 struct iattr *attr = NULL;
2712 if (!S_ISREG(inode->i_mode))
2718 GOTO(out, rc = -ENOMEM);
2720 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2721 hss->hss_archive_id = hui->hui_archive_id;
2722 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2723 rc = ll_hsm_state_set(inode, hss);
2727 OBD_ALLOC_PTR(attr);
2729 GOTO(out, rc = -ENOMEM);
2731 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2732 attr->ia_mode |= S_IFREG;
2733 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2734 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2735 attr->ia_size = hui->hui_size;
2736 attr->ia_mtime.tv_sec = hui->hui_mtime;
2737 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2738 attr->ia_atime.tv_sec = hui->hui_atime;
2739 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2741 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2742 ATTR_UID | ATTR_GID |
2743 ATTR_MTIME | ATTR_MTIME_SET |
2744 ATTR_ATIME | ATTR_ATIME_SET;
2748 rc = ll_setattr_raw(file_dentry(file), attr, true);
2752 inode_unlock(inode);
2764 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2766 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2767 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
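/*
 * For example, FMODE_READ maps to LL_LEASE_RDLCK, FMODE_WRITE to
 * LL_LEASE_WRLCK, FMODE_READ | FMODE_WRITE to the OR of both bits, and any
 * other fmode yields 0.
 */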
2770 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2772 struct inode *inode = file_inode(file);
2774 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2775 ATTR_MTIME | ATTR_MTIME_SET |
2776 ATTR_CTIME | ATTR_CTIME_SET,
2778 .tv_sec = lfu->lfu_atime_sec,
2779 .tv_nsec = lfu->lfu_atime_nsec,
2782 .tv_sec = lfu->lfu_mtime_sec,
2783 .tv_nsec = lfu->lfu_mtime_nsec,
2786 .tv_sec = lfu->lfu_ctime_sec,
2787 .tv_nsec = lfu->lfu_ctime_nsec,
2793 if (!capable(CAP_SYS_ADMIN))
2796 if (!S_ISREG(inode->i_mode))
2800 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2801 inode_unlock(inode);
2806 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2809 case MODE_READ_USER:
2811 case MODE_WRITE_USER:
2818 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2820 /* Used to allow the upper layers of the client to request an LDLM lock
2821 * without doing an actual read or write.
2823 * Used for ladvise lockahead to manually request specific locks.
2825 * \param[in] file file this ladvise lock request is on
2826 * \param[in] ladvise ladvise struct describing this lock request
2828 * \retval 0 success, no detailed result available (sync requests
2829 * and requests sent to the server [not handled locally]
2830 * cannot return detailed results)
2831 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2832 * see definitions for details.
2833 * \retval negative negative errno on error
2835 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2837 struct lu_env *env = NULL;
2838 struct cl_io *io = NULL;
2839 struct cl_lock *lock = NULL;
2840 struct cl_lock_descr *descr = NULL;
2841 struct dentry *dentry = file->f_path.dentry;
2842 struct inode *inode = dentry->d_inode;
2843 enum cl_lock_mode cl_mode;
2844 off_t start = ladvise->lla_start;
2845 off_t end = ladvise->lla_end;
2851 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2852 "start=%llu, end=%llu\n", dentry->d_name.len,
2853 dentry->d_name.name, dentry->d_inode,
2854 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2857 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2859 GOTO(out, result = cl_mode);
2861 /* Get IO environment */
2862 result = cl_io_get(inode, &env, &io, &refcheck);
2866 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2869 * nothing to do for this io. This currently happens when
2870 * stripe sub-object's are not yet created.
2872 result = io->ci_result;
2873 } else if (result == 0) {
2874 lock = vvp_env_lock(env);
2875 descr = &lock->cll_descr;
2877 descr->cld_obj = io->ci_obj;
2878 /* Convert byte offsets to pages */
2879 descr->cld_start = cl_index(io->ci_obj, start);
2880 descr->cld_end = cl_index(io->ci_obj, end);
2881 descr->cld_mode = cl_mode;
2882 /* CEF_MUST is used because we do not want to convert a
2883 * lockahead request to a lockless lock */
2884 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2887 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2888 descr->cld_enq_flags |= CEF_SPECULATIVE;
2890 result = cl_lock_request(env, io, lock);
2892 /* On success, we need to release the lock */
2894 cl_lock_release(env, lock);
2896 cl_io_fini(env, io);
2897 cl_env_put(env, &refcheck);
2899 /* -ECANCELED indicates a matching lock with a different extent
2900 * was already present, and -EEXIST indicates a matching lock
2901 * on exactly the same extent was already present.
2902 * We convert them to positive values for userspace to make
2903 * recognizing true errors easier.
2904 * Note we can only return these detailed results on async requests,
2905 * as sync requests look the same as i/o requests for locking. */
2906 if (result == -ECANCELED)
2907 result = LLA_RESULT_DIFFERENT;
2908 else if (result == -EEXIST)
2909 result = LLA_RESULT_SAME;
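/*
 * Caller-side sketch (illustrative): after an async LU_LADVISE_LOCKAHEAD
 * advice, userspace reads lla_lockahead_result and treats the positive
 * values above as information rather than errors (handler names below are
 * placeholders):
 *
 *	if (advise.lla_lockahead_result == LLA_RESULT_SAME)
 *		handle_same_extent();
 *	else if (advise.lla_lockahead_result == LLA_RESULT_DIFFERENT)
 *		handle_different_extent();
 *	else if (advise.lla_lockahead_result < 0)
 *		handle_error(advise.lla_lockahead_result);
 */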
2914 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2916 static int ll_ladvise_sanity(struct inode *inode,
2917 struct llapi_lu_ladvise *ladvise)
2919 enum lu_ladvise_type advice = ladvise->lla_advice;
2920	/* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2921	 * be in the first 32 bits of enum ladvise_flags */
2922 __u32 flags = ladvise->lla_peradvice_flags;
2923 /* 3 lines at 80 characters per line, should be plenty */
2926 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2928 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2929 "last supported advice is %s (value '%d'): rc = %d\n",
2930 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2931 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2935 /* Per-advice checks */
2937 case LU_LADVISE_LOCKNOEXPAND:
2938 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2940 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2942 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2943 ladvise_names[advice], rc);
2947 case LU_LADVISE_LOCKAHEAD:
2948 /* Currently only READ and WRITE modes can be requested */
2949 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2950 ladvise->lla_lockahead_mode == 0) {
2952 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2954 ll_get_fsname(inode->i_sb, NULL, 0),
2955 ladvise->lla_lockahead_mode,
2956 ladvise_names[advice], rc);
2959 case LU_LADVISE_WILLREAD:
2960 case LU_LADVISE_DONTNEED:
2962 /* Note fall through above - These checks apply to all advices
2963 * except LOCKNOEXPAND */
2964 if (flags & ~LF_DEFAULT_MASK) {
2966 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2968 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2969 ladvise_names[advice], rc);
2972 if (ladvise->lla_start >= ladvise->lla_end) {
2974 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2975 "for %s: rc = %d\n",
2976 ll_get_fsname(inode->i_sb, NULL, 0),
2977 ladvise->lla_start, ladvise->lla_end,
2978 ladvise_names[advice], rc);
2990 * Give file access advices
2992 * The ladvise interface is similar to the Linux fadvise() system call, except
2993 * it forwards the advices directly from the Lustre client to the server. The
2994 * server-side code will apply appropriate read-ahead and caching techniques
2995 * for the corresponding files.
2997 * A typical workload for ladvise is, e.g., a bunch of different clients
2998 * doing small random reads of a file, so prefetching pages into OSS cache
2999 * with big linear reads before the random IO is a net benefit. Fetching
3000 * all that data into each client cache with fadvise() may not be, due to
3001 * much more data being sent to the client.
3003 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3004 struct llapi_lu_ladvise *ladvise)
3008 struct cl_ladvise_io *lio;
3013 env = cl_env_get(&refcheck);
3015 RETURN(PTR_ERR(env));
3017 io = vvp_env_thread_io(env);
3018 io->ci_obj = ll_i2info(inode)->lli_clob;
3020 /* initialize parameters for ladvise */
3021 lio = &io->u.ci_ladvise;
3022 lio->li_start = ladvise->lla_start;
3023 lio->li_end = ladvise->lla_end;
3024 lio->li_fid = ll_inode2fid(inode);
3025 lio->li_advice = ladvise->lla_advice;
3026 lio->li_flags = flags;
3028 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3029 rc = cl_io_loop(env, io);
3033 cl_io_fini(env, io);
3034 cl_env_put(env, &refcheck);
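/*
 * Usage sketch (illustrative): "lfs ladvise -a willread" exercises this
 * path.  The lustreapi call below is an assumption about the wrapper's
 * signature, shown only to illustrate the advice fields consumed here:
 *
 *	struct llapi_lu_ladvise advice = {
 *		.lla_advice = LU_LADVISE_WILLREAD,
 *		.lla_start  = 0,
 *		.lla_end    = 1 << 20,
 *	};
 *
 *	rc = llapi_ladvise(fd, 0, 1, &advice);
 */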
3038 static int ll_lock_noexpand(struct file *file, int flags)
3040 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3042 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3047 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3050 struct fsxattr fsxattr;
3052 if (copy_from_user(&fsxattr,
3053 (const struct fsxattr __user *)arg,
3057 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3058 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3059 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3060 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3061 if (copy_to_user((struct fsxattr __user *)arg,
3062 &fsxattr, sizeof(fsxattr)))
3068 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3072 struct md_op_data *op_data;
3073 struct ptlrpc_request *req = NULL;
3075 struct fsxattr fsxattr;
3076 struct cl_object *obj;
3080	/* only root can change the project ID */
3081 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3084 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3085 LUSTRE_OPC_ANY, NULL);
3086 if (IS_ERR(op_data))
3087 RETURN(PTR_ERR(op_data));
3089 if (copy_from_user(&fsxattr,
3090 (const struct fsxattr __user *)arg,
3092 GOTO(out_fsxattr, rc = -EFAULT);
3094 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3095 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3096 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3097 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3098 op_data->op_projid = fsxattr.fsx_projid;
3099 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3100 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3102 ptlrpc_req_finished(req);
3104 GOTO(out_fsxattr, rc);
3105 ll_update_inode_flags(inode, op_data->op_attr_flags);
3106 obj = ll_i2info(inode)->lli_clob;
3108 GOTO(out_fsxattr, rc);
3110 OBD_ALLOC_PTR(attr);
3112 GOTO(out_fsxattr, rc = -ENOMEM);
3114 attr->ia_valid = ATTR_ATTR_FLAG;
3115 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3118 ll_finish_md_op_data(op_data);
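/*
 * Usage sketch (illustrative): these handlers mirror the generic
 * FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR project-quota interface, e.g. from
 * userspace (assuming the usual struct fsxattr definition):
 *
 *	struct fsxattr fsx;
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);
 *	fsx.fsx_projid = 1000;
 *	ioctl(fd, LL_IOC_FSSETXATTR, &fsx);
 */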
3122 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3125 struct inode *inode = file_inode(file);
3126 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3127 struct ll_inode_info *lli = ll_i2info(inode);
3128 struct obd_client_handle *och = NULL;
3129 struct split_param sp;
3132 enum mds_op_bias bias = 0;
3133 struct file *layout_file = NULL;
3135 size_t data_size = 0;
3139 mutex_lock(&lli->lli_och_mutex);
3140 if (fd->fd_lease_och != NULL) {
3141 och = fd->fd_lease_och;
3142 fd->fd_lease_och = NULL;
3144 mutex_unlock(&lli->lli_och_mutex);
3147 GOTO(out, rc = -ENOLCK);
3149 fmode = och->och_flags;
3151 switch (ioc->lil_flags) {
3152 case LL_LEASE_RESYNC_DONE:
3153 if (ioc->lil_count > IOC_IDS_MAX)
3154 GOTO(out, rc = -EINVAL);
3156 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3157 OBD_ALLOC(data, data_size);
3159 GOTO(out, rc = -ENOMEM);
3161 if (copy_from_user(data, (void __user *)arg, data_size))
3162 GOTO(out, rc = -EFAULT);
3164 bias = MDS_CLOSE_RESYNC_DONE;
3166 case LL_LEASE_LAYOUT_MERGE: {
3169 if (ioc->lil_count != 1)
3170 GOTO(out, rc = -EINVAL);
3172 arg += sizeof(*ioc);
3173 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3174 GOTO(out, rc = -EFAULT);
3176 layout_file = fget(fd);
3178 GOTO(out, rc = -EBADF);
3180 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3181 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3182 GOTO(out, rc = -EPERM);
3184 data = file_inode(layout_file);
3185 bias = MDS_CLOSE_LAYOUT_MERGE;
3188 case LL_LEASE_LAYOUT_SPLIT: {
3192 if (ioc->lil_count != 2)
3193 GOTO(out, rc = -EINVAL);
3195 arg += sizeof(*ioc);
3196 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3197 GOTO(out, rc = -EFAULT);
3199 arg += sizeof(__u32);
3200 if (copy_from_user(&mirror_id, (void __user *)arg,
3202 GOTO(out, rc = -EFAULT);
3204 layout_file = fget(fdv);
3206 GOTO(out, rc = -EBADF);
3208 sp.sp_inode = file_inode(layout_file);
3209 sp.sp_mirror_id = (__u16)mirror_id;
3211 bias = MDS_CLOSE_LAYOUT_SPLIT;
3215 /* without close intent */
3219 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3223 rc = ll_lease_och_release(inode, file);
3232 switch (ioc->lil_flags) {
3233 case LL_LEASE_RESYNC_DONE:
3235 OBD_FREE(data, data_size);
3237 case LL_LEASE_LAYOUT_MERGE:
3238 case LL_LEASE_LAYOUT_SPLIT:
3245 rc = ll_lease_type_from_fmode(fmode);
3249 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3252 struct inode *inode = file_inode(file);
3253 struct ll_inode_info *lli = ll_i2info(inode);
3254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3255 struct obd_client_handle *och = NULL;
3256 __u64 open_flags = 0;
3262 switch (ioc->lil_mode) {
3263 case LL_LEASE_WRLCK:
3264 if (!(file->f_mode & FMODE_WRITE))
3266 fmode = FMODE_WRITE;
3268 case LL_LEASE_RDLCK:
3269 if (!(file->f_mode & FMODE_READ))
3273 case LL_LEASE_UNLCK:
3274 RETURN(ll_file_unlock_lease(file, ioc, arg));
3279 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3281 /* apply for lease */
3282 if (ioc->lil_flags & LL_LEASE_RESYNC)
3283 open_flags = MDS_OPEN_RESYNC;
3284 och = ll_lease_open(inode, file, fmode, open_flags);
3286 RETURN(PTR_ERR(och));
3288 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3289 rc = ll_lease_file_resync(och, inode);
3291 ll_lease_close(och, inode, NULL);
3294 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3296 ll_lease_close(och, inode, NULL);
3302 mutex_lock(&lli->lli_och_mutex);
3303 if (fd->fd_lease_och == NULL) {
3304 fd->fd_lease_och = och;
3307 mutex_unlock(&lli->lli_och_mutex);
3309		/* cannot happen for now since only exclusive leases are supported */
3310 ll_lease_close(och, inode, &lease_broken);
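/*
 * Usage sketch (illustrative): userspace requests a lease and later releases
 * it through LL_IOC_SET_LEASE with a struct ll_ioc_lease, and can query it
 * with LL_IOC_GET_LEASE; only fields already used in this file are shown:
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 *	... work under the lease ...
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 */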
3317 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3319 struct inode *inode = file_inode(file);
3320 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3324 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3325 PFID(ll_inode2fid(inode)), inode, cmd);
3326 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3328	/* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3329 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3333 case LL_IOC_GETFLAGS:
3334 /* Get the current value of the file flags */
3335 return put_user(fd->fd_flags, (int __user *)arg);
3336 case LL_IOC_SETFLAGS:
3337 case LL_IOC_CLRFLAGS:
3338 /* Set or clear specific file flags */
3339 /* XXX This probably needs checks to ensure the flags are
3340 * not abused, and to handle any flag side effects.
3342 if (get_user(flags, (int __user *) arg))
3345 if (cmd == LL_IOC_SETFLAGS) {
3346 if ((flags & LL_FILE_IGNORE_LOCK) &&
3347 !(file->f_flags & O_DIRECT)) {
3348 CERROR("%s: unable to disable locking on "
3349 "non-O_DIRECT file\n", current->comm);
3353 fd->fd_flags |= flags;
3355 fd->fd_flags &= ~flags;
3358 case LL_IOC_LOV_SETSTRIPE:
3359 case LL_IOC_LOV_SETSTRIPE_NEW:
3360 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3361 case LL_IOC_LOV_SETEA:
3362 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3363 case LL_IOC_LOV_SWAP_LAYOUTS: {
3365 struct lustre_swap_layouts lsl;
3367 if (copy_from_user(&lsl, (char __user *)arg,
3368 sizeof(struct lustre_swap_layouts)))
3371 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3374 file2 = fget(lsl.sl_fd);
3378 /* O_WRONLY or O_RDWR */
3379 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3380 GOTO(out, rc = -EPERM);
3382 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3383 struct inode *inode2;
3384 struct ll_inode_info *lli;
3385 struct obd_client_handle *och = NULL;
3387 lli = ll_i2info(inode);
3388 mutex_lock(&lli->lli_och_mutex);
3389 if (fd->fd_lease_och != NULL) {
3390 och = fd->fd_lease_och;
3391 fd->fd_lease_och = NULL;
3393 mutex_unlock(&lli->lli_och_mutex);
3395 GOTO(out, rc = -ENOLCK);
3396 inode2 = file_inode(file2);
3397 rc = ll_swap_layouts_close(och, inode, inode2);
3399 rc = ll_swap_layouts(file, file2, &lsl);
3405 case LL_IOC_LOV_GETSTRIPE:
3406 case LL_IOC_LOV_GETSTRIPE_NEW:
3407 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3408 case FS_IOC_GETFLAGS:
3409 case FS_IOC_SETFLAGS:
3410 RETURN(ll_iocontrol(inode, file, cmd, arg));
3411 case FSFILT_IOC_GETVERSION:
3412 case FS_IOC_GETVERSION:
3413 RETURN(put_user(inode->i_generation, (int __user *)arg));
3414	/* We need to special-case any other ioctls we want to handle,
3415 * to send them to the MDS/OST as appropriate and to properly
3416 * network encode the arg field. */
3417 case FS_IOC_SETVERSION:
3420 case LL_IOC_GROUP_LOCK:
3421 RETURN(ll_get_grouplock(inode, file, arg));
3422 case LL_IOC_GROUP_UNLOCK:
3423 RETURN(ll_put_grouplock(inode, file, arg));
3424 case IOC_OBD_STATFS:
3425 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3427 case LL_IOC_FLUSHCTX:
3428 RETURN(ll_flush_ctx(inode));
3429 case LL_IOC_PATH2FID: {
3430 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3431 sizeof(struct lu_fid)))
3436 case LL_IOC_GETPARENT:
3437 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3439 case OBD_IOC_FID2PATH:
3440 RETURN(ll_fid2path(inode, (void __user *)arg));
3441 case LL_IOC_DATA_VERSION: {
3442 struct ioc_data_version idv;
3445 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3448 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3449 rc = ll_ioc_data_version(inode, &idv);
3452 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3458 case LL_IOC_GET_MDTIDX: {
3461 mdtidx = ll_get_mdt_idx(inode);
3465 if (put_user((int)mdtidx, (int __user *)arg))
3470 case OBD_IOC_GETDTNAME:
3471 case OBD_IOC_GETMDNAME:
3472 RETURN(ll_get_obd_name(inode, cmd, arg));
3473 case LL_IOC_HSM_STATE_GET: {
3474 struct md_op_data *op_data;
3475 struct hsm_user_state *hus;
3482 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3483 LUSTRE_OPC_ANY, hus);
3484 if (IS_ERR(op_data)) {
3486 RETURN(PTR_ERR(op_data));
3489 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3492 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3495 ll_finish_md_op_data(op_data);
3499 case LL_IOC_HSM_STATE_SET: {
3500 struct hsm_state_set *hss;
3507 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3512 rc = ll_hsm_state_set(inode, hss);
3517 case LL_IOC_HSM_ACTION: {
3518 struct md_op_data *op_data;
3519 struct hsm_current_action *hca;
3526 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3527 LUSTRE_OPC_ANY, hca);
3528 if (IS_ERR(op_data)) {
3530 RETURN(PTR_ERR(op_data));
3533 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3536 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3539 ll_finish_md_op_data(op_data);
3543 case LL_IOC_SET_LEASE_OLD: {
3544 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3546 RETURN(ll_file_set_lease(file, &ioc, 0));
3548 case LL_IOC_SET_LEASE: {
3549 struct ll_ioc_lease ioc;
3551 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3554 RETURN(ll_file_set_lease(file, &ioc, arg));
3556 case LL_IOC_GET_LEASE: {
3557 struct ll_inode_info *lli = ll_i2info(inode);
3558 struct ldlm_lock *lock = NULL;
3561 mutex_lock(&lli->lli_och_mutex);
3562 if (fd->fd_lease_och != NULL) {
3563 struct obd_client_handle *och = fd->fd_lease_och;
3565 lock = ldlm_handle2lock(&och->och_lease_handle);
3567 lock_res_and_lock(lock);
3568 if (!ldlm_is_cancel(lock))
3569 fmode = och->och_flags;
3571 unlock_res_and_lock(lock);
3572 LDLM_LOCK_PUT(lock);
3575 mutex_unlock(&lli->lli_och_mutex);
3577 RETURN(ll_lease_type_from_fmode(fmode));
3579 case LL_IOC_HSM_IMPORT: {
3580 struct hsm_user_import *hui;
3586 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3591 rc = ll_hsm_import(inode, file, hui);
3596 case LL_IOC_FUTIMES_3: {
3597 struct ll_futimes_3 lfu;
3599 if (copy_from_user(&lfu,
3600 (const struct ll_futimes_3 __user *)arg,
3604 RETURN(ll_file_futimes_3(file, &lfu));
3606 case LL_IOC_LADVISE: {
3607 struct llapi_ladvise_hdr *k_ladvise_hdr;
3608 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3611 int alloc_size = sizeof(*k_ladvise_hdr);
3614 u_ladvise_hdr = (void __user *)arg;
3615 OBD_ALLOC_PTR(k_ladvise_hdr);
3616 if (k_ladvise_hdr == NULL)
3619 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3620 GOTO(out_ladvise, rc = -EFAULT);
3622 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3623 k_ladvise_hdr->lah_count < 1)
3624 GOTO(out_ladvise, rc = -EINVAL);
3626 num_advise = k_ladvise_hdr->lah_count;
3627 if (num_advise >= LAH_COUNT_MAX)
3628 GOTO(out_ladvise, rc = -EFBIG);
3630 OBD_FREE_PTR(k_ladvise_hdr);
3631 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3632 lah_advise[num_advise]);
3633 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3634 if (k_ladvise_hdr == NULL)
3638 * TODO: submit multiple advices to one server in a single RPC
3640 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3641 GOTO(out_ladvise, rc = -EFAULT);
3643 for (i = 0; i < num_advise; i++) {
3644 struct llapi_lu_ladvise *k_ladvise =
3645 &k_ladvise_hdr->lah_advise[i];
3646 struct llapi_lu_ladvise __user *u_ladvise =
3647 &u_ladvise_hdr->lah_advise[i];
3649 rc = ll_ladvise_sanity(inode, k_ladvise);
3651 GOTO(out_ladvise, rc);
3653 switch (k_ladvise->lla_advice) {
3654 case LU_LADVISE_LOCKNOEXPAND:
3655 rc = ll_lock_noexpand(file,
3656 k_ladvise->lla_peradvice_flags);
3657 GOTO(out_ladvise, rc);
3658 case LU_LADVISE_LOCKAHEAD:
3660 rc = ll_file_lock_ahead(file, k_ladvise);
3663 GOTO(out_ladvise, rc);
3666 &u_ladvise->lla_lockahead_result))
3667 GOTO(out_ladvise, rc = -EFAULT);
3670 rc = ll_ladvise(inode, file,
3671 k_ladvise_hdr->lah_flags,
3674 GOTO(out_ladvise, rc);
3681 OBD_FREE(k_ladvise_hdr, alloc_size);
3684 case LL_IOC_FLR_SET_MIRROR: {
3685 /* mirror I/O must be direct to avoid polluting page cache
3687 if (!(file->f_flags & O_DIRECT))
3690 fd->fd_designated_mirror = (__u32)arg;
3693 case LL_IOC_FSGETXATTR:
3694 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3695 case LL_IOC_FSSETXATTR:
3696 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3698 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3700 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3701 (void __user *)arg));
3705 #ifndef HAVE_FILE_LLSEEK_SIZE
3706 static inline loff_t
3707 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3709 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3711 if (offset > maxsize)
3714 if (offset != file->f_pos) {
3715 file->f_pos = offset;
3716 file->f_version = 0;
3722 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3723 loff_t maxsize, loff_t eof)
3725 struct inode *inode = file_inode(file);
3733 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3734 * position-querying operation. Avoid rewriting the "same"
3735 * f_pos value back to the file because a concurrent read(),
3736 * write() or lseek() might have altered it
3741 * f_lock protects against read/modify/write race with other
3742 * SEEK_CURs. Note that parallel writes and reads behave
3746 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3747 inode_unlock(inode);
3751 * In the generic case the entire file is data, so as long as
3752 * offset isn't at the end of the file then the offset is data.
3759 * There is a virtual hole at the end of the file, so as long as
3760 * offset isn't i_size or larger, return i_size.
3768 return llseek_execute(file, offset, maxsize);
3772 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3774 struct inode *inode = file_inode(file);
3775 loff_t retval, eof = 0;
3778 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3779 (origin == SEEK_CUR) ? file->f_pos : 0);
3780 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3781 PFID(ll_inode2fid(inode)), inode, retval, retval,
3783 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3785 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3786 retval = ll_glimpse_size(inode);
3789 eof = i_size_read(inode);
3792 retval = ll_generic_file_llseek_size(file, offset, origin,
3793 ll_file_maxbytes(inode), eof);
3797 static int ll_flush(struct file *file, fl_owner_t id)
3799 struct inode *inode = file_inode(file);
3800 struct ll_inode_info *lli = ll_i2info(inode);
3801 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3804 LASSERT(!S_ISDIR(inode->i_mode));
3806 /* catch async errors that were recorded back when async writeback
3807 * failed for pages in this mapping. */
3808 rc = lli->lli_async_rc;
3809 lli->lli_async_rc = 0;
3810 if (lli->lli_clob != NULL) {
3811 err = lov_read_and_clear_async_rc(lli->lli_clob);
3816	/* The application has already been told about the write failure.
3817	 * Do not report the failure again. */
3818 if (fd->fd_write_failed)
3820 return rc ? -EIO : 0;
3824 * Called to make sure a portion of file has been written out.
3825 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3827 * Return how many pages have been written.
3829 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3830 enum cl_fsync_mode mode, int ignore_layout)
3834 struct cl_fsync_io *fio;
3839 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3840 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3843 env = cl_env_get(&refcheck);
3845 RETURN(PTR_ERR(env));
3847 io = vvp_env_thread_io(env);
3848 io->ci_obj = ll_i2info(inode)->lli_clob;
3849 io->ci_ignore_layout = ignore_layout;
3851 /* initialize parameters for sync */
3852 fio = &io->u.ci_fsync;
3853 fio->fi_start = start;
3855 fio->fi_fid = ll_inode2fid(inode);
3856 fio->fi_mode = mode;
3857 fio->fi_nr_written = 0;
3859 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3860 result = cl_io_loop(env, io);
3862 result = io->ci_result;
3864 result = fio->fi_nr_written;
3865 cl_io_fini(env, io);
3866 cl_env_put(env, &refcheck);
3872 * When dentry is provided (the 'else' case), file_dentry() may be
3873 * null and dentry must be used directly rather than pulled from
3874 * file_dentry() as is done otherwise.
3877 #ifdef HAVE_FILE_FSYNC_4ARGS
3878 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3880 struct dentry *dentry = file_dentry(file);
3882 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3883 int ll_fsync(struct file *file, int datasync)
3885 struct dentry *dentry = file_dentry(file);
3887 loff_t end = LLONG_MAX;
3889 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3892 loff_t end = LLONG_MAX;
3894 struct inode *inode = dentry->d_inode;
3895 struct ll_inode_info *lli = ll_i2info(inode);
3896 struct ptlrpc_request *req;
3900 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3901 PFID(ll_inode2fid(inode)), inode);
3902 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3904 #ifdef HAVE_FILE_FSYNC_4ARGS
3905 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3906 lock_inode = !lli->lli_inode_locked;
3910 /* fsync's caller has already called _fdata{sync,write}, we want
3911 * that IO to finish before calling the osc and mdc sync methods */
3912 rc = filemap_fdatawait(inode->i_mapping);
3915 /* catch async errors that were recorded back when async writeback
3916 * failed for pages in this mapping. */
3917 if (!S_ISDIR(inode->i_mode)) {
3918 err = lli->lli_async_rc;
3919 lli->lli_async_rc = 0;
3922 if (lli->lli_clob != NULL) {
3923 err = lov_read_and_clear_async_rc(lli->lli_clob);
3929 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3933 ptlrpc_req_finished(req);
3935 if (S_ISREG(inode->i_mode)) {
3936 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3938 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3939 if (rc == 0 && err < 0)
3942 fd->fd_write_failed = true;
3944 fd->fd_write_failed = false;
3947 #ifdef HAVE_FILE_FSYNC_4ARGS
3949 inode_unlock(inode);
3955 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3957 struct inode *inode = file_inode(file);
3958 struct ll_sb_info *sbi = ll_i2sbi(inode);
3959 struct ldlm_enqueue_info einfo = {
3960 .ei_type = LDLM_FLOCK,
3961 .ei_cb_cp = ldlm_flock_completion_ast,
3962 .ei_cbdata = file_lock,
3964 struct md_op_data *op_data;
3965 struct lustre_handle lockh = { 0 };
3966 union ldlm_policy_data flock = { { 0 } };
3967 int fl_type = file_lock->fl_type;
3973 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3974 PFID(ll_inode2fid(inode)), file_lock);
3976 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3978 if (file_lock->fl_flags & FL_FLOCK) {
3979 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3980 /* flocks are whole-file locks */
3981 flock.l_flock.end = OFFSET_MAX;
3982		/* For flocks the owner is determined by the local file descriptor */
3983 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3984 } else if (file_lock->fl_flags & FL_POSIX) {
3985 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3986 flock.l_flock.start = file_lock->fl_start;
3987 flock.l_flock.end = file_lock->fl_end;
3991 flock.l_flock.pid = file_lock->fl_pid;
3993	/* Somewhat ugly workaround for svc lockd.
3994	 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3995	 * that the fl_owner is the same (which it always is on the local node,
3996	 * I guess, between lockd processes) and then compares the pid.
3997	 * As such we assign the pid to the owner field to make it all work;
3998	 * a conflict with normal locks is unlikely since the pid space and
3999	 * the pointer space for current->files do not intersect */
4000 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4001 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4005 einfo.ei_mode = LCK_PR;
4008 /* An unlock request may or may not have any relation to
4009 * existing locks so we may not be able to pass a lock handle
4010 * via a normal ldlm_lock_cancel() request. The request may even
4011 * unlock a byte range in the middle of an existing lock. In
4012 * order to process an unlock request we need all of the same
4013 * information that is given with a normal read or write record
4014 * lock request. To avoid creating another ldlm unlock (cancel)
4015 * message we'll treat a LCK_NL flock request as an unlock. */
4016 einfo.ei_mode = LCK_NL;
4019 einfo.ei_mode = LCK_PW;
4022 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4037 flags = LDLM_FL_BLOCK_NOWAIT;
4043 flags = LDLM_FL_TEST_LOCK;
4046 CERROR("unknown fcntl lock command: %d\n", cmd);
4050 /* Save the old mode so that if the mode in the lock changes we
4051 * can decrement the appropriate reader or writer refcount. */
4052 file_lock->fl_type = einfo.ei_mode;
4054 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4055 LUSTRE_OPC_ANY, NULL);
4056 if (IS_ERR(op_data))
4057 RETURN(PTR_ERR(op_data));
4059 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4060 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4061 flock.l_flock.pid, flags, einfo.ei_mode,
4062 flock.l_flock.start, flock.l_flock.end);
4064 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4067 /* Restore the file lock type if not TEST lock. */
4068 if (!(flags & LDLM_FL_TEST_LOCK))
4069 file_lock->fl_type = fl_type;
4071 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4072 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4073 !(flags & LDLM_FL_TEST_LOCK))
4074 rc2 = locks_lock_file_wait(file, file_lock);
4076 if ((file_lock->fl_flags & FL_FLOCK) &&
4077 (rc == 0 || file_lock->fl_type == F_UNLCK))
4078 rc2 = flock_lock_file_wait(file, file_lock);
4079 if ((file_lock->fl_flags & FL_POSIX) &&
4080 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4081 !(flags & LDLM_FL_TEST_LOCK))
4082 rc2 = posix_lock_file_wait(file, file_lock);
4083 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4085 if (rc2 && file_lock->fl_type != F_UNLCK) {
4086 einfo.ei_mode = LCK_NL;
4087 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4092 ll_finish_md_op_data(op_data);
4097 int ll_get_fid_by_name(struct inode *parent, const char *name,
4098 int namelen, struct lu_fid *fid,
4099 struct inode **inode)
4101 struct md_op_data *op_data = NULL;
4102 struct mdt_body *body;
4103 struct ptlrpc_request *req;
4107 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4108 LUSTRE_OPC_ANY, NULL);
4109 if (IS_ERR(op_data))
4110 RETURN(PTR_ERR(op_data));
4112 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4113 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4114 ll_finish_md_op_data(op_data);
4118 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4120 GOTO(out_req, rc = -EFAULT);
4122 *fid = body->mbo_fid1;
4125 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4127 ptlrpc_req_finished(req);
4131 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4134 struct dentry *dchild = NULL;
4135 struct inode *child_inode = NULL;
4136 struct md_op_data *op_data;
4137 struct ptlrpc_request *request = NULL;
4138 struct obd_client_handle *och = NULL;
4140 struct mdt_body *body;
4141 __u64 data_version = 0;
4142 size_t namelen = strlen(name);
4143 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4147 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4148 PFID(ll_inode2fid(parent)), name,
4149 lum->lum_stripe_offset, lum->lum_stripe_count);
4151 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4152 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4153 lustre_swab_lmv_user_md(lum);
4155 /* Get child FID first */
4156 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4159 dchild = d_lookup(file_dentry(file), &qstr);
4161 if (dchild->d_inode)
4162 child_inode = igrab(dchild->d_inode);
4167 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4176 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4177 OBD_CONNECT2_DIR_MIGRATE)) {
4178 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4179 ll_i2info(child_inode)->lli_lsm_md) {
4180 CERROR("%s: MDT doesn't support stripe directory "
4182 ll_get_fsname(parent->i_sb, NULL, 0));
4183 GOTO(out_iput, rc = -EOPNOTSUPP);
4188 * lfs migrate command needs to be blocked on the client
4189 * by checking the migrate FID against the FID of the
4192 if (child_inode == parent->i_sb->s_root->d_inode)
4193 GOTO(out_iput, rc = -EINVAL);
4195 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4196 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4197 if (IS_ERR(op_data))
4198 GOTO(out_iput, rc = PTR_ERR(op_data));
4200 inode_lock(child_inode);
4201 op_data->op_fid3 = *ll_inode2fid(child_inode);
4202 if (!fid_is_sane(&op_data->op_fid3)) {
4203 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4204 ll_get_fsname(parent->i_sb, NULL, 0), name,
4205 PFID(&op_data->op_fid3));
4206 GOTO(out_unlock, rc = -EINVAL);
4209 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4210 op_data->op_data = lum;
4211 op_data->op_data_size = lumlen;
4214 if (S_ISREG(child_inode->i_mode)) {
4215 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4219 GOTO(out_unlock, rc);
4222 rc = ll_data_version(child_inode, &data_version,
4225 GOTO(out_close, rc);
4227 op_data->op_handle = och->och_fh;
4228 op_data->op_data_version = data_version;
4229 op_data->op_lease_handle = och->och_lease_handle;
4230 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4232 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4233 och->och_mod->mod_open_req->rq_replay = 0;
4234 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4237 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4238 name, namelen, &request);
4240 LASSERT(request != NULL);
4241 ll_update_times(request, parent);
4243 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4244 LASSERT(body != NULL);
4246	/* If the server does release the layout lock, then we clean up
4247	 * the client och here, otherwise release it in out_close: */
4248 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4249 obd_mod_put(och->och_mod);
4250 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4252 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4258 if (request != NULL) {
4259 ptlrpc_req_finished(request);
4263 /* Try again if the file layout has changed. */
4264 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4269 ll_lease_close(och, child_inode, NULL);
4271 clear_nlink(child_inode);
4273 inode_unlock(child_inode);
4274 ll_finish_md_op_data(op_data);
4281 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4289 * Test if some locks matching bits and l_req_mode are acquired
4290 * - bits can be in different locks
4291 * - if found, clear the common lock bits in *bits
4292 * - the bits not found are kept in *bits
4294 * \param bits		[IN] searched lock bits
4295 * \param l_req_mode	[IN] searched lock mode
4296 * \retval boolean, true iff all bits are found
4298 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4300 struct lustre_handle lockh;
4301 union ldlm_policy_data policy;
4302 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4303 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4312 fid = &ll_i2info(inode)->lli_fid;
4313 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4314 ldlm_lockname[mode]);
4316 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4317 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4318 policy.l_inodebits.bits = *bits & (1 << i);
4319 if (policy.l_inodebits.bits == 0)
4322 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4323 &policy, mode, &lockh)) {
4324 struct ldlm_lock *lock;
4326 lock = ldlm_handle2lock(&lockh);
4329 ~(lock->l_policy_data.l_inodebits.bits);
4330 LDLM_LOCK_PUT(lock);
4332 *bits &= ~policy.l_inodebits.bits;
4339 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4340 struct lustre_handle *lockh, __u64 flags,
4341 enum ldlm_mode mode)
4343 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4348 fid = &ll_i2info(inode)->lli_fid;
4349 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4351 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4352 fid, LDLM_IBITS, &policy, mode, lockh);
4357 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4359 /* Already unlinked. Just update nlink and return success */
4360 if (rc == -ENOENT) {
4362		/* If it is a striped directory and there is a bad stripe,
4363		 * let's revalidate the dentry again, instead of returning
4365 if (S_ISDIR(inode->i_mode) &&
4366 ll_i2info(inode)->lli_lsm_md != NULL)
4369		/* This path cannot be hit for regular files unless in
4370		 * case of obscure races, so no need to validate
4372 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4374 } else if (rc != 0) {
4375 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4376 "%s: revalidate FID "DFID" error: rc = %d\n",
4377 ll_get_fsname(inode->i_sb, NULL, 0),
4378 PFID(ll_inode2fid(inode)), rc);
4384 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4386 struct inode *inode = dentry->d_inode;
4387 struct obd_export *exp = ll_i2mdexp(inode);
4388 struct lookup_intent oit = {
4391 struct ptlrpc_request *req = NULL;
4392 struct md_op_data *op_data;
4396 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4397 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4399 /* Call getattr by fid, so do not provide name at all. */
4400 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4401 LUSTRE_OPC_ANY, NULL);
4402 if (IS_ERR(op_data))
4403 RETURN(PTR_ERR(op_data));
4405 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4406 ll_finish_md_op_data(op_data);
4408 rc = ll_inode_revalidate_fini(inode, rc);
4412 rc = ll_revalidate_it_finish(req, &oit, dentry);
4414 ll_intent_release(&oit);
4418 /* Unlinked? Unhash dentry, so it is not picked up later by
4419 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4420 * here to preserve get_cwd functionality on 2.6.
4422 if (!dentry->d_inode->i_nlink) {
4423 ll_lock_dcache(inode);
4424 d_lustre_invalidate(dentry, 0);
4425 ll_unlock_dcache(inode);
4428 ll_lookup_finish_locks(&oit, dentry);
4430 ptlrpc_req_finished(req);
4435 static int ll_merge_md_attr(struct inode *inode)
4437 struct cl_attr attr = { 0 };
4440 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4441 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4442 &attr, ll_md_blocking_ast);
4446 set_nlink(inode, attr.cat_nlink);
4447 inode->i_blocks = attr.cat_blocks;
4448 i_size_write(inode, attr.cat_size);
4450 ll_i2info(inode)->lli_atime = attr.cat_atime;
4451 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4452 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4457 static inline dev_t ll_compat_encode_dev(dev_t dev)
4459 /* The compat_sys_*stat*() syscalls will fail unless the
4460 * device majors and minors are both less than 256. Note that
4461 * the value returned here will be passed through
4462 * old_encode_dev() in cp_compat_stat(). And so we are not
4463 * trying to return a valid compat (u16) device number, just
4464 * one that will pass the old_valid_dev() check. */
4466 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
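/*
 * For example, a device with major 0x123 and minor 0x456 is squashed to
 * major 0x23 / minor 0x56: no longer the original device, but enough to
 * pass the old_valid_dev() check mentioned above.
 */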
4469 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4470 int ll_getattr(const struct path *path, struct kstat *stat,
4471 u32 request_mask, unsigned int flags)
4473 struct dentry *de = path->dentry;
4475 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4478 struct inode *inode = de->d_inode;
4479 struct ll_sb_info *sbi = ll_i2sbi(inode);
4480 struct ll_inode_info *lli = ll_i2info(inode);
4483 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4485 rc = ll_inode_revalidate(de, IT_GETATTR);
4489 if (S_ISREG(inode->i_mode)) {
4490		/* In case of restore, the MDT has the right size and has
4491		 * already sent it back without granting the layout lock;
4492		 * the inode is up-to-date so glimpse is useless.
4493		 * Also, to glimpse we need the layout; in case of a running
4494		 * restore the MDT holds the layout lock so the glimpse will
4495		 * block up to the end of restore (getattr will block)
4497 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4498 rc = ll_glimpse_size(inode);
4503	/* If the object isn't a regular file then don't validate its size. */
4504 if (S_ISDIR(inode->i_mode) &&
4505 lli->lli_lsm_md != NULL) {
4506 rc = ll_merge_md_attr(inode);
4511 LTIME_S(inode->i_atime) = lli->lli_atime;
4512 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4513 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4516 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4518 if (ll_need_32bit_api(sbi)) {
4519 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4520 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4521 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4523 stat->ino = inode->i_ino;
4524 stat->dev = inode->i_sb->s_dev;
4525 stat->rdev = inode->i_rdev;
4528 stat->mode = inode->i_mode;
4529 stat->uid = inode->i_uid;
4530 stat->gid = inode->i_gid;
4531 stat->atime = inode->i_atime;
4532 stat->mtime = inode->i_mtime;
4533 stat->ctime = inode->i_ctime;
4534 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4536 stat->nlink = inode->i_nlink;
4537 stat->size = i_size_read(inode);
4538 stat->blocks = inode->i_blocks;
4543 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4544 __u64 start, __u64 len)
4548 struct fiemap *fiemap;
4549 unsigned int extent_count = fieinfo->fi_extents_max;
4551 num_bytes = sizeof(*fiemap) + (extent_count *
4552 sizeof(struct fiemap_extent));
4553 OBD_ALLOC_LARGE(fiemap, num_bytes);
4558 fiemap->fm_flags = fieinfo->fi_flags;
4559 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4560 fiemap->fm_start = start;
4561 fiemap->fm_length = len;
4562 if (extent_count > 0 &&
4563 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4564 sizeof(struct fiemap_extent)) != 0)
4565 GOTO(out, rc = -EFAULT);
4567 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4569 fieinfo->fi_flags = fiemap->fm_flags;
4570 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4571 if (extent_count > 0 &&
4572 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4573 fiemap->fm_mapped_extents *
4574 sizeof(struct fiemap_extent)) != 0)
4575 GOTO(out, rc = -EFAULT);
4577 OBD_FREE_LARGE(fiemap, num_bytes);
4581 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4583 struct ll_inode_info *lli = ll_i2info(inode);
4584 struct posix_acl *acl = NULL;
4587 spin_lock(&lli->lli_lock);
4588 /* VFS' acl_permission_check->check_acl will release the refcount */
4589 acl = posix_acl_dup(lli->lli_posix_acl);
4590 spin_unlock(&lli->lli_lock);
4595 #ifdef HAVE_IOP_SET_ACL
4596 #ifdef CONFIG_FS_POSIX_ACL
4597 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4599 struct ll_sb_info *sbi = ll_i2sbi(inode);
4600 struct ptlrpc_request *req = NULL;
4601 const char *name = NULL;
4603 size_t value_size = 0;
4608 case ACL_TYPE_ACCESS:
4609 name = XATTR_NAME_POSIX_ACL_ACCESS;
4611 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4614 case ACL_TYPE_DEFAULT:
4615 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4616 if (!S_ISDIR(inode->i_mode))
4617 rc = acl ? -EACCES : 0;
4628 value_size = posix_acl_xattr_size(acl->a_count);
4629 value = kmalloc(value_size, GFP_NOFS);
4631 GOTO(out, rc = -ENOMEM);
4633 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4635 GOTO(out_value, rc);
4638 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4639 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4640 name, value, value_size, 0, 0, &req);
4642 ptlrpc_req_finished(req);
4647 forget_cached_acl(inode, type);
4649 set_cached_acl(inode, type, acl);
4652 #endif /* CONFIG_FS_POSIX_ACL */
4653 #endif /* HAVE_IOP_SET_ACL */
4655 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4657 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4658 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4660 ll_check_acl(struct inode *inode, int mask)
4663 # ifdef CONFIG_FS_POSIX_ACL
4664 struct posix_acl *acl;
4668 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4669 if (flags & IPERM_FLAG_RCU)
4672 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4677 rc = posix_acl_permission(inode, acl, mask);
4678 posix_acl_release(acl);
4681 # else /* !CONFIG_FS_POSIX_ACL */
4683 # endif /* CONFIG_FS_POSIX_ACL */
4685 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4687 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4688 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4690 # ifdef HAVE_INODE_PERMISION_2ARGS
4691 int ll_inode_permission(struct inode *inode, int mask)
4693 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4698 struct ll_sb_info *sbi;
4699 struct root_squash_info *squash;
4700 struct cred *cred = NULL;
4701 const struct cred *old_cred = NULL;
4703 bool squash_id = false;
4706 #ifdef MAY_NOT_BLOCK
4707 if (mask & MAY_NOT_BLOCK)
4709 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4710 if (flags & IPERM_FLAG_RCU)
4714	/* as the root inode is NOT validated in the lookup operation,
4715	 * we need to do it before the permission check. */
4717 if (inode == inode->i_sb->s_root->d_inode) {
4718 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4723 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4724 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4726 /* squash fsuid/fsgid if needed */
4727 sbi = ll_i2sbi(inode);
4728 squash = &sbi->ll_squash;
4729 if (unlikely(squash->rsi_uid != 0 &&
4730 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4731 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4735 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4736 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4737 squash->rsi_uid, squash->rsi_gid);
4739 /* update the current process's credentials
4740 * and drop its FS capabilities */
4741 cred = prepare_creds();
4745 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4746 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4747 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4748 if ((1 << cap) & CFS_CAP_FS_MASK)
4749 cap_lower(cred->cap_effective, cap);
4751 old_cred = override_creds(cred);
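/*
 * With the squashed credentials in force, the permission check below runs
 * as the configured non-root identity and without filesystem capabilities;
 * revert_creds() restores the caller's credentials once the check is done.
 */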
4754 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4755 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4756 /* restore the current process's credentials and FS capabilities */
4758 revert_creds(old_cred);
4765 /* -o localflock - only provides locally consistent flock locks */
4766 struct file_operations ll_file_operations = {
4767 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4768 # ifdef HAVE_SYNC_READ_WRITE
4769 .read = new_sync_read,
4770 .write = new_sync_write,
4772 .read_iter = ll_file_read_iter,
4773 .write_iter = ll_file_write_iter,
4774 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4775 .read = ll_file_read,
4776 .aio_read = ll_file_aio_read,
4777 .write = ll_file_write,
4778 .aio_write = ll_file_aio_write,
4779 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4780 .unlocked_ioctl = ll_file_ioctl,
4781 .open = ll_file_open,
4782 .release = ll_file_release,
4783 .mmap = ll_file_mmap,
4784 .llseek = ll_file_seek,
4785 .splice_read = ll_file_splice_read,
4790 struct file_operations ll_file_operations_flock = {
4791 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4792 # ifdef HAVE_SYNC_READ_WRITE
4793 .read = new_sync_read,
4794 .write = new_sync_write,
4795 # endif /* HAVE_SYNC_READ_WRITE */
4796 .read_iter = ll_file_read_iter,
4797 .write_iter = ll_file_write_iter,
4798 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4799 .read = ll_file_read,
4800 .aio_read = ll_file_aio_read,
4801 .write = ll_file_write,
4802 .aio_write = ll_file_aio_write,
4803 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4804 .unlocked_ioctl = ll_file_ioctl,
4805 .open = ll_file_open,
4806 .release = ll_file_release,
4807 .mmap = ll_file_mmap,
4808 .llseek = ll_file_seek,
4809 .splice_read = ll_file_splice_read,
4812 .flock = ll_file_flock,
4813 .lock = ll_file_flock
4816 /* These are for -o noflock - to return ENOSYS on flock calls */
4817 struct file_operations ll_file_operations_noflock = {
4818 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4819 # ifdef HAVE_SYNC_READ_WRITE
4820 .read = new_sync_read,
4821 .write = new_sync_write,
4822 # endif /* HAVE_SYNC_READ_WRITE */
4823 .read_iter = ll_file_read_iter,
4824 .write_iter = ll_file_write_iter,
4825 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4826 .read = ll_file_read,
4827 .aio_read = ll_file_aio_read,
4828 .write = ll_file_write,
4829 .aio_write = ll_file_aio_write,
4830 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4831 .unlocked_ioctl = ll_file_ioctl,
4832 .open = ll_file_open,
4833 .release = ll_file_release,
4834 .mmap = ll_file_mmap,
4835 .llseek = ll_file_seek,
4836 .splice_read = ll_file_splice_read,
4839 .flock = ll_file_noflock,
4840 .lock = ll_file_noflock
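/*
 * Three variants of the file operations are installed depending on the
 * flock-related mount option: ll_file_operations ("-o localflock") keeps
 * flock locks local to this client, ll_file_operations_flock ("-o flock")
 * routes them through ll_file_flock so they are consistent across clients,
 * and ll_file_operations_noflock ("-o noflock") fails them with ENOSYS.
 * The three tables differ only in their .flock and .lock handlers.
 */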
4843 struct inode_operations ll_file_inode_operations = {
4844 .setattr = ll_setattr,
4845 .getattr = ll_getattr,
4846 .permission = ll_inode_permission,
4847 #ifdef HAVE_IOP_XATTR
4848 .setxattr = ll_setxattr,
4849 .getxattr = ll_getxattr,
4850 .removexattr = ll_removexattr,
4852 .listxattr = ll_listxattr,
4853 .fiemap = ll_fiemap,
4854 #ifdef HAVE_IOP_GET_ACL
4855 .get_acl = ll_get_acl,
4857 #ifdef HAVE_IOP_SET_ACL
4858 .set_acl = ll_set_acl,
4862 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4864 struct ll_inode_info *lli = ll_i2info(inode);
4865 struct cl_object *obj = lli->lli_clob;
4874 env = cl_env_get(&refcheck);
4876 RETURN(PTR_ERR(env));
4878 rc = cl_conf_set(env, lli->lli_clob, conf);
4882 if (conf->coc_opc == OBJECT_CONF_SET) {
4883 struct ldlm_lock *lock = conf->coc_lock;
4884 struct cl_layout cl = {
4888 LASSERT(lock != NULL);
4889 LASSERT(ldlm_has_layout(lock));
4891 /* the lock can only be allowed to match after the layout has been
4892 * applied to the inode; otherwise a stale layout would be
4893 * seen. Applying the layout should happen before dropping
4894 * the intent lock. */
4895 ldlm_lock_allow_match(lock);
4897 rc = cl_object_layout_get(env, obj, &cl);
4902 DFID": layout version change: %u -> %u\n",
4903 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4905 ll_layout_version_set(lli, cl.cl_layout_gen);
4909 cl_env_put(env, &refcheck);
4914 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4915 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4918 struct ll_sb_info *sbi = ll_i2sbi(inode);
4919 struct ptlrpc_request *req;
4920 struct mdt_body *body;
4927 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4928 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4929 lock->l_lvb_data, lock->l_lvb_len);
4931 if (lock->l_lvb_data != NULL)
4934 /* if the layout lock was granted right away, the layout is returned
4935 * within the DLM_LVB of the DLM reply; otherwise, if the lock was ever
4936 * blocked and then granted via a completion AST, we have to fetch the
4937 * layout here. Note that we cannot use the LVB buffer from the
4938 * completion AST because it is not large enough. */
4939 rc = ll_get_default_mdsize(sbi, &lmmsize);
4941 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4942 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4946 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4948 GOTO(out, rc = -EPROTO);
4950 lmmsize = body->mbo_eadatasize;
4951 if (lmmsize == 0) /* empty layout */
4954 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4956 GOTO(out, rc = -EFAULT);
4958 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4959 if (lvbdata == NULL)
4960 GOTO(out, rc = -ENOMEM);
4962 memcpy(lvbdata, lmm, lmmsize);
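/*
 * Hand the copied layout over to the lock as its LVB, unless another thread
 * has already installed one; once attached, the buffer belongs to the lock
 * and lives as long as the lock does.
 */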
4963 lock_res_and_lock(lock);
4964 if (unlikely(lock->l_lvb_data == NULL)) {
4965 lock->l_lvb_type = LVB_T_LAYOUT;
4966 lock->l_lvb_data = lvbdata;
4967 lock->l_lvb_len = lmmsize;
4970 unlock_res_and_lock(lock);
4973 OBD_FREE_LARGE(lvbdata, lmmsize);
4978 ptlrpc_req_finished(req);
4983 * Apply the layout to the inode. The layout lock is held and will be released before this function returns.
4986 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4987 struct inode *inode)
4989 struct ll_inode_info *lli = ll_i2info(inode);
4990 struct ll_sb_info *sbi = ll_i2sbi(inode);
4991 struct ldlm_lock *lock;
4992 struct cl_object_conf conf;
4995 bool wait_layout = false;
4998 LASSERT(lustre_handle_is_used(lockh));
5000 lock = ldlm_handle2lock(lockh);
5001 LASSERT(lock != NULL);
5002 LASSERT(ldlm_has_layout(lock));
5004 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5005 PFID(&lli->lli_fid), inode);
5007 /* in case this is a cached lock, reinstate it with the new inode */
5008 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5010 lock_res_and_lock(lock);
5011 lvb_ready = ldlm_is_lvb_ready(lock);
5012 unlock_res_and_lock(lock);
5014 /* checking lvb_ready is racy, but that is okay. The worst case is
5015 * that multiple processes may configure the file at the same time. */
5019 rc = ll_layout_fetch(inode, lock);
5023 /* for a layout lock, the lmm is stored in the lock's LVB.
5024 * lvb_data is immutable while the lock is held, so it is safe to access it without taking the resource lock.
5027 * set the layout on the file. This is unlikely to fail, as the old layout was
5028 * surely eliminated. */
5029 memset(&conf, 0, sizeof conf);
5030 conf.coc_opc = OBJECT_CONF_SET;
5031 conf.coc_inode = inode;
5032 conf.coc_lock = lock;
5033 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5034 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5035 rc = ll_layout_conf(inode, &conf);
5037 /* refresh layout failed, need to wait */
5038 wait_layout = rc == -EBUSY;
5041 LDLM_LOCK_PUT(lock);
5042 ldlm_lock_decref(lockh, mode);
5044 /* wait for IO to complete if the old layout is still in use. */
5046 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5047 ll_get_fsname(inode->i_sb, NULL, 0),
5048 PFID(&lli->lli_fid), inode);
5050 memset(&conf, 0, sizeof conf);
5051 conf.coc_opc = OBJECT_CONF_WAIT;
5052 conf.coc_inode = inode;
5053 rc = ll_layout_conf(inode, &conf);
5057 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5058 ll_get_fsname(inode->i_sb, NULL, 0),
5059 PFID(&lli->lli_fid), rc);
5065 * Issue layout intent RPC to MDS.
5066 * \param inode [in] file inode
5067 * \param intent [in] layout intent
5069 * \retval 0 on success
5070 * \retval < 0 error code
5072 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5074 struct ll_inode_info *lli = ll_i2info(inode);
5075 struct ll_sb_info *sbi = ll_i2sbi(inode);
5076 struct md_op_data *op_data;
5077 struct lookup_intent it;
5078 struct ptlrpc_request *req;
5082 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5083 0, 0, LUSTRE_OPC_ANY, NULL);
5084 if (IS_ERR(op_data))
5085 RETURN(PTR_ERR(op_data));
5087 op_data->op_data = intent;
5088 op_data->op_data_size = sizeof(*intent);
5090 memset(&it, 0, sizeof(it));
5091 it.it_op = IT_LAYOUT;
5092 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5093 intent->li_opc == LAYOUT_INTENT_TRUNC)
5094 it.it_flags = FMODE_WRITE;
5096 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5097 ll_get_fsname(inode->i_sb, NULL, 0),
5098 PFID(&lli->lli_fid), inode);
5100 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5101 &ll_md_blocking_ast, 0);
5102 if (it.it_request != NULL)
5103 ptlrpc_req_finished(it.it_request);
5104 it.it_request = NULL;
5106 ll_finish_md_op_data(op_data);
5108 /* set lock data in case this is a new lock */
5110 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5112 ll_intent_drop_lock(&it);
5118 * This function checks whether a LAYOUT lock exists on the client side,
5119 * and enqueues one if none is cached.
5121 * This function does not keep the layout lock held, so the layout may be
5122 * revoked at any time after this function returns; any operation that depends on the layout should be redone in that case.
5125 * This function should be called before lov_io_init() to get an up-to-date
5126 * layout version; the caller should save the version number, and after the IO
5127 * is finished call this function again to verify that the layout was not
5128 * changed during the IO (an illustrative call pattern is sketched in the comment after this function).
5130 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5132 struct ll_inode_info *lli = ll_i2info(inode);
5133 struct ll_sb_info *sbi = ll_i2sbi(inode);
5134 struct lustre_handle lockh;
5135 struct layout_intent intent = {
5136 .li_opc = LAYOUT_INTENT_ACCESS,
5138 enum ldlm_mode mode;
5142 *gen = ll_layout_version_get(lli);
5143 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5147 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5148 LASSERT(S_ISREG(inode->i_mode));
5150 /* take layout lock mutex to enqueue layout lock exclusively. */
5151 mutex_lock(&lli->lli_layout_mutex);
5154 /* the layout lock is usually cached on the local side, so try to
5155 * match it before grabbing the layout lock mutex. */
5156 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5157 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5158 if (mode != 0) { /* hit cached lock */
5159 rc = ll_layout_lock_set(&lockh, mode, inode);
5165 rc = ll_layout_intent(inode, &intent);
5171 *gen = ll_layout_version_get(lli);
5172 mutex_unlock(&lli->lli_layout_mutex);
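/*
 * Illustrative call pattern (a sketch only; my_do_io() is a hypothetical
 * helper, not part of this file): sample the layout generation before
 * starting IO and check it again afterwards, redoing the IO if the layout
 * changed in between.
 *
 *	__u32 gen, gen2;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen);
 *	if (rc == 0)
 *		rc = my_do_io(inode, pos, count);
 *	if (rc == 0) {
 *		rc = ll_layout_refresh(inode, &gen2);
 *		if (rc == 0 && gen2 != gen)
 *			rc = -EAGAIN;
 *	}
 */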
5178 * Issue a layout intent RPC indicating where in a file an IO is about to write.
5180 * \param[in] inode file inode.
5181 * \param[in] ext write range, with the start offset in the file (in bytes)
5182 * where the IO is about to write, and the exclusive end offset in bytes.
5185 * \retval 0 on success
5186 * \retval < 0 error code
5188 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5189 struct lu_extent *ext)
5191 struct layout_intent intent = {
5193 .li_extent.e_start = ext->e_start,
5194 .li_extent.e_end = ext->e_end,
5199 rc = ll_layout_intent(inode, &intent);
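/*
 * Advertising the write range ahead of the IO lets the MDT prepare the
 * layout for just that range; for composite (PFL) layouts this typically
 * means instantiating only the components the write will touch.
 */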
5205 * This function sends a restore request to the MDT.
5207 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5209 struct hsm_user_request *hur;
5213 len = sizeof(struct hsm_user_request) +
5214 sizeof(struct hsm_user_item);
5215 OBD_ALLOC(hur, len);
5219 hur->hur_request.hr_action = HUA_RESTORE;
5220 hur->hur_request.hr_archive_id = 0;
5221 hur->hur_request.hr_flags = 0;
5222 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5223 sizeof(hur->hur_user_item[0].hui_fid));
5224 hur->hur_user_item[0].hui_extent.offset = offset;
5225 hur->hur_user_item[0].hui_extent.length = length;
5226 hur->hur_request.hr_itemcount = 1;
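/*
 * Submit a single-item HSM RESTORE request for this file's FID covering
 * the given byte range; the HSM coordinator on the MDT then drives the
 * actual restore from the archive.
 */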
5227 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,