4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data descriptor from its slab cache.
 * GFP_NOFS is used so the allocation cannot recurse back into the
 * filesystem under memory pressure.
 * NOTE(review): intermediate lines (NULL check / return) are elided in
 * this excerpt; presumably returns NULL on allocation failure — confirm
 * against the full source. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-error state. */
75 fd->fd_write_failed = false;
/* Release a ll_file_data descriptor back to its slab cache.
 * Counterpart of ll_file_data_get(). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the current VFS inode attributes so the MDT sees the final
 * client-side state at close time. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute as valid; *_SET variants tell the server
 * to use the supplied timestamps verbatim rather than its own clock. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
/* Translate in-kernel inode flags to the on-wire (ext-style) flag set. */
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
108 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
/* Identify which open handle on the MDT this close refers to. */
109 op_data->op_handle = och->och_fh;
/* test_and_clear: the dirty hint is consumed here so it is reported at
 * most once per modification cycle. */
111 if (och->och_flags & FMODE_WRITE &&
112 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
113 /* For HSM: if inode data has been modified, pack it so that
114 * MDT can set data dirty flag in the archive. */
115 op_data->op_bias |= MDS_DATA_MODIFIED;
121 * Perform a close, possibly with a bias.
122 * The meaning of "data" depends on the value of "bias".
124 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
125 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
128 static int ll_close_inode_openhandle(struct inode *inode,
129 struct obd_client_handle *och,
130 enum mds_op_bias bias, void *data)
132 struct obd_export *md_exp = ll_i2mdexp(inode);
133 const struct ll_inode_info *lli = ll_i2info(inode);
134 struct md_op_data *op_data;
135 struct ptlrpc_request *req = NULL;
/* Guard against a torn-down MDC connection (e.g. during unmount). */
139 if (class_exp2obd(md_exp) == NULL) {
140 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
141 ll_get_fsname(inode->i_sb, NULL, 0),
142 PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
147 /* We leak openhandle and request here on error, but not much to be
148 * done in OOM case since app won't retry close on error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
/* Per-bias packing of the close intent.  NOTE(review): switch head is
 * elided in this excerpt. */
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough — MERGE presumably shares the SPLIT/SWAP packing below;
 * confirm against the full source. */
158 case MDS_CLOSE_LAYOUT_SPLIT:
159 case MDS_CLOSE_LAYOUT_SWAP: {
/* For SPLIT, @data is a struct split_param; for SWAP/MERGE it is the
 * victim inode itself — see the function comment above. */
160 struct split_param *sp = data;
162 LASSERT(data != NULL);
163 op_data->op_bias |= bias;
164 op_data->op_data_version = 0;
165 op_data->op_lease_handle = och->och_lease_handle;
166 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
167 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
168 op_data->op_mirror_id = sp->sp_mirror_id;
170 op_data->op_fid2 = *ll_inode2fid(data);
175 case MDS_CLOSE_RESYNC_DONE: {
176 struct ll_ioc_lease *ioc = data;
178 LASSERT(data != NULL);
/* Scale the block count by the number of resynced mirrors so the
 * reported allocation covers all replicas. */
179 op_data->op_attr_blocks +=
180 ioc->lil_count * op_data->op_attr_blocks;
181 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
182 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
184 op_data->op_lease_handle = och->och_lease_handle;
/* Ship the resynced mirror-id array inline with the close RPC. */
185 op_data->op_data = &ioc->lil_ids[0];
186 op_data->op_data_size =
187 ioc->lil_count * sizeof(ioc->lil_ids[0]);
191 case MDS_HSM_RELEASE:
192 LASSERT(data != NULL);
193 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version sampled before release; the MDT uses it to
 * detect concurrent modification. */
194 op_data->op_data_version = *(__u64 *)data;
195 op_data->op_lease_handle = och->och_lease_handle;
196 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default (plain close): no auxiliary data expected. */
200 LASSERT(data == NULL);
/* If size/blocks were not explicitly set above, tell the MDT to take
 * the "large" (OST-derived) values instead of the packed ones. */
204 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
205 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
206 if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
207 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
209 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
210 if (rc != 0 && rc != -EINTR)
211 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
212 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On success of a biased close, verify the server actually executed
 * the close intent (it may legitimately decline). */
214 if (rc == 0 && op_data->op_bias & bias) {
215 struct mdt_body *body;
217 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
218 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
222 ll_finish_md_op_data(op_data);
/* The handle is dead from here on: drop replay data and poison the
 * cookie to catch accidental reuse. */
226 md_clear_open_replay_data(md_exp, och);
227 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
230 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle matching @fmode (one of write/exec/read)
 * once its last user is gone.  Handle selection and the usecount check
 * are serialized by lli_och_mutex. */
234 int ll_md_real_close(struct inode *inode, fmode_t fmode)
236 struct ll_inode_info *lli = ll_i2info(inode);
237 struct obd_client_handle **och_p;
238 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its reference counter. */
243 if (fmode & FMODE_WRITE) {
244 och_p = &lli->lli_mds_write_och;
245 och_usecount = &lli->lli_open_fd_write_count;
246 } else if (fmode & FMODE_EXEC) {
247 och_p = &lli->lli_mds_exec_och;
248 och_usecount = &lli->lli_open_fd_exec_count;
250 LASSERT(fmode & FMODE_READ);
251 och_p = &lli->lli_mds_read_och;
252 och_usecount = &lli->lli_open_fd_read_count;
255 mutex_lock(&lli->lli_och_mutex);
256 if (*och_usecount > 0) {
257 /* There are still users of this handle, so skip
259 mutex_unlock(&lli->lli_och_mutex);
265 mutex_unlock(&lli->lli_och_mutex);
268 /* There might be a race and this handle may already
/* Plain close: no bias, no auxiliary data. */
270 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock and lease state attached to
 * this file descriptor, decrement the per-mode open count, and close
 * the MDS handle unless a cached OPEN lock lets us skip the RPC.
 * Finally frees the ll_file_data. */
276 static int ll_md_close(struct inode *inode, struct file *file)
/* Lock-match policy: look for an OPEN ibit lock already granted. */
278 union ldlm_policy_data policy = {
279 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching lock, do not take a reference. */
281 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
282 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
283 struct ll_inode_info *lli = ll_i2info(inode);
284 struct lustre_handle lockh;
285 enum ldlm_mode lockmode;
289 /* clear group lock, if present */
290 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
291 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
293 if (fd->fd_lease_och != NULL) {
296 /* Usually the lease is not released when the
297 * application crashed, we need to release here. */
298 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
299 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
300 PFID(&lli->lli_fid), rc, lease_broken);
302 fd->fd_lease_och = NULL;
/* fd_och holds a reopened handle taken over for a lease; close it now. */
305 if (fd->fd_och != NULL) {
306 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
311 /* Let's see if we have good enough OPEN lock on the file and if
312 we can skip talking to MDS */
313 mutex_lock(&lli->lli_och_mutex);
/* Drop this fd's contribution to the per-mode open count.
 * NOTE(review): lockmode assignments per branch are elided here. */
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock of the right mode -> must do a real MDS close. */
329 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode, &lockh))
331 rc = ll_md_real_close(inode, fd->fd_omode);
/* Detach and free the per-open private data. */
334 LUSTRE_FPRIVATE(file) = NULL;
335 ll_file_data_put(fd);
340 /* While this returns an error code, fput() the caller does not, so we need
341 * to make every effort to clean up all of our state here. Also, applications
342 * rarely check close errors and even if an error is returned they will not
343 * re-try the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
/* Do not count releases of the filesystem root in stats. */
356 if (inode->i_sb->s_root != file_dentry(file))
357 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
358 fd = LUSTRE_FPRIVATE(file);
361 /* The last ref on @file, maybe not the the owner pid of statahead,
362 * because parent and child process can share the same file handle. */
363 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
364 ll_deauthorize_statahead(inode, fd);
/* Root dentry short-circuit: just free the fd, no MDS close needed. */
366 if (inode->i_sb->s_root == file_dentry(file)) {
367 LUSTRE_FPRIVATE(file) = NULL;
368 ll_file_data_put(fd);
/* For regular files, fold any pending async write errors into this
 * close's return code so the application can see them. */
372 if (!S_ISDIR(inode->i_mode)) {
373 if (lli->lli_clob != NULL)
374 lov_read_and_clear_async_rc(lli->lli_clob);
375 lli->lli_async_rc = 0;
378 rc = ll_md_close(inode, file);
/* Fault-injection hook for debug-log dumping. */
380 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
381 libcfs_debug_dumplog();
/* read_cache_page() callback: fill @page from the inline Data-on-MDT
 * buffer described by @data (a struct niobuf_local).  Copies lnb_len
 * bytes and zero-fills the tail of a partial page, then marks the page
 * up to date. */
386 static inline int ll_dom_readpage(void *data, struct page *page)
388 struct niobuf_local *lnb = data;
391 kaddr = ll_kmap_atomic(page, KM_USER0);
392 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial page: clear the remainder so no stale data leaks to users. */
393 if (lnb->lnb_len < PAGE_SIZE)
394 memset(kaddr + lnb->lnb_len, 0,
395 PAGE_SIZE - lnb->lnb_len);
/* Keep D-cache coherent on architectures that need explicit flushes. */
396 flush_dcache_page(page);
397 SetPageUptodate(page);
398 ll_kunmap_atomic(kaddr, KM_USER0);
/* After an open that returned inline Data-on-MDT file data in @req,
 * populate the page cache from that buffer so the first read needs no
 * extra RPC.  Only done when the open granted a DOM lock. */
404 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
405 struct lookup_intent *it)
407 struct ll_inode_info *lli = ll_i2info(inode);
408 struct cl_object *obj = lli->lli_clob;
409 struct address_space *mapping = inode->i_mapping;
411 struct niobuf_remote *rnb;
416 struct lustre_handle lockh;
417 struct ldlm_lock *lock;
418 unsigned long index, start;
419 struct niobuf_local lnb;
421 bool dom_lock = false;
/* Check whether the intent lock taken at open time covers DOM data. */
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
439 env = cl_env_get(&refcheck);
/* Server may not have packed an inline buffer at all. */
443 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
445 GOTO(out_env, rc = -ENODATA);
447 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
/* Payload immediately follows the niobuf_remote descriptor. */
448 data = (char *)rnb + sizeof(*rnb);
450 if (rnb == NULL || rnb->rnb_len == 0)
451 GOTO(out_env, rc = 0);
453 CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
454 rnb->rnb_len, i_size_read(inode));
/* A CIT_MISC io context is enough: we only create/export cl_pages. */
456 io = vvp_env_thread_io(env);
458 io->ci_ignore_layout = 1;
459 rc = cl_io_init(env, io, CIT_MISC, obj);
463 lnb.lnb_file_offset = rnb->rnb_offset;
464 start = lnb.lnb_file_offset / PAGE_SIZE;
/* Buffer is expected to be page-aligned on the wire. */
466 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
467 lnb.lnb_page_offset = 0;
/* Walk the inline buffer one page at a time (index is pages into the
 * buffer, start is the first page-cache index). */
471 lnb.lnb_data = data + (index << PAGE_SHIFT);
472 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
473 if (lnb.lnb_len > PAGE_SIZE)
474 lnb.lnb_len = PAGE_SIZE;
/* read_cache_page() invokes ll_dom_readpage() to fill each page. */
476 vmpage = read_cache_page(mapping, index + start,
477 ll_dom_readpage, &lnb);
478 if (IS_ERR(vmpage)) {
479 CWARN("%s: cannot fill page %lu for "DFID
480 " with data: rc = %li\n",
481 ll_get_fsname(inode->i_sb, NULL, 0),
482 index + start, PFID(lu_object_fid(&obj->co_lu)),
/* Attach a cl_page and export it so clio layers see it as cached. */
487 clp = cl_page_find(env, obj, vmpage->index, vmpage,
492 GOTO(out_io, rc = PTR_ERR(clp));
496 cl_page_export(env, clp, 1);
497 cl_page_put(env, clp);
501 } while (rnb->rnb_len > (index << PAGE_SHIFT));
507 cl_env_put(env, &refcheck);
/* Send an intent-OPEN to the MDS for @de (open-by-FID path), optionally
 * packing the dentry name for servers without OBD_CONNECT_OPEN_BY_FID.
 * On success initializes the inode from the reply and finishes DOM data
 * prefetch; maps a lost-race ENOENT on create to -ESTALE so the VFS
 * retries the lookup. */
510 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
511 struct lookup_intent *itp)
513 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
514 struct dentry *parent = de->d_parent;
515 const char *name = NULL;
517 struct md_op_data *op_data;
518 struct ptlrpc_request *req = NULL;
522 LASSERT(parent != NULL);
523 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
525 /* if server supports open-by-fid, or file name is invalid, don't pack
526 * name in open request */
527 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
528 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
529 name = de->d_name.name;
530 len = de->d_name.len;
533 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
534 name, len, 0, LUSTRE_OPC_ANY, NULL);
536 RETURN(PTR_ERR(op_data));
/* Pass striping metadata (if any) along with the open. */
537 op_data->op_data = lmm;
538 op_data->op_data_size = lmmsize;
540 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
541 &ll_md_blocking_ast, 0);
542 ll_finish_md_op_data(op_data);
544 /* reason for keep own exit path - don`t flood log
545 * with messages with -ESTALE errors.
/* If the server opened a handle we cannot use, release it to avoid a
 * leaked open on the MDT. */
547 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
548 it_open_error(DISP_OPEN_OPEN, itp))
550 ll_release_openhandle(de, itp);
554 if (it_disposition(itp, DISP_LOOKUP_NEG))
555 GOTO(out, rc = -ENOENT);
557 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
558 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
559 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Instantiate/refresh the inode from the reply body. */
563 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
565 if (!rc && itp->it_lock_mode) {
/* Prefetch any inline DOM data and record the granted lock. */
566 ll_dom_finish_open(de->d_inode, req, itp);
567 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
571 ptlrpc_req_finished(req);
572 ll_intent_drop_lock(itp);
574 /* We did open by fid, but by the time we got to the server,
575 * the object disappeared. If this is a create, we cannot really
576 * tell the userspace that the file it was trying to create
577 * does not exist. Instead let's return -ESTALE, and the VFS will
578 * retry the create with LOOKUP_REVAL that we are going to catch
579 * in ll_revalidate_dentry() and use lookup then.
581 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Initialize an obd_client_handle from the MDT reply body carried by
 * intent @it, then register it for open replay so the handle survives
 * MDS recovery.  Returns md_set_open_replay_data()'s result. */
587 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
588 struct obd_client_handle *och)
590 struct mdt_body *body;
592 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
/* Server-assigned open handle and FID for this open. */
593 och->och_fh = body->mbo_handle;
594 och->och_fid = body->mbo_fid1;
/* Lease handle doubles as the intent lock handle for lease opens. */
595 och->och_lease_handle.cookie = it->it_lock_handle;
596 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
597 och->och_flags = it->it_flags;
599 return md_set_open_replay_data(md_exp, och, it);
/* Complete the client-local part of an open: optionally fill @och from
 * the intent reply, then attach @fd to the struct file and initialize
 * its readahead and cl_io bookkeeping. */
602 static int ll_local_open(struct file *file, struct lookup_intent *it,
603 struct ll_file_data *fd, struct obd_client_handle *och)
605 struct inode *inode = file_inode(file);
/* The file must not already carry private data (double-open guard). */
608 LASSERT(!LUSTRE_FPRIVATE(file));
/* och may be NULL when an existing MDS handle is being reused. */
615 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
620 LUSTRE_FPRIVATE(file) = fd;
621 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits of the open flags. */
622 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
624 /* ll_cl_context initialize */
625 rwlock_init(&fd->fd_lock);
626 INIT_LIST_HEAD(&fd->fd_lccs);
631 /* Open a file, and (for the very first open) create objects on the OSTs at
632 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
633 * creation or open until ll_lov_setstripe() ioctl is called.
635 * If we already have the stripe MD locally then we don't request it in
636 * md_open(), by passing a lmm_size = 0.
638 * It is up to the application to ensure no other processes open this file
639 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
640 * used. We might be able to avoid races of that sort by getting lli_open_sem
641 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
642 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
644 int ll_file_open(struct inode *inode, struct file *file)
646 struct ll_inode_info *lli = ll_i2info(inode);
647 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
648 .it_flags = file->f_flags };
649 struct obd_client_handle **och_p = NULL;
650 __u64 *och_usecount = NULL;
651 struct ll_file_data *fd;
655 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
656 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path; consume it. */
658 it = file->private_data; /* XXX: compat macro */
659 file->private_data = NULL; /* prevent ll_local_open assertion */
661 fd = ll_file_data_get();
663 GOTO(out_nofiledata, rc = -ENOMEM);
666 if (S_ISDIR(inode->i_mode))
667 ll_authorize_statahead(inode, fd);
/* Opening the filesystem root needs no MDS open. */
669 if (inode->i_sb->s_root == file_dentry(file)) {
670 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
674 if (!it || !it->it_disposition) {
675 /* Convert f_flags into access mode. We cannot use file->f_mode,
676 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: +1 turns the 0..2 accmode into FMODE_READ/WRITE
 * bits (standard kernel OPEN_FMODE idiom). */
678 if ((oit.it_flags + 1) & O_ACCMODE)
680 if (file->f_flags & O_TRUNC)
681 oit.it_flags |= FMODE_WRITE;
683 /* kernel only call f_op->open in dentry_open. filp_open calls
684 * dentry_open after call to open_namei that checks permissions.
685 * Only nfsd_open call dentry_open directly without checking
686 * permissions and because of that this code below is safe. */
687 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
688 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
690 /* We do not want O_EXCL here, presumably we opened the file
691 * already? XXX - NFS implications? */
692 oit.it_flags &= ~O_EXCL;
694 /* bug20584, if "it_flags" contains O_CREAT, the file will be
695 * created if necessary, then "IT_CREAT" should be set to keep
696 * consistent with it */
697 if (oit.it_flags & O_CREAT)
698 oit.it_op |= IT_CREAT;
704 /* Let's see if we have file open on MDS already. */
705 if (it->it_flags & FMODE_WRITE) {
706 och_p = &lli->lli_mds_write_och;
707 och_usecount = &lli->lli_open_fd_write_count;
708 } else if (it->it_flags & FMODE_EXEC) {
709 och_p = &lli->lli_mds_exec_och;
710 och_usecount = &lli->lli_open_fd_exec_count;
712 och_p = &lli->lli_mds_read_och;
713 och_usecount = &lli->lli_open_fd_read_count;
716 mutex_lock(&lli->lli_och_mutex);
717 if (*och_p) { /* Open handle is present */
718 if (it_disposition(it, DISP_OPEN_OPEN)) {
719 /* Well, there's extra open request that we do not need,
720 let's close it somehow. This will decref request. */
721 rc = it_open_error(DISP_OPEN_OPEN, it);
723 mutex_unlock(&lli->lli_och_mutex);
724 GOTO(out_openerr, rc);
727 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS handle: och == NULL tells ll_local_open
 * not to fill a new one. */
731 rc = ll_local_open(file, it, fd, NULL);
734 mutex_unlock(&lli->lli_och_mutex);
735 GOTO(out_openerr, rc);
/* No handle cached for this mode yet. */
738 LASSERT(*och_usecount == 0);
739 if (!it->it_disposition) {
740 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
741 /* We cannot just request lock handle now, new ELC code
742 means that one of other OPEN locks for this file
743 could be cancelled, and since blocking ast handler
744 would attempt to grab och_mutex as well, that would
745 result in a deadlock */
746 mutex_unlock(&lli->lli_och_mutex);
748 * Normally called under two situations:
750 * 2. A race/condition on MDS resulting in no open
751 * handle to be returned from LOOKUP|OPEN request,
752 * for example if the target entry was a symlink.
754 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
755 * marked by a bit set in ll_iget_for_nfs. Clear the
756 * bit so that it's not confusing later callers.
758 * NB; when ldd is NULL, it must have come via normal
759 * lookup path only, since ll_iget_for_nfs always calls
762 if (ldd && ldd->lld_nfs_dentry) {
763 ldd->lld_nfs_dentry = 0;
764 it->it_flags |= MDS_OPEN_LOCK;
768 * Always specify MDS_OPEN_BY_FID because we don't want
769 * to get file with different fid.
771 it->it_flags |= MDS_OPEN_BY_FID;
772 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
775 GOTO(out_openerr, rc);
/* Allocate the cached handle slot for this open mode. */
779 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
781 GOTO(out_och_free, rc = -ENOMEM);
785 /* md_intent_lock() didn't get a request ref if there was an
786 * open error, so don't do cleanup on the request here
788 /* XXX (green): Should not we bail out on any error here, not
789 * just open error? */
790 rc = it_open_error(DISP_OPEN_OPEN, it);
792 GOTO(out_och_free, rc);
794 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
795 "inode %p: disposition %x, status %d\n", inode,
796 it_disposition(it, ~0), it->it_status);
798 rc = ll_local_open(file, it, fd, *och_p);
800 GOTO(out_och_free, rc);
802 mutex_unlock(&lli->lli_och_mutex);
805 /* Must do this outside lli_och_mutex lock to prevent deadlock where
806 different kind of OPEN lock for this same inode gets cancelled
807 by ldlm_cancel_lru */
808 if (!S_ISREG(inode->i_mode))
809 GOTO(out_och_free, rc);
811 cl_lov_delay_create_clear(&file->f_flags);
812 GOTO(out_och_free, rc);
/* Error path: free a half-initialized handle slot. */
816 if (och_p && *och_p) {
817 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
818 *och_p = NULL; /* OBD_FREE writes some magic there */
821 mutex_unlock(&lli->lli_och_mutex);
824 if (lli->lli_opendir_key == fd)
825 ll_deauthorize_statahead(inode, fd);
827 ll_file_data_put(fd);
829 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the enqueue reference held on the intent's reply request. */
833 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
834 ptlrpc_req_finished(it->it_request);
835 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously.  Unlike ll_md_blocking_ast this deliberately does NOT
 * manage open handles — see the comment in ll_lease_open(). */
841 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
842 struct ldlm_lock_desc *desc, void *data, int flag)
845 struct lustre_handle lockh;
849 case LDLM_CB_BLOCKING:
850 ldlm_lock2handle(lock, &lockh);
/* Async cancel: the lease is simply broken, nothing to flush. */
851 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
853 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
857 case LDLM_CB_CANCELING:
865 * When setting a lease on a file, we take ownership of the lli_mds_*_och
866 * and save it as fd->fd_och so as to force client to reopen the file even
867 * if it has an open lock in cache already.
869 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
870 struct lustre_handle *old_handle)
872 struct ll_inode_info *lli = ll_i2info(inode);
873 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
874 struct obd_client_handle **och_p;
879 /* Get the openhandle of the file */
880 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
881 if (fd->fd_lease_och != NULL)
882 GOTO(out_unlock, rc = -EBUSY);
884 if (fd->fd_och == NULL) {
/* Pick the cached handle matching this fd's access mode. */
885 if (file->f_mode & FMODE_WRITE) {
886 LASSERT(lli->lli_mds_write_och != NULL);
887 och_p = &lli->lli_mds_write_och;
888 och_usecount = &lli->lli_open_fd_write_count;
890 LASSERT(lli->lli_mds_read_och != NULL);
891 och_p = &lli->lli_mds_read_och;
892 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot take sole ownership while other opens share the handle. */
895 if (*och_usecount > 1)
896 GOTO(out_unlock, rc = -EBUSY);
/* Report the old handle so the MDT can match lease owner on reopen. */
903 *old_handle = fd->fd_och->och_fh;
907 mutex_unlock(&lli->lli_och_mutex);
912 * Release ownership on lli_mds_*_och when putting back a file lease.
914 static int ll_lease_och_release(struct inode *inode, struct file *file)
916 struct ll_inode_info *lli = ll_i2info(inode);
917 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
918 struct obd_client_handle **och_p;
919 struct obd_client_handle *old_och = NULL;
924 mutex_lock(&lli->lli_och_mutex);
/* Select the handle slot for this fd's access mode. */
925 if (file->f_mode & FMODE_WRITE) {
926 och_p = &lli->lli_mds_write_och;
927 och_usecount = &lli->lli_open_fd_write_count;
929 och_p = &lli->lli_mds_read_och;
930 och_usecount = &lli->lli_open_fd_read_count;
933 /* The file may have been open by another process (broken lease) so
934 * *och_p is not NULL. In this case we should simply increase usecount
937 if (*och_p != NULL) {
/* Slot already repopulated: our private handle must be closed. */
938 old_och = fd->fd_och;
945 mutex_unlock(&lli->lli_och_mutex);
/* Close outside the mutex to avoid lock-order issues with the AST. */
948 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
954 * Acquire a lease and open the file.
956 static struct obd_client_handle *
957 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
960 struct lookup_intent it = { .it_op = IT_OPEN };
961 struct ll_sb_info *sbi = ll_i2sbi(inode);
962 struct md_op_data *op_data;
963 struct ptlrpc_request *req = NULL;
964 struct lustre_handle old_handle = { 0 };
965 struct obd_client_handle *och = NULL;
/* Leases are exactly read or exactly write, never combined. */
970 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
971 RETURN(ERR_PTR(-EINVAL));
/* The caller's fd must already grant the requested mode; exec fds are
 * excluded. */
974 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
975 RETURN(ERR_PTR(-EPERM));
/* Take ownership of the cached MDS open handle (see helper above). */
977 rc = ll_lease_och_acquire(inode, file, &old_handle);
984 RETURN(ERR_PTR(-ENOMEM));
986 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
987 LUSTRE_OPC_ANY, NULL);
989 GOTO(out, rc = PTR_ERR(op_data));
991 /* To tell the MDT this openhandle is from the same owner */
992 op_data->op_handle = old_handle;
994 it.it_flags = fmode | open_flags;
995 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
996 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
997 &ll_md_blocking_lease_ast,
998 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
999 * it can be cancelled which may mislead applications that the lease is
1001 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1002 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1003 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1004 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1005 ll_finish_md_op_data(op_data);
1006 ptlrpc_req_finished(req);
1008 GOTO(out_release_it, rc);
1010 if (it_disposition(&it, DISP_LOOKUP_NEG))
1011 GOTO(out_release_it, rc = -ENOENT);
1013 rc = it_open_error(DISP_OPEN_OPEN, &it);
1015 GOTO(out_release_it, rc);
1017 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1018 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server that predates lease support will not set DISP_OPEN_LEASE. */
1020 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1021 GOTO(out_close, rc = -EOPNOTSUPP);
1023 /* already get lease, handle lease lock */
1024 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1025 if (it.it_lock_mode == 0 ||
1026 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1027 /* open lock must return for lease */
1028 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1029 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1031 GOTO(out_close, rc = -EPROTO);
1034 ll_intent_release(&it);
1038 /* Cancel open lock */
1039 if (it.it_lock_mode != 0) {
1040 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1042 it.it_lock_mode = 0;
1043 och->och_lease_handle.cookie = 0ULL;
1045 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1047 CERROR("%s: error closing file "DFID": %d\n",
1048 ll_get_fsname(inode->i_sb, NULL, 0),
1049 PFID(&ll_i2info(inode)->lli_fid), rc2);
1050 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1052 ll_intent_release(&it);
1056 RETURN(ERR_PTR(rc));
1060 * Check whether a layout swap can be done between two inodes.
1062 * \param[in] inode1	First inode to check
1063 * \param[in] inode2	Second inode to check
1065 * \retval 0 on success, layout swap can be performed between both inodes
1066 * \retval negative error code if requirements are not met
1068 static int ll_check_swap_layouts_validity(struct inode *inode1,
1069 struct inode *inode2)
/* Both must be regular files. */
1071 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both ends of the swap. */
1074 if (inode_permission(inode1, MAY_WRITE) ||
1075 inode_permission(inode2, MAY_WRITE))
/* Must live on the same Lustre filesystem instance. */
1078 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a LAYOUT_SWAP bias: atomically swap layouts between
 * @inode and @inode2 as part of the close RPC.  Validates both inodes
 * and rejects a self-swap (identical FIDs). */
1084 static int ll_swap_layouts_close(struct obd_client_handle *och,
1085 struct inode *inode, struct inode *inode2)
1087 const struct lu_fid *fid1 = ll_inode2fid(inode);
1088 const struct lu_fid *fid2;
1092 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1093 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1095 rc = ll_check_swap_layouts_validity(inode, inode2);
1097 GOTO(out_free_och, rc);
1099 /* We now know that inode2 is a lustre inode */
1100 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is meaningless — reject it. */
1102 rc = lu_fid_cmp(fid1, fid2);
1104 GOTO(out_free_och, rc = -EINVAL);
1106 /* Close the file and {swap,merge} layouts between inode & inode2.
1107 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1108 * because we still need it to pack l_remote_handle to MDT. */
1109 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1112 och = NULL; /* freed in ll_close_inode_openhandle() */
1122 * Release lease and close the file.
1123 * It will check if the lease has ever broken.
1125 static int ll_lease_close_intent(struct obd_client_handle *och,
1126 struct inode *inode,
1127 bool *lease_broken, enum mds_op_bias bias,
1130 struct ldlm_lock *lock;
1131 bool cancelled = true;
/* Inspect the lease lock: if it was already cancelled, the lease was
 * broken by a conflicting access. */
1135 lock = ldlm_handle2lock(&och->och_lease_handle);
1137 lock_res_and_lock(lock);
1138 cancelled = ldlm_is_cancel(lock);
1139 unlock_res_and_lock(lock);
1140 LDLM_LOCK_PUT(lock);
1143 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1144 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1146 if (lease_broken != NULL)
1147 *lease_broken = cancelled;
/* Intact lease with no intent: cancel it ourselves (synchronously). */
1149 if (!cancelled && !bias)
1150 ldlm_cli_cancel(&och->och_lease_handle, 0);
1152 if (cancelled) { /* no need to excute intent */
/* Lease still valid: run the intent (bias + data) with the close. */
1157 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Convenience wrapper: release a lease with no close intent (bias 0). */
1161 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1164 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1168 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1170 static int ll_lease_file_resync(struct obd_client_handle *och,
1171 struct inode *inode)
1173 struct ll_sb_info *sbi = ll_i2sbi(inode);
1174 struct md_op_data *op_data;
/* Only the flush side effect of ll_data_version() is wanted here. */
1175 __u64 data_version_unused;
1179 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1180 LUSTRE_OPC_ANY, NULL);
1181 if (IS_ERR(op_data))
1182 RETURN(PTR_ERR(op_data));
1184 /* before starting file resync, it's necessary to clean up page cache
1185 * in client memory, otherwise once the layout version is increased,
1186 * writing back cached data will be denied the OSTs. */
1187 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* Identify the resync request by its lease handle. */
1191 op_data->op_handle = och->och_lease_handle;
1192 rc = md_file_resync(sbi->ll_md_exp, op_data);
1198 ll_finish_md_op_data(op_data);
/* Merge MDS-provided attributes cached in ll_inode_info with the
 * OST-side attributes (size, blocks, timestamps) from the cl_object,
 * keeping the newest timestamps.  Runs under the inode size lock. */
1202 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1204 struct ll_inode_info *lli = ll_i2info(inode);
1205 struct cl_object *obj = lli->lli_clob;
1206 struct cl_attr *attr = vvp_env_thread_attr(env);
1214 ll_inode_size_lock(inode);
1216 /* Merge timestamps the most recently obtained from MDS with
1217 * timestamps obtained from OSTs.
1219 * Do not overwrite atime of inode because it may be refreshed
1220 * by file_accessed() function. If the read was served by cache
1221 * data, there is no RPC to be sent so that atime may not be
1222 * transferred to OSTs at all. MDT only updates atime at close time
1223 * if it's at least 'mdd.*.atime_diff' older.
1224 * All in all, the atime in Lustre does not strictly comply with
1225 * POSIX. Solving this problem needs to send an RPC to MDT for each
1226 * read, this will hurt performance. */
1227 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1228 LTIME_S(inode->i_atime) = lli->lli_atime;
1229 lli->lli_update_atime = 0;
/* mtime/ctime always start from the MDS-cached values. */
1231 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1232 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1234 atime = LTIME_S(inode->i_atime);
1235 mtime = LTIME_S(inode->i_mtime);
1236 ctime = LTIME_S(inode->i_ctime);
1238 cl_object_attr_lock(obj);
/* Fault-injection point for merge testing. */
1239 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1242 rc = cl_object_attr_get(env, obj, attr);
1243 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects yet) is not an error for the merge. */
1246 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Take the maximum of MDS and OST timestamps. */
1248 if (atime < attr->cat_atime)
1249 atime = attr->cat_atime;
1251 if (ctime < attr->cat_ctime)
1252 ctime = attr->cat_ctime;
1254 if (mtime < attr->cat_mtime)
1255 mtime = attr->cat_mtime;
1257 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1258 PFID(&lli->lli_fid), attr->cat_size);
/* Size and blocks come from the OSTs, the authority on file data. */
1260 i_size_write(inode, attr->cat_size);
1261 inode->i_blocks = attr->cat_blocks;
1263 LTIME_S(inode->i_atime) = atime;
1264 LTIME_S(inode->i_mtime) = mtime;
1265 LTIME_S(inode->i_ctime) = ctime;
1268 ll_inode_size_unlock(inode);
1274 * Set designated mirror for I/O.
1276 * So far only read, write, and truncate can support to issue I/O to
1277 * designated mirror.
1279 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1281 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1283 /* clear layout version for generic(non-resync) I/O in case it carries
1284 * stale layout version due to I/O restart */
1285 io->ci_layout_version = 0;
1287 /* FLR: disable non-delay for designated mirror I/O because obviously
1288 * only one mirror is available */
1289 if (fd->fd_designated_mirror > 0) {
1291 io->ci_designated_mirror = fd->fd_designated_mirror;
1292 io->ci_layout_version = fd->fd_layout_version;
1293 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
/* fixed: debug message previously said "desiginated" */
1297 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1298 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether this open file should skip atime updates, mirroring
 * the checks done by the kernel's file_accessed()/touch_atime():
 * O_NOATIME on the file, S_NOATIME on the inode, noatime mount flags,
 * read-only mounts, and nodiratime for directories.
 * NOTE(review): the "return true/false" lines are not visible in this
 * extracted view; each condition is assumed to return early.
 */
1301 static bool file_is_noatime(const struct file *file)
1303 const struct vfsmount *mnt = file->f_path.mnt;
1304 const struct inode *inode = file_inode((struct file *)file);
1306 /* Adapted from file_accessed() and touch_atime().*/
1307 if (file->f_flags & O_NOATIME)
1310 if (inode->i_flags & S_NOATIME)
1313 if (IS_NOATIME(inode))
1316 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1319 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1322 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1328 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: set up the embedded
 * sync kiocb, lock policy (never for nolock files, mandatory for
 * O_APPEND), noatime, parallel-I/O (PIO) and FLR mirror settings.
 */
1330 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1332 struct inode *inode = file_inode(file);
1333 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1335 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1336 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1337 io->u.ci_rw.rw_file = file;
1338 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1339 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1340 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1342 if (iot == CIT_WRITE) {
1343 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1344 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1345 file->f_flags & O_DIRECT ||
1348 io->ci_obj = ll_i2info(inode)->lli_clob;
1349 io->ci_lockreq = CILR_MAYBE;
1350 if (ll_file_nolock(file)) {
1351 io->ci_lockreq = CILR_NEVER;
1352 io->ci_no_srvlock = 1;
1353 } else if (file->f_flags & O_APPEND) {
1354 io->ci_lockreq = CILR_MANDATORY;
1356 io->ci_noatime = file_is_noatime(file);
/* PIO cannot be used for append writes (size is not known up front). */
1357 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1358 io->ci_pio = !io->u.ci_rw.rw_append;
1362 /* FLR: only use non-delay I/O for read as there is only one
1363 * available mirror for write. */
1364 io->ci_ndelay = !(iot == CIT_WRITE);
1366 ll_io_set_mirror(io, file);
/*
 * Parallel-task callback for PIO: run one sub-range of a larger read
 * or write inside its own cl_io.  State is carried in the cl_io_pt
 * attached to the ptask; progress (cip_result, iterator position,
 * kiocb position) is advanced here so the parent can resume/restart.
 */
1369 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1371 struct cl_io_pt *pt = ptask->pt_cbdata;
1372 struct file *file = pt->cip_file;
1375 loff_t pos = pt->cip_pos;
1380 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1381 file_dentry(file)->d_name.name,
1382 pt->cip_iot == CIT_READ ? "read" : "write",
1383 pos, pos + pt->cip_count);
1385 env = cl_env_get(&refcheck);
1387 RETURN(PTR_ERR(env));
1389 io = vvp_env_thread_io(env);
1390 ll_io_init(io, file, pt->cip_iot);
1391 io->u.ci_rw.rw_iter = pt->cip_iter;
1392 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1393 io->ci_pio = 0; /* It's already in parallel task */
1395 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1396 pt->cip_count - pt->cip_result);
1398 struct vvp_io *vio = vvp_env_io(env);
1400 vio->vui_io_subtype = IO_NORMAL;
1401 vio->vui_fd = LUSTRE_FPRIVATE(file);
1403 ll_cl_add(file, env, io, LCC_RW);
1404 rc = cl_io_loop(env, io);
1405 ll_cl_remove(file, env);
1407 /* cl_io_rw_init() handled IO */
1411 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Record partial progress so a restart continues where we stopped. */
1417 if (io->ci_nob > 0) {
1418 pt->cip_result += io->ci_nob;
1419 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1421 pt->cip_iocb.ki_pos = pos;
1422 #ifdef HAVE_KIOCB_KI_LEFT
1423 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1424 #elif defined(HAVE_KI_NBYTES)
1425 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1429 cl_io_fini(env, io);
1430 cl_env_put(env, &refcheck);
1432 pt->cip_need_restart = io->ci_need_restart;
1434 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1435 file_dentry(file)->d_name.name,
1436 pt->cip_iot == CIT_READ ? "read" : "write",
1437 pt->cip_result, rc);
1439 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for all llite reads and writes (normal iter I/O and
 * splice).  Builds and runs a cl_io loop, taking the per-inode range
 * lock for writes and O_DIRECT reads (LU-6227), handling I/O restart
 * (layout change / FLR mirror retry), and updating *ppos and stats.
 * Returns bytes transferred, or a negative errno if nothing moved.
 * NOTE(review): this view is missing interior lines (declarations,
 * some branch bodies); comments below only describe visible code.
 */
1443 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1444 struct file *file, enum cl_io_type iot,
1445 loff_t *ppos, size_t count)
1447 struct range_lock range;
1448 struct vvp_io *vio = vvp_env_io(env);
1449 struct inode *inode = file_inode(file);
1450 struct ll_inode_info *lli = ll_i2info(inode);
1451 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1456 unsigned retried = 0;
1457 bool restarted = false;
1461 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1462 file_dentry(file)->d_name.name,
1463 iot == CIT_READ ? "read" : "write", pos, pos + count);
1466 io = vvp_env_thread_io(env);
1467 ll_io_init(io, file, iot);
1468 if (args->via_io_subtype == IO_NORMAL) {
1469 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1470 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1472 if (args->via_io_subtype != IO_NORMAL || restarted)
1474 io->ci_ndelay_tried = retried;
1476 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1477 bool range_locked = false;
/* O_APPEND writes must lock to EOF; otherwise lock just [pos, end). */
1479 if (file->f_flags & O_APPEND)
1480 range_lock_init(&range, 0, LUSTRE_EOF);
1482 range_lock_init(&range, pos, pos + count - 1);
1484 vio->vui_fd = LUSTRE_FPRIVATE(file);
1485 vio->vui_io_subtype = args->via_io_subtype;
1487 switch (vio->vui_io_subtype) {
1489 /* Direct IO reads must also take range lock,
1490 * or multiple reads will try to work on the same pages
1491 * See LU-6227 for details. */
1492 if (((iot == CIT_WRITE) ||
1493 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1494 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1495 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1497 rc = range_lock(&lli->lli_write_tree, &range);
1501 range_locked = true;
1505 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1506 vio->u.splice.vui_flags = args->u.splice.via_flags;
1509 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1513 ll_cl_add(file, env, io, LCC_RW);
/* PIO writes take the inode lock here rather than per sub-task. */
1514 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1515 !lli->lli_inode_locked) {
1517 lli->lli_inode_locked = 1;
1519 rc = cl_io_loop(env, io);
1520 if (lli->lli_inode_locked) {
1521 lli->lli_inode_locked = 0;
1522 inode_unlock(inode);
1524 ll_cl_remove(file, env);
1527 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1529 range_unlock(&lli->lli_write_tree, &range);
1532 /* cl_io_rw_init() handled IO */
1536 if (io->ci_nob > 0) {
1537 result += io->ci_nob;
1538 count -= io->ci_nob;
1540 if (args->via_io_subtype == IO_NORMAL) {
1541 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1543 /* CLIO is too complicated. See LU-11069. */
1544 if (cl_io_is_append(io))
1545 pos = io->u.ci_rw.rw_iocb.ki_pos;
1549 args->u.normal.via_iocb->ki_pos = pos;
1550 #ifdef HAVE_KIOCB_KI_LEFT
1551 args->u.normal.via_iocb->ki_left = count;
1552 #elif defined(HAVE_KI_NBYTES)
1553 args->u.normal.via_iocb->ki_nbytes = count;
1557 pos = io->u.ci_rw.rw_range.cir_pos;
1561 cl_io_fini(env, io);
1564 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1565 file->f_path.dentry->d_name.name,
1566 iot, rc, result, io->ci_need_restart);
/* Restart the whole io if the layout changed or an FLR retry is due. */
1568 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1570 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1571 file_dentry(file)->d_name.name,
1572 iot == CIT_READ ? "read" : "write",
1573 pos, pos + count, result, rc);
1574 /* preserve the tried count for FLR */
1575 retried = io->ci_ndelay_tried;
1580 if (iot == CIT_READ) {
1582 ll_stats_ops_tally(ll_i2sbi(inode),
1583 LPROC_LL_READ_BYTES, result);
1584 } else if (iot == CIT_WRITE) {
1586 ll_stats_ops_tally(ll_i2sbi(inode),
1587 LPROC_LL_WRITE_BYTES, result);
1588 fd->fd_write_failed = false;
1589 } else if (result == 0 && rc == 0) {
1592 fd->fd_write_failed = true;
1594 fd->fd_write_failed = false;
1595 } else if (rc != -ERESTARTSYS) {
1596 fd->fd_write_failed = true;
1600 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1601 file_dentry(file)->d_name.name,
1602 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1606 RETURN(result > 0 ? result : rc);
1610 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1611 * especially for small I/O.
1613 * To serve a read request, CLIO has to create and initialize a cl_io and
1614 * then request DLM lock. This has turned out to have significant overhead
1615 * and affects the performance of small I/O dramatically.
1617 * It's not necessary to create a cl_io for each I/O. Under the help of read
1618 * ahead, most of the pages being read are already in memory cache and we can
1619 * read those pages directly because if the pages exist, the corresponding DLM
1620 * lock must exist so that page content must be valid.
1622 * In fast read implementation, the llite speculatively finds and reads pages
1623 * in memory cache. There are three scenarios for fast read:
1624 * - If the page exists and is uptodate, kernel VM will provide the data and
1625 * CLIO won't be intervened;
1626 * - If the page was brought into memory by read ahead, it will be exported
1627 * and read ahead parameters will be updated;
1628 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1629 * it will go back and invoke normal read, i.e., a cl_io will be created
1630 * and DLM lock will be requested.
1632 * POSIX compliance: posix standard states that read is intended to be atomic.
1633 * Lustre read implementation is in line with Linux kernel read implementation
1634 * and neither of them complies with POSIX standard in this matter. Fast read
1635 * doesn't make the situation worse on single node but it may interleave write
1636 * results from multiple nodes due to short read handling in ll_file_aio_read().
1638 * \param env - lu_env
1639 * \param iocb - kiocb from kernel
1640 * \param iter - user space buffers where the data will be copied
1642 * \retval - number of bytes have been read, or error code if error occurred.
1645 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1649 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1652 /* NB: we can't do direct IO for fast read because it will need a lock
1653 * to make IO engine happy. */
1654 if (iocb->ki_filp->f_flags & O_DIRECT)
1657 result = generic_file_read_iter(iocb, iter);
1659 /* If the first page is not in cache, generic_file_aio_read() will be
1660 * returned with -ENODATA.
1661 * See corresponding code in ll_readpage(). */
1662 if (result == -ENODATA)
1666 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1667 LPROC_LL_READ_BYTES, result);
1673 * Read from a file (through the page cache).
1675 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1678 struct vvp_io_args *args;
/* Try the page-cache fast path first; fall through to a full cl_io
 * only for the bytes the fast read could not serve. */
1683 result = ll_do_fast_read(iocb, to);
1684 if (result < 0 || iov_iter_count(to) == 0)
1687 env = cl_env_get(&refcheck);
1689 return PTR_ERR(env);
1691 args = ll_env_args(env, IO_NORMAL);
1692 args->u.normal.via_iter = to;
1693 args->u.normal.via_iocb = iocb;
1695 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1696 &iocb->ki_pos, iov_iter_count(to));
1699 else if (result == 0)
1702 cl_env_put(env, &refcheck);
1708 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1709 * If a page is already in the page cache and dirty (and some other things -
1710 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1711 * write to it without doing a full I/O, because Lustre already knows about it
1712 * and will write it out. This saves a lot of processing time.
1714 * All writes here are within one page, so exclusion is handled by the page
1715 * lock on the vm page. We do not do tiny writes for writes which touch
1716 * multiple pages because it's very unlikely multiple sequential pages
1717 * are already dirty.
1719 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1720 * and are unlikely to be to already dirty pages.
1722 * Attribute updates are important here, we do them in ll_tiny_write_end.
1724 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1726 ssize_t count = iov_iter_count(iter);
1727 struct file *file = iocb->ki_filp;
1728 struct inode *inode = file_inode(file);
1733 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1734 * of function for why.
1736 if (count >= PAGE_SIZE ||
1737 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1740 result = __generic_file_write_iter(iocb, iter);
1742 /* If the page is not already dirty, ll_tiny_write_begin returns
1743 * -ENODATA. We continue on to normal write.
1745 if (result == -ENODATA)
1749 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1751 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1754 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1760 * Write to a file (through the page cache).
1762 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1764 struct vvp_io_args *args;
1766 ssize_t rc_tiny = 0, rc_normal;
1771 /* NB: we can't do direct IO for tiny writes because they use the page
1772 * cache, we can't do sync writes because tiny writes can't flush
1773 * pages, and we can't do append writes because we can't guarantee the
1774 * required DLM locks are held to protect file size.
1776 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1777 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1778 rc_tiny = ll_do_tiny_write(iocb, from);
1780 /* In case of error, go on and try normal write - Only stop if tiny
1781 * write completed I/O.
1783 if (iov_iter_count(from) == 0)
1784 GOTO(out, rc_normal = rc_tiny);
1786 env = cl_env_get(&refcheck);
1788 return PTR_ERR(env);
1790 args = ll_env_args(env, IO_NORMAL);
1791 args->u.normal.via_iter = from;
1792 args->u.normal.via_iocb = iocb;
1794 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1795 &iocb->ki_pos, iov_iter_count(from));
1797 /* On success, combine bytes written. */
1798 if (rc_tiny >= 0 && rc_normal > 0)
1799 rc_normal += rc_tiny;
1800 /* On error, only return error from normal write if tiny write did not
1801 * write any bytes. Otherwise return bytes written by tiny write.
1803 else if (rc_tiny > 0)
1804 rc_normal = rc_tiny;
1806 cl_env_put(env, &refcheck);
1811 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1813 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: accumulate total byte count into *count,
 * rejecting negative lengths / wrapping totals, and truncating
 * *nr_segs at the first inaccessible segment.
 * NOTE(review): the "continue"/break lines of the loop are not visible
 * in this extracted view.
 */
1815 static int ll_file_get_iov_count(const struct iovec *iov,
1816 unsigned long *nr_segs, size_t *count)
1821 for (seg = 0; seg < *nr_segs; seg++) {
1822 const struct iovec *iv = &iov[seg];
1825 * If any segment has a negative length, or the cumulative
1826 * length ever wraps negative then return -EINVAL.
1829 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1831 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1836 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio read entry point (kernels without ->read_iter): wrap the
 * iovec array in an iov_iter and forward to ll_file_read_iter().
 */
1843 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1844 unsigned long nr_segs, loff_t pos)
1851 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1855 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1856 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1857 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1858 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1859 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1861 result = ll_file_read_iter(iocb, &to);
/*
 * Synchronous read entry point for kernels without ->read_iter: build
 * a single-segment iovec and a sync kiocb, forward to
 * ll_file_aio_read(), and propagate the updated position to *ppos.
 */
1866 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1869 struct iovec iov = { .iov_base = buf, .iov_len = count };
1874 init_sync_kiocb(&kiocb, file);
1875 kiocb.ki_pos = *ppos;
1876 #ifdef HAVE_KIOCB_KI_LEFT
1877 kiocb.ki_left = count;
1878 #elif defined(HAVE_KI_NBYTES)
1879 kiocb.ki_nbytes = count; /* fixed: was "kiocb.i_nbytes", no such member */
1882 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1883 *ppos = kiocb.ki_pos;
1889 * Write to a file (through the page cache).
/*
 * Legacy aio write entry point (kernels without ->write_iter): wrap
 * the iovec array in an iov_iter and forward to ll_file_write_iter().
 */
1892 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1893 unsigned long nr_segs, loff_t pos)
1895 struct iov_iter from;
1900 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1904 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1905 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1906 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1907 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1908 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1910 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write entry point for kernels without ->write_iter:
 * build a single-segment iovec and a sync kiocb, forward to
 * ll_file_aio_write(), and propagate the updated position to *ppos.
 */
1915 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1916 size_t count, loff_t *ppos)
1918 struct iovec iov = { .iov_base = (void __user *)buf,
1925 init_sync_kiocb(&kiocb, file);
1926 kiocb.ki_pos = *ppos;
1927 #ifdef HAVE_KIOCB_KI_LEFT
1928 kiocb.ki_left = count;
1929 #elif defined(HAVE_KI_NBYTES)
1930 kiocb.ki_nbytes = count;
1933 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1934 *ppos = kiocb.ki_pos;
1938 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1941 * Send file content (through pagecache) somewhere with helper
1943 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1944 struct pipe_inode_info *pipe, size_t count,
1948 struct vvp_io_args *args;
1953 env = cl_env_get(&refcheck);
1955 RETURN(PTR_ERR(env));
/* Run a CIT_READ cl_io with the IO_SPLICE subtype feeding the pipe. */
1957 args = ll_env_args(env, IO_SPLICE);
1958 args->u.splice.via_pipe = pipe;
1959 args->u.splice.via_flags = flags;
1961 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1962 cl_env_put(env, &refcheck);
/*
 * Set striping information (the LOV EA) on @inode by performing an
 * intent open with the user-supplied lov_user_md, then immediately
 * releasing the open handle.  Runs under the inode size lock.
 */
1966 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1967 __u64 flags, struct lov_user_md *lum, int lum_size)
1969 struct lookup_intent oit = {
1971 .it_flags = flags | MDS_OPEN_BY_FID,
1976 ll_inode_size_lock(inode);
1977 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1979 GOTO(out_unlock, rc);
1981 ll_release_openhandle(dentry, &oit);
1984 ll_inode_size_unlock(inode);
1985 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping) for @filename relative to @inode via an
 * MDS getattr-by-name.  On success *lmmp points into the still-held
 * reply buffer (*request must be released by the caller) and the EA is
 * byte-swapped to host endianness on big-endian machines.
 */
1990 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1991 struct lov_mds_md **lmmp, int *lmm_size,
1992 struct ptlrpc_request **request)
1994 struct ll_sb_info *sbi = ll_i2sbi(inode);
1995 struct mdt_body *body;
1996 struct lov_mds_md *lmm = NULL;
1997 struct ptlrpc_request *req = NULL;
1998 struct md_op_data *op_data;
2001 rc = ll_get_default_mdsize(sbi, &lmmsize);
2005 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2006 strlen(filename), lmmsize,
2007 LUSTRE_OPC_ANY, NULL);
2008 if (IS_ERR(op_data))
2009 RETURN(PTR_ERR(op_data));
2011 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2012 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2013 ll_finish_md_op_data(op_data);
2015 CDEBUG(D_INFO, "md_getattr_name failed "
2016 "on %s: rc %d\n", filename, rc);
2020 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2021 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2023 lmmsize = body->mbo_eadatasize;
2025 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2027 GOTO(out, rc = -ENODATA);
2030 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2031 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite layouts are understood here. */
2033 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2034 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2035 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2036 GOTO(out, rc = -EPROTO);
2039 * This is coming from the MDS, so is probably in
2040 * little endian. We convert it to host endian before
2041 * passing it to userspace.
2043 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2046 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2047 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2048 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2049 if (le32_to_cpu(lmm->lmm_pattern) &
2050 LOV_PATTERN_F_RELEASED)
2054 /* if function called for directory - we should
2055 * avoid swab not existent lsm objects */
2056 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2057 lustre_swab_lov_user_md_v1(
2058 (struct lov_user_md_v1 *)lmm);
2059 if (S_ISREG(body->mbo_mode))
2060 lustre_swab_lov_user_md_objects(
2061 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2063 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2064 lustre_swab_lov_user_md_v3(
2065 (struct lov_user_md_v3 *)lmm);
2066 if (S_ISREG(body->mbo_mode))
2067 lustre_swab_lov_user_md_objects(
2068 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2070 } else if (lmm->lmm_magic ==
2071 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2072 lustre_swab_lov_comp_md_v1(
2073 (struct lov_comp_md_v1 *)lmm);
2079 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN.
 */
2084 static int ll_lov_setea(struct inode *inode, struct file *file,
2087 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2088 struct lov_user_md *lump;
2089 int lum_size = sizeof(struct lov_user_md) +
2090 sizeof(struct lov_user_ost_data);
2094 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2097 OBD_ALLOC_LARGE(lump, lum_size);
2101 if (copy_from_user(lump, arg, lum_size))
2102 GOTO(out_lump, rc = -EFAULT);
2104 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2106 cl_lov_delay_create_clear(&file->f_flags);
2109 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping (LOV EA) out to the userspace buffer @lum
 * of @size bytes via the CLIO object layer.
 */
2113 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2120 env = cl_env_get(&refcheck);
2122 RETURN(PTR_ERR(env));
2124 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2125 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and echo the resulting striping back
 * to userspace.
 */
2129 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2132 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2133 struct lov_user_md *klum;
2135 __u64 flags = FMODE_WRITE;
2138 rc = ll_copy_user_md(lum, &klum);
2143 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2148 rc = put_user(0, &lum->lmm_stripe_count);
2152 rc = ll_layout_refresh(inode, &gen);
2156 rc = ll_file_getstripe(inode, arg, lum_size);
2158 cl_lov_delay_create_clear(&file->f_flags);
2161 OBD_FREE(klum, lum_size);
/*
 * Acquire a group lock (gid = @arg) on @inode for this open file.
 * For composite (PFL) layouts all OST objects are instantiated first
 * so the group lock covers every component.  The winner of a racing
 * acquisition is detected under lli_lock and the loser's lock is
 * dropped.
 */
2166 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2168 struct ll_inode_info *lli = ll_i2info(inode);
2169 struct cl_object *obj = lli->lli_clob;
2170 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2171 struct ll_grouplock grouplock;
2176 CWARN("group id for group lock must not be 0\n");
2180 if (ll_file_nolock(file))
2181 RETURN(-EOPNOTSUPP);
2183 spin_lock(&lli->lli_lock);
2184 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2185 CWARN("group lock already existed with gid %lu\n",
2186 fd->fd_grouplock.lg_gid);
2187 spin_unlock(&lli->lli_lock);
2190 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2191 spin_unlock(&lli->lli_lock);
2194 * XXX: group lock needs to protect all OST objects while PFL
2195 * can add new OST objects during the IO, so we'd instantiate
2196 * all OST objects before getting its group lock.
2201 struct cl_layout cl = {
2202 .cl_is_composite = false,
2204 struct lu_extent ext = {
2206 .e_end = OBD_OBJECT_EOF,
2209 env = cl_env_get(&refcheck);
2211 RETURN(PTR_ERR(env));
2213 rc = cl_object_layout_get(env, obj, &cl);
2214 if (!rc && cl.cl_is_composite)
2215 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2218 cl_env_put(env, &refcheck);
2223 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2224 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under lli_lock: another thread may have won the race. */
2228 spin_lock(&lli->lli_lock);
2229 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2230 spin_unlock(&lli->lli_lock);
2231 CERROR("another thread just won the race\n");
2232 cl_put_grouplock(&grouplock);
2236 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2237 fd->fd_grouplock = grouplock;
2238 spin_unlock(&lli->lli_lock);
2240 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with gid = @arg held by this open file.
 * Fails if no group lock is held or if the gid does not match; the
 * fd state is cleared under lli_lock before the lock is dropped.
 */
2244 static int ll_put_grouplock(struct inode *inode, struct file *file,
2247 struct ll_inode_info *lli = ll_i2info(inode);
2248 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2249 struct ll_grouplock grouplock;
2252 spin_lock(&lli->lli_lock);
2253 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2254 spin_unlock(&lli->lli_lock);
2255 CWARN("no group lock held\n");
2259 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2261 if (fd->fd_grouplock.lg_gid != arg) {
2262 CWARN("group lock %lu doesn't match current id %lu\n",
2263 arg, fd->fd_grouplock.lg_gid);
2264 spin_unlock(&lli->lli_lock);
/* Take a local copy so cl_put_grouplock() runs outside lli_lock. */
2268 grouplock = fd->fd_grouplock;
2269 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2270 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2271 spin_unlock(&lli->lli_lock);
2273 cl_put_grouplock(&grouplock);
2274 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2279 * Close inode open handle
2281 * \param dentry [in] dentry which contains the inode
2282 * \param it [in,out] intent which contains open info and result
2285 * \retval <0 failure
2287 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2289 struct inode *inode = dentry->d_inode;
2290 struct obd_client_handle *och;
2296 /* Root ? Do nothing. */
2297 if (dentry->d_inode->i_sb->s_root == dentry)
2300 /* No open handle to close? Move away */
2301 if (!it_disposition(it, DISP_OPEN_OPEN))
2304 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2306 OBD_ALLOC(och, sizeof(*och));
2308 GOTO(out, rc = -ENOMEM);
/* Transfer the open handle from the intent into och, then close it. */
2310 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2312 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2314 /* this one is in place of ll_file_open */
2315 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2316 ptlrpc_req_finished(it->it_request);
2317 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2323 * Get size for inode for which FIEMAP mapping is requested.
2324 * Make the FIEMAP get_info call and returns the result.
2325 * \param fiemap kernel buffer to hold extents
2326 * \param num_bytes kernel buffer size
2328 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2334 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2337 /* Checks for fiemap flags */
2338 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2339 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2343 /* Check for FIEMAP_FLAG_SYNC */
2344 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2345 rc = filemap_fdatawrite(inode->i_mapping);
2350 env = cl_env_get(&refcheck);
2352 RETURN(PTR_ERR(env));
/* Ensure i_size is up to date before consulting object mappings. */
2354 if (i_size_read(inode) == 0) {
2355 rc = ll_glimpse_size(inode);
2360 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2361 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2362 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2364 /* If filesize is 0, then there would be no objects for mapping */
2365 if (fmkey.lfik_oa.o_size == 0) {
2366 fiemap->fm_mapped_extents = 0;
2370 fmkey.lfik_fiemap = *fiemap;
2372 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2373 &fmkey, fiemap, &num_bytes);
2375 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH ioctl: resolve a FID to a path via the MDC.
 * Copies the getinfo_fid2path request from userspace, appends the
 * mount root FID (for fileset-aware servers), issues the iocontrol
 * and copies the result back.
 */
2379 int ll_fid2path(struct inode *inode, void __user *arg)
2381 struct obd_export *exp = ll_i2mdexp(inode);
2382 const struct getinfo_fid2path __user *gfin = arg;
2384 struct getinfo_fid2path *gfout;
2390 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2391 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2394 /* Only need to get the buflen */
2395 if (get_user(pathlen, &gfin->gf_pathlen))
2398 if (pathlen > PATH_MAX)
2401 outsize = sizeof(*gfout) + pathlen;
2402 OBD_ALLOC(gfout, outsize);
2406 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2407 GOTO(gf_free, rc = -EFAULT);
2408 /* append root FID after gfout to let MDT know the root FID so that it
2409 * can lookup the correct path, this is mainly for fileset.
2410 * old server without fileset mount support will ignore this. */
2411 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2413 /* Call mdc_iocontrol */
2414 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2418 if (copy_to_user(arg, gfout, outsize))
2422 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to obtain the file's data version and
 * layout version, storing them in @ioc.  An object-less file reports
 * version 0; a restart request from the io is propagated to the
 * caller (visible retry handling is outside this view).
 */
2427 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2429 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2437 ioc->idv_version = 0;
2438 ioc->idv_layout_version = UINT_MAX;
2440 /* If no file object initialized, we consider its version is 0. */
2444 env = cl_env_get(&refcheck);
2446 RETURN(PTR_ERR(env));
2448 io = vvp_env_thread_io(env);
2450 io->u.ci_data_version.dv_data_version = 0;
2451 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2452 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2455 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2456 result = cl_io_loop(env, io);
2458 result = io->ci_result;
2460 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2461 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2463 cl_io_fini(env, io);
2465 if (unlikely(io->ci_need_restart))
2468 cl_env_put(env, &refcheck);
2474 * Read the data_version for inode.
2476 * This value is computed using stripe object version on OST.
2477 * Version is computed using server side locking.
2479 * @param flags if do sync on the OST side;
2481 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2482 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2484 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2486 struct ioc_data_version ioc = { .idv_flags = flags };
2489 rc = ll_ioc_data_version(inode, &ioc);
2491 *data_version = ioc.idv_version;
2497 * Trigger a HSM release request for the provided inode.
2499 int ll_hsm_release(struct inode *inode)
2502 struct obd_client_handle *och = NULL;
2503 __u64 data_version = 0;
2508 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2509 ll_get_fsname(inode->i_sb, NULL, 0),
2510 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so the release cannot race with other opens. */
2512 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2514 GOTO(out, rc = PTR_ERR(och));
2516 /* Grab latest data_version and [am]time values */
2517 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2521 env = cl_env_get(&refcheck);
2523 GOTO(out, rc = PTR_ERR(env));
2525 rc = ll_merge_attr(env, inode);
2526 cl_env_put(env, &refcheck);
2528 /* If error happen, we have the wrong size for a file.
2534 /* Release the file.
2535 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2536 * we still need it to pack l_remote_handle to MDT. */
2537 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2543 if (och != NULL && !IS_ERR(och)) /* close the file */
2544 ll_lease_close(och, inode, NULL);
/* State for ll_swap_layouts(): the two inodes whose layouts are being
 * swapped.  NOTE(review): additional fields (gid, dv1/dv2,
 * check_dv1/check_dv2) are referenced by ll_swap_layouts() but not
 * visible in this extracted view. */
2549 struct ll_swap_stack {
2552 struct inode *inode1;
2553 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically swap the layouts of two files.
 * The inode pair is ordered by FID to avoid lock inversion, group
 * locks flush dirty cache when a gid is supplied, optional data
 * version checks guard against concurrent modification, then the swap
 * is performed on the MDT via obd_iocontrol.
 */
2558 static int ll_swap_layouts(struct file *file1, struct file *file2,
2559 struct lustre_swap_layouts *lsl)
2561 struct mdc_swap_layouts msl;
2562 struct md_op_data *op_data;
2565 struct ll_swap_stack *llss = NULL;
2568 OBD_ALLOC_PTR(llss);
2572 llss->inode1 = file_inode(file1);
2573 llss->inode2 = file_inode(file2);
2575 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2579 /* we use 2 bool because it is easier to swap than 2 bits */
2580 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2581 llss->check_dv1 = true;
2583 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2584 llss->check_dv2 = true;
2586 /* we cannot use lsl->sl_dvX directly because we may swap them */
2587 llss->dv1 = lsl->sl_dv1;
2588 llss->dv2 = lsl->sl_dv2;
2590 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2591 if (rc == 0) /* same file, done! */
2594 if (rc < 0) { /* sequentialize it */
2595 swap(llss->inode1, llss->inode2);
2597 swap(llss->dv1, llss->dv2);
2598 swap(llss->check_dv1, llss->check_dv2);
2602 if (gid != 0) { /* application asks to flush dirty cache */
2603 rc = ll_get_grouplock(llss->inode1, file1, gid);
2607 rc = ll_get_grouplock(llss->inode2, file2, gid);
2609 ll_put_grouplock(llss->inode1, file1, gid);
2614 /* ultimate check, before swapping the layouts we check if
2615 * dataversion has changed (if requested) */
2616 if (llss->check_dv1) {
2617 rc = ll_data_version(llss->inode1, &dv, 0);
2620 if (dv != llss->dv1)
2621 GOTO(putgl, rc = -EAGAIN);
2624 if (llss->check_dv2) {
2625 rc = ll_data_version(llss->inode2, &dv, 0);
2628 if (dv != llss->dv2)
2629 GOTO(putgl, rc = -EAGAIN);
2632 /* struct md_op_data is used to send the swap args to the mdt
2633 * only flags is missing, so we use struct mdc_swap_layouts
2634 * through the md_op_data->op_data */
2635 /* flags from user space have to be converted before they are sent to
2636 * server, no flag is sent today, they are only used on the client */
2639 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2640 0, LUSTRE_OPC_ANY, &msl);
2641 if (IS_ERR(op_data))
2642 GOTO(free, rc = PTR_ERR(op_data));
2644 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2645 sizeof(*op_data), op_data, NULL);
2646 ll_finish_md_op_data(op_data);
2653 ll_put_grouplock(llss->inode2, file2, gid);
2654 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set and/or clear HSM state flags on @inode (LL_IOC_HSM_STATE_SET
 * backend).
 *
 * Validation before sending to the MDT:
 *  - the combined set/clear mask must stay within HSM_FLAGS_MASK;
 *  - bits outside HSM_USER_MASK require CFS_CAP_SYS_ADMIN;
 *  - a supplied archive id must not exceed LL_HSM_MAX_ARCHIVE.
 * The request is then forwarded via obd_iocontrol().
 *
 * Returns 0 or a negative errno.  NOTE(review): the early-return error
 * paths for the validation checks are in elided lines of this
 * line-sampled fragment.
 */
2664 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2666 struct md_op_data *op_data;
2670 /* Detect out-of range masks */
2671 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2674 /* Non-root users are forbidden to set or clear flags which are
2675 * NOT defined in HSM_USER_MASK. */
2676 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2677 !cfs_capable(CFS_CAP_SYS_ADMIN))
2680 /* Detect out-of range archive id */
2681 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2682 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2685 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2686 LUSTRE_OPC_ANY, hss);
2687 if (IS_ERR(op_data))
2688 RETURN(PTR_ERR(op_data));
2690 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2691 sizeof(*op_data), op_data, NULL);
2693 ll_finish_md_op_data(op_data);
/*
 * Import an HSM-archived object as a released Lustre file
 * (LL_IOC_HSM_IMPORT backend).
 *
 * Two phases: first mark the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED
 * with the caller-supplied archive id via ll_hsm_state_set(), then
 * force (ATTR_FORCE) the mode, uid/gid, size and a/mtime recorded at
 * archive time back onto the inode through ll_setattr_raw().
 *
 * Only regular files can be imported.  Returns 0 or negative errno.
 */
2698 static int ll_hsm_import(struct inode *inode, struct file *file,
2699 struct hsm_user_import *hui)
2701 struct hsm_state_set *hss = NULL;
2702 struct iattr *attr = NULL;
2706 if (!S_ISREG(inode->i_mode))
2712 GOTO(out, rc = -ENOMEM);
2714 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2715 hss->hss_archive_id = hui->hui_archive_id;
2716 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2717 rc = ll_hsm_state_set(inode, hss);
2721 OBD_ALLOC_PTR(attr);
2723 GOTO(out, rc = -ENOMEM);
/* restore attributes captured when the object was archived; mode is
 * masked to permission bits and forced back to a regular file type */
2725 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2726 attr->ia_mode |= S_IFREG;
2727 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2728 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2729 attr->ia_size = hui->hui_size;
2730 attr->ia_mtime.tv_sec = hui->hui_mtime;
2731 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2732 attr->ia_atime.tv_sec = hui->hui_atime;
2733 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2735 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2736 ATTR_UID | ATTR_GID |
2737 ATTR_MTIME | ATTR_MTIME_SET |
2738 ATTR_ATIME | ATTR_ATIME_SET;
2742 rc = ll_setattr_raw(file_dentry(file), attr, true);
2746 inode_unlock(inode);
/* Translate a kernel fmode_t into the userspace-visible lease-type
 * bits: FMODE_READ -> LL_LEASE_RDLCK, FMODE_WRITE -> LL_LEASE_WRLCK. */
2758 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2760 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2761 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 backend: set atime, mtime AND ctime from the
 * ll_futimes_3 payload (ordinary utimes() cannot set ctime).
 *
 * Requires CAP_SYS_ADMIN and a regular file; the change is applied
 * under the inode lock via ll_setattr_raw().  Returns 0 or negative
 * errno.
 */
2764 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2766 struct inode *inode = file_inode(file);
2768 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2769 ATTR_MTIME | ATTR_MTIME_SET |
2770 ATTR_CTIME | ATTR_CTIME_SET,
2772 .tv_sec = lfu->lfu_atime_sec,
2773 .tv_nsec = lfu->lfu_atime_nsec,
2776 .tv_sec = lfu->lfu_mtime_sec,
2777 .tv_nsec = lfu->lfu_mtime_nsec,
2780 .tv_sec = lfu->lfu_ctime_sec,
2781 .tv_nsec = lfu->lfu_ctime_nsec,
2787 if (!capable(CAP_SYS_ADMIN))
2790 if (!S_ISREG(inode->i_mode))
2794 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2795 inode_unlock(inode);
/* Map a userspace lock-ahead mode (MODE_READ_USER / MODE_WRITE_USER)
 * to the corresponding cl_lock_mode.  NOTE(review): the return
 * statements and default branch are in elided lines of this
 * line-sampled fragment. */
2800 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2803 case MODE_READ_USER:
2805 case MODE_WRITE_USER:
/* printable names for the user lock modes, used in debug output */
2812 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2814 /* Used to allow the upper layers of the client to request an LDLM lock
2815 * without doing an actual read or write.
2817 * Used for ladvise lockahead to manually request specific locks.
2819 * \param[in] file file this ladvise lock request is on
2820 * \param[in] ladvise ladvise struct describing this lock request
2822 * \retval 0 success, no detailed result available (sync requests
2823 * and requests sent to the server [not handled locally]
2824 * cannot return detailed results)
2825 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2826 * see definitions for details.
2827 * \retval negative negative errno on error
2829 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2831 struct lu_env *env = NULL;
2832 struct cl_io *io = NULL;
2833 struct cl_lock *lock = NULL;
2834 struct cl_lock_descr *descr = NULL;
2835 struct dentry *dentry = file->f_path.dentry;
2836 struct inode *inode = dentry->d_inode;
2837 enum cl_lock_mode cl_mode;
2838 off_t start = ladvise->lla_start;
2839 off_t end = ladvise->lla_end;
2845 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2846 "start=%llu, end=%llu\n", dentry->d_name.len,
2847 dentry->d_name.name, dentry->d_inode,
2848 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2851 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2853 GOTO(out, result = cl_mode);
2855 /* Get IO environment */
2856 result = cl_io_get(inode, &env, &io, &refcheck);
2860 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2863 * nothing to do for this io. This currently happens when
2864 * stripe sub-object's are not yet created.
2866 result = io->ci_result;
2867 } else if (result == 0) {
2868 lock = vvp_env_lock(env);
2869 descr = &lock->cll_descr;
2871 descr->cld_obj = io->ci_obj;
2872 /* Convert byte offsets to pages */
2873 descr->cld_start = cl_index(io->ci_obj, start);
2874 descr->cld_end = cl_index(io->ci_obj, end);
2875 descr->cld_mode = cl_mode;
2876 /* CEF_MUST is used because we do not want to convert a
2877 * lockahead request to a lockless lock */
2878 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* async (LF_ASYNC) requests become speculative enqueues so the
 * caller is not blocked waiting for the lock */
2881 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2882 descr->cld_enq_flags |= CEF_SPECULATIVE;
2884 result = cl_lock_request(env, io, lock);
2886 /* On success, we need to release the lock */
2888 cl_lock_release(env, lock);
2890 cl_io_fini(env, io);
2891 cl_env_put(env, &refcheck);
2893 /* -ECANCELED indicates a matching lock with a different extent
2894 * was already present, and -EEXIST indicates a matching lock
2895 * on exactly the same extent was already present.
2896 * We convert them to positive values for userspace to make
2897 * recognizing true errors easier.
2898 * Note we can only return these detailed results on async requests,
2899 * as sync requests look the same as i/o requests for locking. */
2900 if (result == -ECANCELED)
2901 result = LLA_RESULT_DIFFERENT;
2902 else if (result == -EEXIST)
2903 result = LLA_RESULT_SAME;
/* printable names for ladvise advice values, used in error messages */
2908 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry before acting on it:
 *  - the advice value must be recognized;
 *  - per-advice flags must stay within the advice's allowed mask;
 *  - LOCKAHEAD additionally needs a valid lock mode;
 *  - all advices except LOCKNOEXPAND need a sane [start, end) extent.
 * Returns 0 or a negative errno (details logged via CDEBUG).
 * NOTE(review): line-sampled fragment — rc assignments and break
 * statements sit in elided lines.
 */
2910 static int ll_ladvise_sanity(struct inode *inode,
2911 struct llapi_lu_ladvise *ladvise)
2913 enum lu_ladvise_type advice = ladvise->lla_advice;
2914 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2915 * be in the first 32 bits of enum ladvise_flags */
2916 __u32 flags = ladvise->lla_peradvice_flags;
2917 /* 3 lines at 80 characters per line, should be plenty */
2920 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2922 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2923 "last supported advice is %s (value '%d'): rc = %d\n",
2924 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2925 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2929 /* Per-advice checks */
2931 case LU_LADVISE_LOCKNOEXPAND:
2932 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2934 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2936 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2937 ladvise_names[advice], rc);
2941 case LU_LADVISE_LOCKAHEAD:
2942 /* Currently only READ and WRITE modes can be requested */
2943 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2944 ladvise->lla_lockahead_mode == 0) {
2946 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2948 ll_get_fsname(inode->i_sb, NULL, 0),
2949 ladvise->lla_lockahead_mode,
2950 ladvise_names[advice], rc);
2953 case LU_LADVISE_WILLREAD:
2954 case LU_LADVISE_DONTNEED:
2956 /* Note fall through above - These checks apply to all advices
2957 * except LOCKNOEXPAND */
2958 if (flags & ~LF_DEFAULT_MASK) {
2960 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2962 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2963 ladvise_names[advice], rc);
2966 if (ladvise->lla_start >= ladvise->lla_end) {
2968 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2969 "for %s: rc = %d\n",
2970 ll_get_fsname(inode->i_sb, NULL, 0),
2971 ladvise->lla_start, ladvise->lla_end,
2972 ladvise_names[advice], rc);
2984 * Give file access advices
2986 * The ladvise interface is similar to Linux fadvise() system call, except it
2987 * forwards the advices directly from Lustre client to server. The server side
2988 * codes will apply appropriate read-ahead and caching techniques for the
2989 * corresponding files.
2991 * A typical workload for ladvise is e.g. a bunch of different clients are
2992 * doing small random reads of a file, so prefetching pages into OSS cache
2993 * with big linear reads before the random IO is a net benefit. Fetching
2994 * all that data into each client cache with fadvise() may not be, due to
2995 * much more data being sent to the client.
2997 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2998 struct llapi_lu_ladvise *ladvise)
3002 struct cl_ladvise_io *lio;
3007 env = cl_env_get(&refcheck);
3009 RETURN(PTR_ERR(env));
3011 io = vvp_env_thread_io(env);
3012 io->ci_obj = ll_i2info(inode)->lli_clob;
3014 /* initialize parameters for ladvise */
3015 lio = &io->u.ci_ladvise;
3016 lio->li_start = ladvise->lla_start;
3017 lio->li_end = ladvise->lla_end;
3018 lio->li_fid = ll_inode2fid(inode);
3019 lio->li_advice = ladvise->lla_advice;
3020 lio->li_flags = flags;
/* run the advice through the cl_io state machine (CIT_LADVISE) */
3022 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3023 rc = cl_io_loop(env, io);
3027 cl_io_fini(env, io);
3028 cl_env_put(env, &refcheck);
/* Record the LOCKNOEXPAND advice on this file descriptor: disable lock
 * expansion unless LF_UNSET is passed, which re-enables it.
 * NOTE(review): the return statement is in an elided line of this
 * line-sampled fragment. */
3032 static int ll_lock_noexpand(struct file *file, int flags)
3034 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3036 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * LL_IOC_FSGETXATTR handler: report FS_IOC_FSGETXATTR-style attributes.
 * Copies the user's struct fsxattr in, fills fsx_xflags from the inode
 * flags (adding FS_XFLAG_PROJINHERIT when LLIF_PROJECT_INHERIT is set)
 * and fsx_projid from the Lustre inode info, then copies it back out.
 * NOTE(review): the -EFAULT returns after the failed copies are in
 * elided lines of this line-sampled fragment.
 */
3041 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3044 struct fsxattr fsxattr;
3046 if (copy_from_user(&fsxattr,
3047 (const struct fsxattr __user *)arg,
3051 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3052 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3053 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3054 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3055 if (copy_to_user((struct fsxattr __user *)arg,
3056 &fsxattr, sizeof(fsxattr)))
/*
 * LL_IOC_FSSETXATTR handler: apply FS_IOC_FSSETXATTR-style attributes
 * (inode flags derived from fsx_xflags, PROJINHERIT flag, project id).
 *
 * The change is sent to the MDT via md_setattr(); if the file has a
 * data object (lli_clob), the flag change is additionally propagated
 * to the OSTs through cl_setattr_ost().  CAP_SYS_ADMIN is required
 * since the project ID can be changed here.
 *
 * NOTE(review): line-sampled fragment — the declaration of 'attr',
 * some error branches and the out_fsxattr1 label surroundings are in
 * elided lines.
 */
3062 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3066 struct md_op_data *op_data;
3067 struct ptlrpc_request *req = NULL;
3069 struct fsxattr fsxattr;
3070 struct cl_object *obj;
3073 /* only root could change project ID */
3074 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3077 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3078 LUSTRE_OPC_ANY, NULL);
3079 if (IS_ERR(op_data))
3080 RETURN(PTR_ERR(op_data));
3082 if (copy_from_user(&fsxattr,
3083 (const struct fsxattr __user *)arg,
3085 GOTO(out_fsxattr1, rc = -EFAULT);
3087 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3088 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3089 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3090 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3091 op_data->op_projid = fsxattr.fsx_projid;
3092 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3093 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3095 ptlrpc_req_finished(req);
/* mirror the flag change onto the data objects when one exists */
3097 obj = ll_i2info(inode)->lli_clob;
3101 ll_update_inode_flags(inode, op_data->op_attr_flags);
3102 OBD_ALLOC_PTR(attr);
3104 GOTO(out_fsxattr1, rc = -ENOMEM);
3105 attr->ia_valid = ATTR_ATTR_FLAG;
3106 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3111 ll_finish_md_op_data(op_data);
/*
 * Release a file lease — the LL_LEASE_UNLCK path of ll_file_set_lease().
 *
 * Detach the lease's obd_client_handle from the fd under lli_och_mutex,
 * then close it with an optional close intent chosen by ioc->lil_flags:
 *  - LL_LEASE_RESYNC_DONE:  pass the caller's mirror-id array to the MDT;
 *  - LL_LEASE_LAYOUT_MERGE: merge the layout of another open file
 *                           (given by fd number after the ioc struct);
 *  - LL_LEASE_LAYOUT_SPLIT: split a mirror id out into a victim file;
 *  - none of the above:     plain lease close without intent.
 * On success, returns the lease type that was held (via
 * ll_lease_type_from_fmode); otherwise a negative errno.
 *
 * NOTE(review): line-sampled fragment — case closing braces, the
 * fput()/free cleanup in the 'out' path, and several declarations
 * (fdv, mirror_id, lease_broken, fmode, rc) sit in elided lines.
 */
3115 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3118 struct inode *inode = file_inode(file);
3119 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3120 struct ll_inode_info *lli = ll_i2info(inode);
3121 struct obd_client_handle *och = NULL;
3122 struct split_param sp;
3125 enum mds_op_bias bias = 0;
3126 struct file *layout_file = NULL;
3128 size_t data_size = 0;
/* steal the lease handle from the fd so no one else can release it */
3132 mutex_lock(&lli->lli_och_mutex);
3133 if (fd->fd_lease_och != NULL) {
3134 och = fd->fd_lease_och;
3135 fd->fd_lease_och = NULL;
3137 mutex_unlock(&lli->lli_och_mutex);
3140 GOTO(out, rc = -ENOLCK);
3142 fmode = och->och_flags;
3144 switch (ioc->lil_flags) {
3145 case LL_LEASE_RESYNC_DONE:
3146 if (ioc->lil_count > IOC_IDS_MAX)
3147 GOTO(out, rc = -EINVAL);
3149 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3150 OBD_ALLOC(data, data_size);
3152 GOTO(out, rc = -ENOMEM);
3154 if (copy_from_user(data, (void __user *)arg, data_size))
3155 GOTO(out, rc = -EFAULT);
3157 bias = MDS_CLOSE_RESYNC_DONE;
3159 case LL_LEASE_LAYOUT_MERGE: {
3162 if (ioc->lil_count != 1)
3163 GOTO(out, rc = -EINVAL);
3165 arg += sizeof(*ioc);
/* NOTE(review): 'fd' here is presumably a local __u32 declared in
 * the elided line of this case block, shadowing the ll_file_data
 * pointer above — confirm against the full source */
3166 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3167 GOTO(out, rc = -EFAULT);
3169 layout_file = fget(fd);
3171 GOTO(out, rc = -EBADF);
3173 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3174 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3175 GOTO(out, rc = -EPERM);
3177 data = file_inode(layout_file);
3178 bias = MDS_CLOSE_LAYOUT_MERGE;
3181 case LL_LEASE_LAYOUT_SPLIT: {
3185 if (ioc->lil_count != 2)
3186 GOTO(out, rc = -EINVAL);
3188 arg += sizeof(*ioc);
3189 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3190 GOTO(out, rc = -EFAULT);
3192 arg += sizeof(__u32);
3193 if (copy_from_user(&mirror_id, (void __user *)arg,
3195 GOTO(out, rc = -EFAULT);
3197 layout_file = fget(fdv);
3199 GOTO(out, rc = -EBADF);
3201 sp.sp_inode = file_inode(layout_file);
3202 sp.sp_mirror_id = (__u16)mirror_id;
3204 bias = MDS_CLOSE_LAYOUT_SPLIT;
3208 /* without close intent */
3212 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3216 rc = ll_lease_och_release(inode, file);
/* out: per-intent cleanup of the buffers/files acquired above */
3225 switch (ioc->lil_flags) {
3226 case LL_LEASE_RESYNC_DONE:
3228 OBD_FREE(data, data_size);
3230 case LL_LEASE_LAYOUT_MERGE:
3231 case LL_LEASE_LAYOUT_SPLIT:
3238 rc = ll_lease_type_from_fmode(fmode);
/*
 * Acquire (or, for LL_LEASE_UNLCK, release) a file lease
 * (LL_IOC_SET_LEASE / LL_IOC_SET_LEASE_OLD backend).
 *
 * The requested lease mode must be compatible with how the file was
 * opened (WRLCK needs FMODE_WRITE, RDLCK needs FMODE_READ).  With
 * LL_LEASE_RESYNC the open carries MDS_OPEN_RESYNC and the layout is
 * refreshed after the server-side resync setup.  The obtained handle
 * is stored in fd->fd_lease_och under lli_och_mutex; if a lease is
 * already attached there the new one is closed again.
 *
 * NOTE(review): line-sampled fragment — several RETURN/break lines and
 * declarations (fmode, rc, lease_broken) are elided.
 */
3242 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3245 struct inode *inode = file_inode(file);
3246 struct ll_inode_info *lli = ll_i2info(inode);
3247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3248 struct obd_client_handle *och = NULL;
3249 __u64 open_flags = 0;
3255 switch (ioc->lil_mode) {
3256 case LL_LEASE_WRLCK:
3257 if (!(file->f_mode & FMODE_WRITE))
3259 fmode = FMODE_WRITE;
3261 case LL_LEASE_RDLCK:
3262 if (!(file->f_mode & FMODE_READ))
3266 case LL_LEASE_UNLCK:
3267 RETURN(ll_file_unlock_lease(file, ioc, arg));
3272 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3274 /* apply for lease */
3275 if (ioc->lil_flags & LL_LEASE_RESYNC)
3276 open_flags = MDS_OPEN_RESYNC;
3277 och = ll_lease_open(inode, file, fmode, open_flags);
3279 RETURN(PTR_ERR(och));
3281 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3282 rc = ll_lease_file_resync(och, inode);
/* resync setup failed: drop the just-acquired lease */
3284 ll_lease_close(och, inode, NULL);
3287 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3289 ll_lease_close(och, inode, NULL);
/* attach the lease to this fd unless one is already present */
3295 mutex_lock(&lli->lli_och_mutex);
3296 if (fd->fd_lease_och == NULL) {
3297 fd->fd_lease_och = och;
3300 mutex_unlock(&lli->lli_och_mutex);
3302 /* impossible now that only excl is supported for now */
3303 ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular Lustre files (file_operations
 * .unlocked_ioctl).  Each case either handles the command inline or
 * forwards to a dedicated helper (striping, layout swap, HSM state,
 * leases, ladvise, project xattrs, ...).  Unknown commands fall
 * through to obd_iocontrol() against the data export.
 *
 * NOTE(review): line-sampled fragment — the return type line, several
 * declarations (flags, rc, mdtidx, ...), RETURN statements, break
 * lines and closing braces are elided throughout.
 */
3310 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3312 struct inode *inode = file_inode(file);
3313 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3317 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3318 PFID(ll_inode2fid(inode)), inode, cmd);
3319 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3321 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3322 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3326 case LL_IOC_GETFLAGS:
3327 /* Get the current value of the file flags */
3328 return put_user(fd->fd_flags, (int __user *)arg);
3329 case LL_IOC_SETFLAGS:
3330 case LL_IOC_CLRFLAGS:
3331 /* Set or clear specific file flags */
3332 /* XXX This probably needs checks to ensure the flags are
3333 * not abused, and to handle any flag side effects.
3335 if (get_user(flags, (int __user *) arg))
3338 if (cmd == LL_IOC_SETFLAGS) {
3339 if ((flags & LL_FILE_IGNORE_LOCK) &&
3340 !(file->f_flags & O_DIRECT)) {
3341 CERROR("%s: unable to disable locking on "
3342 "non-O_DIRECT file\n", current->comm);
3346 fd->fd_flags |= flags;
3348 fd->fd_flags &= ~flags;
3351 case LL_IOC_LOV_SETSTRIPE:
3352 case LL_IOC_LOV_SETSTRIPE_NEW:
3353 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3354 case LL_IOC_LOV_SETEA:
3355 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3356 case LL_IOC_LOV_SWAP_LAYOUTS: {
3358 struct lustre_swap_layouts lsl;
3360 if (copy_from_user(&lsl, (char __user *)arg,
3361 sizeof(struct lustre_swap_layouts)))
/* both files must be writable to have their layouts swapped */
3364 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3367 file2 = fget(lsl.sl_fd);
3371 /* O_WRONLY or O_RDWR */
3372 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3373 GOTO(out, rc = -EPERM);
3375 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3376 struct inode *inode2;
3377 struct ll_inode_info *lli;
3378 struct obd_client_handle *och = NULL;
3380 lli = ll_i2info(inode);
3381 mutex_lock(&lli->lli_och_mutex);
3382 if (fd->fd_lease_och != NULL) {
3383 och = fd->fd_lease_och;
3384 fd->fd_lease_och = NULL;
3386 mutex_unlock(&lli->lli_och_mutex);
3388 GOTO(out, rc = -ENOLCK);
3389 inode2 = file_inode(file2);
3390 rc = ll_swap_layouts_close(och, inode, inode2);
3392 rc = ll_swap_layouts(file, file2, &lsl);
3398 case LL_IOC_LOV_GETSTRIPE:
3399 case LL_IOC_LOV_GETSTRIPE_NEW:
3400 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3401 case FS_IOC_GETFLAGS:
3402 case FS_IOC_SETFLAGS:
3403 RETURN(ll_iocontrol(inode, file, cmd, arg));
3404 case FSFILT_IOC_GETVERSION:
3405 case FS_IOC_GETVERSION:
3406 RETURN(put_user(inode->i_generation, (int __user *)arg));
3407 /* We need to special case any other ioctls we want to handle,
3408 * to send them to the MDS/OST as appropriate and to properly
3409 * network encode the arg field. */
3410 case FS_IOC_SETVERSION:
3413 case LL_IOC_GROUP_LOCK:
3414 RETURN(ll_get_grouplock(inode, file, arg));
3415 case LL_IOC_GROUP_UNLOCK:
3416 RETURN(ll_put_grouplock(inode, file, arg));
3417 case IOC_OBD_STATFS:
3418 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3420 case LL_IOC_FLUSHCTX:
3421 RETURN(ll_flush_ctx(inode));
3422 case LL_IOC_PATH2FID: {
3423 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3424 sizeof(struct lu_fid)))
3429 case LL_IOC_GETPARENT:
3430 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3432 case OBD_IOC_FID2PATH:
3433 RETURN(ll_fid2path(inode, (void __user *)arg));
3434 case LL_IOC_DATA_VERSION: {
3435 struct ioc_data_version idv;
3438 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the defined flush flags may be passed through */
3441 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3442 rc = ll_ioc_data_version(inode, &idv);
3445 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3451 case LL_IOC_GET_MDTIDX: {
3454 mdtidx = ll_get_mdt_idx(inode);
3458 if (put_user((int)mdtidx, (int __user *)arg))
3463 case OBD_IOC_GETDTNAME:
3464 case OBD_IOC_GETMDNAME:
3465 RETURN(ll_get_obd_name(inode, cmd, arg));
3466 case LL_IOC_HSM_STATE_GET: {
3467 struct md_op_data *op_data;
3468 struct hsm_user_state *hus;
3475 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3476 LUSTRE_OPC_ANY, hus);
3477 if (IS_ERR(op_data)) {
3479 RETURN(PTR_ERR(op_data));
3482 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3485 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3488 ll_finish_md_op_data(op_data);
3492 case LL_IOC_HSM_STATE_SET: {
3493 struct hsm_state_set *hss;
3500 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3505 rc = ll_hsm_state_set(inode, hss);
3510 case LL_IOC_HSM_ACTION: {
3511 struct md_op_data *op_data;
3512 struct hsm_current_action *hca;
3519 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3520 LUSTRE_OPC_ANY, hca);
3521 if (IS_ERR(op_data)) {
3523 RETURN(PTR_ERR(op_data));
3526 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3529 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3532 ll_finish_md_op_data(op_data);
3536 case LL_IOC_SET_LEASE_OLD: {
/* legacy interface: the mode comes directly in 'arg' */
3537 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3539 RETURN(ll_file_set_lease(file, &ioc, 0));
3541 case LL_IOC_SET_LEASE: {
3542 struct ll_ioc_lease ioc;
3544 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3547 RETURN(ll_file_set_lease(file, &ioc, arg));
3549 case LL_IOC_GET_LEASE: {
3550 struct ll_inode_info *lli = ll_i2info(inode);
3551 struct ldlm_lock *lock = NULL;
3554 mutex_lock(&lli->lli_och_mutex);
3555 if (fd->fd_lease_och != NULL) {
3556 struct obd_client_handle *och = fd->fd_lease_och;
3558 lock = ldlm_handle2lock(&och->och_lease_handle);
3560 lock_res_and_lock(lock);
3561 if (!ldlm_is_cancel(lock))
3562 fmode = och->och_flags;
3564 unlock_res_and_lock(lock);
3565 LDLM_LOCK_PUT(lock);
3568 mutex_unlock(&lli->lli_och_mutex);
3570 RETURN(ll_lease_type_from_fmode(fmode));
3572 case LL_IOC_HSM_IMPORT: {
3573 struct hsm_user_import *hui;
3579 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3584 rc = ll_hsm_import(inode, file, hui);
3589 case LL_IOC_FUTIMES_3: {
3590 struct ll_futimes_3 lfu;
3592 if (copy_from_user(&lfu,
3593 (const struct ll_futimes_3 __user *)arg,
3597 RETURN(ll_file_futimes_3(file, &lfu));
3599 case LL_IOC_LADVISE: {
3600 struct llapi_ladvise_hdr *k_ladvise_hdr;
3601 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3604 int alloc_size = sizeof(*k_ladvise_hdr);
3607 u_ladvise_hdr = (void __user *)arg;
/* first read just the fixed header to learn the advice count */
3608 OBD_ALLOC_PTR(k_ladvise_hdr);
3609 if (k_ladvise_hdr == NULL)
3612 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3613 GOTO(out_ladvise, rc = -EFAULT);
3615 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3616 k_ladvise_hdr->lah_count < 1)
3617 GOTO(out_ladvise, rc = -EINVAL);
3619 num_advise = k_ladvise_hdr->lah_count;
3620 if (num_advise >= LAH_COUNT_MAX)
3621 GOTO(out_ladvise, rc = -EFBIG);
/* reallocate at the full size and re-copy header plus advices */
3623 OBD_FREE_PTR(k_ladvise_hdr);
3624 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3625 lah_advise[num_advise]);
3626 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3627 if (k_ladvise_hdr == NULL)
3631 * TODO: submit multiple advices to one server in a single RPC
3633 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3634 GOTO(out_ladvise, rc = -EFAULT);
3636 for (i = 0; i < num_advise; i++) {
3637 struct llapi_lu_ladvise *k_ladvise =
3638 &k_ladvise_hdr->lah_advise[i];
3639 struct llapi_lu_ladvise __user *u_ladvise =
3640 &u_ladvise_hdr->lah_advise[i];
3642 rc = ll_ladvise_sanity(inode, k_ladvise);
3644 GOTO(out_ladvise, rc);
3646 switch (k_ladvise->lla_advice) {
3647 case LU_LADVISE_LOCKNOEXPAND:
3648 rc = ll_lock_noexpand(file,
3649 k_ladvise->lla_peradvice_flags);
3650 GOTO(out_ladvise, rc);
3651 case LU_LADVISE_LOCKAHEAD:
3653 rc = ll_file_lock_ahead(file, k_ladvise);
3656 GOTO(out_ladvise, rc);
3659 &u_ladvise->lla_lockahead_result))
3660 GOTO(out_ladvise, rc = -EFAULT);
3663 rc = ll_ladvise(inode, file,
3664 k_ladvise_hdr->lah_flags,
3667 GOTO(out_ladvise, rc);
3674 OBD_FREE(k_ladvise_hdr, alloc_size);
3677 case LL_IOC_FLR_SET_MIRROR: {
3678 /* mirror I/O must be direct to avoid polluting page cache
3680 if (!(file->f_flags & O_DIRECT))
3683 fd->fd_designated_mirror = (__u32)arg;
3686 case LL_IOC_FSGETXATTR:
3687 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3688 case LL_IOC_FSSETXATTR:
3689 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3691 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* default: pass unrecognized commands down to the data export */
3693 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3694 (void __user *)arg));
/* Compatibility path for kernels without generic_file_llseek_size():
 * clamp/validate @offset against @maxsize and commit it to f_pos,
 * resetting f_version so cached dir/file state is revalidated. */
3698 #ifndef HAVE_FILE_LLSEEK_SIZE
3699 static inline loff_t
3700 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3702 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3704 if (offset > maxsize)
3707 if (offset != file->f_pos) {
3708 file->f_pos = offset;
3709 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for old kernels (compiled
 * under the #ifndef HAVE_FILE_LLSEEK_SIZE above): llseek with an
 * explicit max offset and an @eof value the caller has already
 * determined (Lustre gets it from a glimpse).
 * NOTE(review): line-sampled fragment — the switch(origin) scaffolding
 * and the SEEK_END/SEEK_DATA/SEEK_HOLE arithmetic lines are elided.
 */
3715 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3716 loff_t maxsize, loff_t eof)
3718 struct inode *inode = file_inode(file);
3726 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3727 * position-querying operation. Avoid rewriting the "same"
3728 * f_pos value back to the file because a concurrent read(),
3729 * write() or lseek() might have altered it
3734 * f_lock protects against read/modify/write race with other
3735 * SEEK_CURs. Note that parallel writes and reads behave
3739 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3740 inode_unlock(inode);
3744 * In the generic case the entire file is data, so as long as
3745 * offset isn't at the end of the file then the offset is data.
3752 * There is a virtual hole at the end of the file, so as long as
3753 * offset isn't i_size or larger, return i_size.
3761 return llseek_execute(file, offset, maxsize);
/*
 * llseek for Lustre files (file_operations .llseek).  For SEEK_END,
 * SEEK_HOLE and SEEK_DATA the cluster-wide size must be current, so a
 * glimpse (ll_glimpse_size) is performed before reading i_size; the
 * actual seek arithmetic is delegated to ll_generic_file_llseek_size()
 * bounded by ll_file_maxbytes().
 */
3765 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3767 struct inode *inode = file_inode(file);
3768 loff_t retval, eof = 0;
3771 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3772 (origin == SEEK_CUR) ? file->f_pos : 0);
3773 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3774 PFID(ll_inode2fid(inode)), inode, retval, retval,
3776 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3778 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3779 retval = ll_glimpse_size(inode);
3782 eof = i_size_read(inode);
3785 retval = ll_generic_file_llseek_size(file, offset, origin,
3786 ll_file_maxbytes(inode), eof);
/*
 * file_operations .flush: report (without re-reporting) asynchronous
 * writeback errors recorded on the inode.  Collects lli_async_rc and
 * the per-object async rc from the LOV layer; returns -EIO if any
 * error was pending and the application has not already been told
 * (fd_write_failed), else 0.
 */
3790 static int ll_flush(struct file *file, fl_owner_t id)
3792 struct inode *inode = file_inode(file);
3793 struct ll_inode_info *lli = ll_i2info(inode);
3794 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3797 LASSERT(!S_ISDIR(inode->i_mode));
3799 /* catch async errors that were recorded back when async writeback
3800 * failed for pages in this mapping. */
3801 rc = lli->lli_async_rc;
3802 lli->lli_async_rc = 0;
3803 if (lli->lli_clob != NULL) {
3804 err = lov_read_and_clear_async_rc(lli->lli_clob);
3809 /* The application has been told write failure already.
3810 * Do not report failure again. */
3811 if (fd->fd_write_failed)
3813 return rc ? -EIO : 0;
3817 * Called to make sure a portion of file has been written out.
3818 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3820 * Return how many pages have been written.
3822 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3823 enum cl_fsync_mode mode, int ignore_layout)
3827 struct cl_fsync_io *fio;
3832 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3833 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3836 env = cl_env_get(&refcheck);
3838 RETURN(PTR_ERR(env));
3840 io = vvp_env_thread_io(env);
3841 io->ci_obj = ll_i2info(inode)->lli_clob;
3842 io->ci_ignore_layout = ignore_layout;
3844 /* initialize parameters for sync */
3845 fio = &io->u.ci_fsync;
3846 fio->fi_start = start;
3848 fio->fi_fid = ll_inode2fid(inode);
3849 fio->fi_mode = mode;
3850 fio->fi_nr_written = 0;
/* run the fsync through the cl_io state machine (CIT_FSYNC) */
3852 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3853 result = cl_io_loop(env, io);
3855 result = io->ci_result;
/* on success return the page count instead of 0 */
3857 result = fio->fi_nr_written;
3858 cl_io_fini(env, io);
3859 cl_env_put(env, &refcheck);
3865 * When dentry is provided (the 'else' case), file_dentry() may be
3866 * null and dentry must be used directly rather than pulled from
3867 * file_dentry() as is done otherwise.
/*
 * fsync for Lustre files, with three prototypes selected by kernel
 * version (4-arg ranged, 2-arg, or legacy dentry form).  Flushes dirty
 * pages locally, harvests recorded async write errors, fsyncs the
 * metadata via md_fsync(), and for regular files pushes the data range
 * to the OSTs with cl_sync_file_range(CL_FSYNC_ALL), updating
 * fd_write_failed accordingly.
 */
3870 #ifdef HAVE_FILE_FSYNC_4ARGS
3871 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3873 struct dentry *dentry = file_dentry(file);
3875 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3876 int ll_fsync(struct file *file, int datasync)
3878 struct dentry *dentry = file_dentry(file);
3880 loff_t end = LLONG_MAX;
3882 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3885 loff_t end = LLONG_MAX;
3887 struct inode *inode = dentry->d_inode;
3888 struct ll_inode_info *lli = ll_i2info(inode);
3889 struct ptlrpc_request *req;
3893 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3894 PFID(ll_inode2fid(inode)), inode);
3895 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3897 #ifdef HAVE_FILE_FSYNC_4ARGS
3898 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* avoid double-locking when the caller already holds the inode lock */
3899 lock_inode = !lli->lli_inode_locked;
3903 /* fsync's caller has already called _fdata{sync,write}, we want
3904 * that IO to finish before calling the osc and mdc sync methods */
3905 rc = filemap_fdatawait(inode->i_mapping);
3908 /* catch async errors that were recorded back when async writeback
3909 * failed for pages in this mapping. */
3910 if (!S_ISDIR(inode->i_mode)) {
3911 err = lli->lli_async_rc;
3912 lli->lli_async_rc = 0;
3915 if (lli->lli_clob != NULL) {
3916 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata with the MDT */
3922 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3926 ptlrpc_req_finished(req);
3928 if (S_ISREG(inode->i_mode)) {
3929 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3931 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3932 if (rc == 0 && err < 0)
3935 fd->fd_write_failed = true;
3937 fd->fd_write_failed = false;
3940 #ifdef HAVE_FILE_FSYNC_4ARGS
3942 inode_unlock(inode);
/*
 * flock()/fcntl() lock handler (file_operations .flock/.lock).
 *
 * Translates the kernel file_lock into an LDLM_FLOCK enqueue against
 * the MDT: BSD flocks become whole-file locks keyed by the struct file
 * pointer, POSIX locks keep their byte range and owner.  F_UNLCK is
 * modeled as an LCK_NL enqueue (see comment below), F_GETLK variants
 * use LDLM_FL_TEST_LOCK.  On success the lock is also registered with
 * the local VFS lock bookkeeping; if that fails the remote lock is
 * rolled back with an LCK_NL enqueue.
 *
 * NOTE(review): line-sampled fragment — the switch scaffolding for
 * fl_type/cmd, several RETURN lines and some case labels are elided.
 */
3948 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3950 struct inode *inode = file_inode(file);
3951 struct ll_sb_info *sbi = ll_i2sbi(inode);
3952 struct ldlm_enqueue_info einfo = {
3953 .ei_type = LDLM_FLOCK,
3954 .ei_cb_cp = ldlm_flock_completion_ast,
3955 .ei_cbdata = file_lock,
3957 struct md_op_data *op_data;
3958 struct lustre_handle lockh = { 0 };
3959 union ldlm_policy_data flock = { { 0 } };
3960 int fl_type = file_lock->fl_type;
3966 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3967 PFID(ll_inode2fid(inode)), file_lock);
3969 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3971 if (file_lock->fl_flags & FL_FLOCK) {
3972 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3973 /* flocks are whole-file locks */
3974 flock.l_flock.end = OFFSET_MAX;
3975 /* For flocks owner is determined by the local file desctiptor*/
3976 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3977 } else if (file_lock->fl_flags & FL_POSIX) {
3978 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3979 flock.l_flock.start = file_lock->fl_start;
3980 flock.l_flock.end = file_lock->fl_end;
3984 flock.l_flock.pid = file_lock->fl_pid;
3986 /* Somewhat ugly workaround for svc lockd.
3987 * lockd installs custom fl_lmops->lm_compare_owner that checks
3988 * for the fl_owner to be the same (which it always is on local node
3989 * I guess between lockd processes) and then compares pid.
3990 * As such we assign pid to the owner field to make it all work,
3991 * conflict with normal locks is unlikely since pid space and
3992 * pointer space for current->files are not intersecting */
3993 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3994 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3998 einfo.ei_mode = LCK_PR;
4001 /* An unlock request may or may not have any relation to
4002 * existing locks so we may not be able to pass a lock handle
4003 * via a normal ldlm_lock_cancel() request. The request may even
4004 * unlock a byte range in the middle of an existing lock. In
4005 * order to process an unlock request we need all of the same
4006 * information that is given with a normal read or write record
4007 * lock request. To avoid creating another ldlm unlock (cancel)
4008 * message we'll treat a LCK_NL flock request as an unlock. */
4009 einfo.ei_mode = LCK_NL;
4012 einfo.ei_mode = LCK_PW;
4015 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4030 flags = LDLM_FL_BLOCK_NOWAIT;
4036 flags = LDLM_FL_TEST_LOCK;
4039 CERROR("unknown fcntl lock command: %d\n", cmd);
4043 /* Save the old mode so that if the mode in the lock changes we
4044 * can decrement the appropriate reader or writer refcount. */
4045 file_lock->fl_type = einfo.ei_mode;
4047 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4048 LUSTRE_OPC_ANY, NULL);
4049 if (IS_ERR(op_data))
4050 RETURN(PTR_ERR(op_data));
4052 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4053 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4054 flock.l_flock.pid, flags, einfo.ei_mode,
4055 flock.l_flock.start, flock.l_flock.end);
4057 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4060 /* Restore the file lock type if not TEST lock. */
4061 if (!(flags & LDLM_FL_TEST_LOCK))
4062 file_lock->fl_type = fl_type;
/* mirror the result into the local VFS lock tables */
4064 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4065 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4066 !(flags & LDLM_FL_TEST_LOCK))
4067 rc2 = locks_lock_file_wait(file, file_lock);
4069 if ((file_lock->fl_flags & FL_FLOCK) &&
4070 (rc == 0 || file_lock->fl_type == F_UNLCK))
4071 rc2 = flock_lock_file_wait(file, file_lock);
4072 if ((file_lock->fl_flags & FL_POSIX) &&
4073 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4074 !(flags & LDLM_FL_TEST_LOCK))
4075 rc2 = posix_lock_file_wait(file, file_lock);
4076 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4078 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: release the server-side lock */
4079 einfo.ei_mode = LCK_NL;
4080 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4085 ll_finish_md_op_data(op_data);
/*
 * Resolve the FID of entry @name (length @namelen) in directory @parent
 * by issuing a getattr-by-name RPC to the MDS.  On success *fid is
 * copied from the reply body and, when @inode is non-NULL, the child
 * inode is instantiated from the same reply.
 */
4090 int ll_get_fid_by_name(struct inode *parent, const char *name,
4091 int namelen, struct lu_fid *fid,
4092 struct inode **inode)
4094 struct md_op_data *op_data = NULL;
4095 struct mdt_body *body;
4096 struct ptlrpc_request *req;
/* Only the FID and file type are requested back from the MDS. */
4100 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4101 LUSTRE_OPC_ANY, NULL);
4102 if (IS_ERR(op_data))
4103 RETURN(PTR_ERR(op_data));
4105 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4106 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4107 ll_finish_md_op_data(op_data);
/* A reply without an MDT body is a protocol error. */
4111 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4113 GOTO(out_req, rc = -EFAULT);
4115 *fid = body->mbo_fid1;
/* Caller also wants the inode: build it from the getattr reply. */
4118 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4120 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to another MDT, as
 * described by the user-supplied LMV layout @lum.  The migration is
 * sent to the MDS as a same-source/same-target rename carrying the
 * CLI_MIGRATE flag.  For regular files a write lease is taken first so
 * the data version can be pinned across the operation.
 */
4124 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4127 struct dentry *dchild = NULL;
4128 struct inode *child_inode = NULL;
4129 struct md_op_data *op_data;
4130 struct ptlrpc_request *request = NULL;
4131 struct obd_client_handle *och = NULL;
4133 struct mdt_body *body;
4134 __u64 data_version = 0;
4135 size_t namelen = strlen(name);
4136 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4140 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4141 PFID(ll_inode2fid(parent)), name,
4142 lum->lum_stripe_offset, lum->lum_stripe_count);
/* Byte-swap the user LMV unless its magic already shows little-endian
 * wire order. */
4144 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4145 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4146 lustre_swab_lmv_user_md(lum);
4148 /* Get child FID first */
4149 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4152 dchild = d_lookup(file_dentry(file), &qstr);
4154 if (dchild->d_inode)
4155 child_inode = igrab(dchild->d_inode);
4160 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4170 * lfs migrate command needs to be blocked on the client
4171 * by checking the migrate FID against the FID of the
4174 if (child_inode == parent->i_sb->s_root->d_inode)
4175 GOTO(out_iput, rc = -EINVAL);
4177 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4178 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4179 if (IS_ERR(op_data))
4180 GOTO(out_iput, rc = PTR_ERR(op_data));
/* Hold the child's inode lock for the whole migration. */
4182 inode_lock(child_inode);
4183 op_data->op_fid3 = *ll_inode2fid(child_inode);
4184 if (!fid_is_sane(&op_data->op_fid3)) {
4185 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4186 ll_get_fsname(parent->i_sb, NULL, 0), name,
4187 PFID(&op_data->op_fid3));
4188 GOTO(out_unlock, rc = -EINVAL);
4191 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4192 op_data->op_data = lum;
4193 op_data->op_data_size = lumlen;
/* Regular file: take a write lease and record the current data
 * version so the server can detect concurrent modification. */
4196 if (S_ISREG(child_inode->i_mode)) {
4197 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4201 GOTO(out_unlock, rc);
4204 rc = ll_data_version(child_inode, &data_version,
4207 GOTO(out_close, rc);
4209 op_data->op_handle = och->och_fh;
4210 op_data->op_data_version = data_version;
4211 op_data->op_lease_handle = och->och_lease_handle;
4212 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* Do not replay the open across recovery while migrating. */
4214 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4215 och->och_mod->mod_open_req->rq_replay = 0;
4216 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Migration is expressed as a rename to the same name. */
4219 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4220 name, namelen, &request);
4222 LASSERT(request != NULL);
4223 ll_update_times(request, parent);
4225 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4226 LASSERT(body != NULL);
4228 /* If the server does release layout lock, then we cleanup
4229 * the client och here, otherwise release it in out_close: */
4230 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4231 obd_mod_put(och->och_mod);
4232 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4234 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4240 if (request != NULL) {
4241 ptlrpc_req_finished(request);
4245 /* Try again if the file layout has changed. */
4246 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4251 ll_lease_close(och, child_inode, NULL);
4253 clear_nlink(child_inode);
4255 inode_unlock(child_inode);
4256 ll_finish_md_op_data(op_data);
/* Lock method wired into ll_file_operations_noflock below for
 * "-o noflock" mounts; per the comment there it refuses flock calls. */
4263 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4271 * test if some locks matching bits and l_req_mode are acquired
4272 * - bits can be in different locks
4273 * - if found clear the common lock bits in *bits
4274 * - the bits not found, are kept in *bits
 * \param inode [IN] inode whose cached MDC inodebits locks are searched
4276 * \param bits [IN] searched lock bits [IN]
4277 * \param l_req_mode [IN] searched lock mode
4278 * \retval boolean, true iff all bits are found
4280 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4282 struct lustre_handle lockh;
4283 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all of CR/CW/PR/PW. */
4284 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4285 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4294 fid = &ll_i2info(inode)->lli_fid;
4295 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4296 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a match, do not take a reference. */
4298 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebits bit individually; a matched lock may
 * cover (and therefore clear) several of the searched bits at once. */
4299 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4300 policy.l_inodebits.bits = *bits & (1 << i);
4301 if (policy.l_inodebits.bits == 0)
4304 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4305 &policy, mode, &lockh)) {
4306 struct ldlm_lock *lock;
4308 lock = ldlm_handle2lock(&lockh);
4311 ~(lock->l_policy_data.l_inodebits.bits);
4312 LDLM_LOCK_PUT(lock);
4314 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match a cached MDC inodebits lock on @inode covering @bits in
 * one of the modes in @mode.  On a hit *lockh is filled with the lock
 * handle; rc is the result of md_lock_match() (0 on no match).
 */
4321 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4322 struct lustre_handle *lockh, __u64 flags,
4323 enum ldlm_mode mode)
4325 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4330 fid = &ll_i2info(inode)->lli_fid;
4331 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4333 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4334 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the return code of an MDS revalidation of @inode:
 * translate -ENOENT into the appropriate action for an unlinked
 * object, and log unexpected failures.
 */
4339 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4341 /* Already unlinked. Just update nlink and return success */
4342 if (rc == -ENOENT) {
4344 /* If it is striped directory, and there is bad stripe
4345 * Let's revalidate the dentry again, instead of returning
4347 if (S_ISDIR(inode->i_mode) &&
4348 ll_i2info(inode)->lli_lsm_md != NULL)
4351 /* This path cannot be hit for regular files unless in
4352 * case of obscure races, so no need to to validate
4354 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4356 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission/identity); log quietly. */
4357 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4358 "%s: revalidate FID "DFID" error: rc = %d\n",
4359 ll_get_fsname(inode->i_sb, NULL, 0),
4360 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDS with intent @op
 * (e.g. IT_GETATTR/IT_LOOKUP) via an intent-lock RPC keyed by FID.
 */
4366 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4368 struct inode *inode = dentry->d_inode;
4369 struct obd_export *exp = ll_i2mdexp(inode);
4370 struct lookup_intent oit = {
4373 struct ptlrpc_request *req = NULL;
4374 struct md_op_data *op_data;
4378 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4379 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4381 /* Call getattr by fid, so do not provide name at all. */
4382 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4383 LUSTRE_OPC_ANY, NULL);
4384 if (IS_ERR(op_data))
4385 RETURN(PTR_ERR(op_data));
4387 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4388 ll_finish_md_op_data(op_data);
/* Map -ENOENT and other failures to the right caller-visible rc. */
4390 rc = ll_inode_revalidate_fini(inode, rc);
4394 rc = ll_revalidate_it_finish(req, &oit, dentry);
4396 ll_intent_release(&oit);
4400 /* Unlinked? Unhash dentry, so it is not picked up later by
4401 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4402 * here to preserve get_cwd functionality on 2.6.
4404 if (!dentry->d_inode->i_nlink) {
4405 ll_lock_dcache(inode);
4406 d_lustre_invalidate(dentry, 0);
4407 ll_unlock_dcache(inode);
4410 ll_lookup_finish_locks(&oit, dentry);
4412 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe metadata attributes
 * (nlink, blocks, size, a/m/ctime) via md_merge_attr() and store the
 * result into the inode and ll_inode_info.
 */
4417 static int ll_merge_md_attr(struct inode *inode)
4419 struct cl_attr attr = { 0 };
/* Only meaningful for striped directories (lli_lsm_md set). */
4422 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4423 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4424 &attr, ll_md_blocking_ast);
4428 set_nlink(inode, attr.cat_nlink);
4429 inode->i_blocks = attr.cat_blocks;
4430 i_size_write(inode, attr.cat_size);
4432 ll_i2info(inode)->lli_atime = attr.cat_atime;
4433 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4434 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Encode a dev_t so it survives the 32-bit compat stat path (see the
 * explanation below). */
4439 static inline dev_t ll_compat_encode_dev(dev_t dev)
4441 /* The compat_sys_*stat*() syscalls will fail unless the
4442 * device majors and minors are both less than 256. Note that
4443 * the value returned here will be passed through
4444 * old_encode_dev() in cp_compat_stat(). And so we are not
4445 * trying to return a valid compat (u16) device number, just
4446 * one that will pass the old_valid_dev() check. */
4448 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * VFS ->getattr(): revalidate the inode against the MDS, glimpse the
 * file size from the OSTs for regular files, merge stripe attributes
 * for striped directories, then fill *stat.  Two signatures are
 * supported depending on the kernel's inode_operations ABI.
 */
4451 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4452 int ll_getattr(const struct path *path, struct kstat *stat,
4453 u32 request_mask, unsigned int flags)
4455 struct dentry *de = path->dentry;
4457 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4460 struct inode *inode = de->d_inode;
4461 struct ll_sb_info *sbi = ll_i2sbi(inode);
4462 struct ll_inode_info *lli = ll_i2info(inode);
4465 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4467 rc = ll_inode_revalidate(de, IT_GETATTR);
4471 if (S_ISREG(inode->i_mode)) {
4472 /* In case of restore, the MDT has the right size and has
4473 * already sent it back without granting the layout lock,
4474 * inode is up-to-date so glimpse is useless.
4475 * Also to glimpse we need the layout, in case of a running
4476 * restore the MDT holds the layout lock so the glimpse will
4477 * block up to the end of restore (getattr will block)
4479 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4480 rc = ll_glimpse_size(inode);
4485 /* If the object isn't a regular file then don't validate size. */
4486 if (S_ISDIR(inode->i_mode) &&
4487 lli->lli_lsm_md != NULL) {
4488 rc = ll_merge_md_attr(inode);
4493 LTIME_S(inode->i_atime) = lli->lli_atime;
4494 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4495 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4498 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace on a 64-bit kernel needs squashed ino/dev values. */
4500 if (ll_need_32bit_api(sbi)) {
4501 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4502 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4503 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4505 stat->ino = inode->i_ino;
4506 stat->dev = inode->i_sb->s_dev;
4507 stat->rdev = inode->i_rdev;
4510 stat->mode = inode->i_mode;
4511 stat->uid = inode->i_uid;
4512 stat->gid = inode->i_gid;
4513 stat->atime = inode->i_atime;
4514 stat->mtime = inode->i_mtime;
4515 stat->ctime = inode->i_ctime;
/* Prefer the tunable stat blocksize when the admin has set one. */
4516 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4518 stat->nlink = inode->i_nlink;
4519 stat->size = i_size_read(inode);
4520 stat->blocks = inode->i_blocks;
/*
 * FIEMAP handler: translate the kernel's fiemap_extent_info into the
 * Lustre struct fiemap, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer.
 */
4525 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4526 __u64 start, __u64 len)
4530 struct fiemap *fiemap;
4531 unsigned int extent_count = fieinfo->fi_extents_max;
/* One allocation holds the header plus all requested extent slots. */
4533 num_bytes = sizeof(*fiemap) + (extent_count *
4534 sizeof(struct fiemap_extent));
4535 OBD_ALLOC_LARGE(fiemap, num_bytes);
4540 fiemap->fm_flags = fieinfo->fi_flags;
4541 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4542 fiemap->fm_start = start;
4543 fiemap->fm_length = len;
/* NOTE(review): only the FIRST extent is copied in from userspace
 * here (one sizeof(struct fiemap_extent)); presumably ll_do_fiemap()
 * only consumes extent 0 as a continuation cookie — confirm. */
4544 if (extent_count > 0 &&
4545 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4546 sizeof(struct fiemap_extent)) != 0)
4547 GOTO(out, rc = -EFAULT);
4549 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4551 fieinfo->fi_flags = fiemap->fm_flags;
4552 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4553 if (extent_count > 0 &&
4554 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4555 fiemap->fm_mapped_extents *
4556 sizeof(struct fiemap_extent)) != 0)
4557 GOTO(out, rc = -EFAULT);
4559 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * lli_lock protects lli_posix_acl against concurrent update.
 */
4563 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4565 struct ll_inode_info *lli = ll_i2info(inode);
4566 struct posix_acl *acl = NULL;
4569 spin_lock(&lli->lli_lock);
4570 /* VFS' acl_permission_check->check_acl will release the refcount */
4571 acl = posix_acl_dup(lli->lli_posix_acl);
4572 spin_unlock(&lli->lli_lock);
/*
 * VFS ->set_acl(): serialize @acl to its xattr representation and
 * store it on the MDS via md_setxattr(); a NULL @acl removes the
 * xattr.  The local ACL cache is updated to match on success.
 */
4577 #ifdef HAVE_IOP_SET_ACL
4578 #ifdef CONFIG_FS_POSIX_ACL
4579 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4581 struct ll_sb_info *sbi = ll_i2sbi(inode);
4582 struct ptlrpc_request *req = NULL;
4583 const char *name = NULL;
4585 size_t value_size = 0;
4590 case ACL_TYPE_ACCESS:
4591 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* Access ACL may need to adjust the file mode bits too. */
4593 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4596 case ACL_TYPE_DEFAULT:
4597 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs are only valid on directories. */
4598 if (!S_ISDIR(inode->i_mode))
4599 rc = acl ? -EACCES : 0;
4610 value_size = posix_acl_xattr_size(acl->a_count);
4611 value = kmalloc(value_size, GFP_NOFS);
4613 GOTO(out, rc = -ENOMEM);
4615 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4617 GOTO(out_value, rc);
/* NULL value means "remove the ACL xattr" on the MDS side. */
4620 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4621 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4622 name, value, value_size, 0, 0, &req);
4624 ptlrpc_req_finished(req);
4629 forget_cached_acl(inode, type);
4631 set_cached_acl(inode, type, acl);
4634 #endif /* CONFIG_FS_POSIX_ACL */
4635 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL permission callback used by ll_generic_permission() on kernels
 * without the 2-arg generic_permission().  Checks @mask against the
 * inode's access ACL; without CONFIG_FS_POSIX_ACL it is a stub.
 */
4637 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4639 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4640 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4642 ll_check_acl(struct inode *inode, int mask)
4645 # ifdef CONFIG_FS_POSIX_ACL
4646 struct posix_acl *acl;
4650 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot take locks / sleep in RCU walk mode. */
4651 if (flags & IPERM_FLAG_RCU)
4654 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4659 rc = posix_acl_permission(inode, acl, mask);
4660 posix_acl_release(acl);
4663 # else /* !CONFIG_FS_POSIX_ACL */
4665 # endif /* CONFIG_FS_POSIX_ACL */
4667 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission(): revalidate the root inode if needed, apply
 * root-squash to the current credentials when configured, then run
 * the generic permission check.  Three signatures are supported
 * depending on the kernel ABI.
 */
4669 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4670 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4672 # ifdef HAVE_INODE_PERMISION_2ARGS
4673 int ll_inode_permission(struct inode *inode, int mask)
4675 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4680 struct ll_sb_info *sbi;
4681 struct root_squash_info *squash;
4682 struct cred *cred = NULL;
4683 const struct cred *old_cred = NULL;
4685 bool squash_id = false;
/* RCU-walk mode cannot block on RPCs; ask VFS to retry in ref-walk. */
4688 #ifdef MAY_NOT_BLOCK
4689 if (mask & MAY_NOT_BLOCK)
4691 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4692 if (flags & IPERM_FLAG_RCU)
4696 /* as root inode are NOT getting validated in lookup operation,
4697 * need to do it before permission check. */
4699 if (inode == inode->i_sb->s_root->d_inode) {
4700 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4705 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4706 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4708 /* squash fsuid/fsgid if needed */
4709 sbi = ll_i2sbi(inode);
4710 squash = &sbi->ll_squash;
4711 if (unlikely(squash->rsi_uid != 0 &&
4712 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4713 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4717 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4718 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4719 squash->rsi_uid, squash->rsi_gid);
4721 /* update current process's credentials
4722 * and FS capability */
4723 cred = prepare_creds();
4727 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4728 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4729 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4730 if ((1 << cap) & CFS_CAP_FS_MASK)
4731 cap_lower(cred->cap_effective, cap);
4733 old_cred = override_creds(cred);
4736 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4737 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4738 /* restore current process's credentials and FS capability */
4740 revert_creds(old_cred);
/* Default file methods table: no .flock/.lock entries are installed,
 * in contrast to the flock/noflock variants below. */
4747 /* -o localflock - only provides locally consistent flock locks */
4748 struct file_operations ll_file_operations = {
4749 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4750 # ifdef HAVE_SYNC_READ_WRITE
4751 .read = new_sync_read,
4752 .write = new_sync_write,
4754 .read_iter = ll_file_read_iter,
4755 .write_iter = ll_file_write_iter,
4756 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4757 .read = ll_file_read,
4758 .aio_read = ll_file_aio_read,
4759 .write = ll_file_write,
4760 .aio_write = ll_file_aio_write,
4761 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4762 .unlocked_ioctl = ll_file_ioctl,
4763 .open = ll_file_open,
4764 .release = ll_file_release,
4765 .mmap = ll_file_mmap,
4766 .llseek = ll_file_seek,
4767 .splice_read = ll_file_splice_read,
/* File methods for "-o flock" mounts: identical to ll_file_operations
 * but with .flock/.lock wired to ll_file_flock for cluster-wide
 * (server-mediated) file locking. */
4772 struct file_operations ll_file_operations_flock = {
4773 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4774 # ifdef HAVE_SYNC_READ_WRITE
4775 .read = new_sync_read,
4776 .write = new_sync_write,
4777 # endif /* HAVE_SYNC_READ_WRITE */
4778 .read_iter = ll_file_read_iter,
4779 .write_iter = ll_file_write_iter,
4780 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4781 .read = ll_file_read,
4782 .aio_read = ll_file_aio_read,
4783 .write = ll_file_write,
4784 .aio_write = ll_file_aio_write,
4785 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4786 .unlocked_ioctl = ll_file_ioctl,
4787 .open = ll_file_open,
4788 .release = ll_file_release,
4789 .mmap = ll_file_mmap,
4790 .llseek = ll_file_seek,
4791 .splice_read = ll_file_splice_read,
4794 .flock = ll_file_flock,
4795 .lock = ll_file_flock
/* File methods for "-o noflock" mounts: .flock/.lock point at
 * ll_file_noflock so lock requests are rejected outright. */
4798 /* These are for -o noflock - to return ENOSYS on flock calls */
4799 struct file_operations ll_file_operations_noflock = {
4800 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4801 # ifdef HAVE_SYNC_READ_WRITE
4802 .read = new_sync_read,
4803 .write = new_sync_write,
4804 # endif /* HAVE_SYNC_READ_WRITE */
4805 .read_iter = ll_file_read_iter,
4806 .write_iter = ll_file_write_iter,
4807 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4808 .read = ll_file_read,
4809 .aio_read = ll_file_aio_read,
4810 .write = ll_file_write,
4811 .aio_write = ll_file_aio_write,
4812 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4813 .unlocked_ioctl = ll_file_ioctl,
4814 .open = ll_file_open,
4815 .release = ll_file_release,
4816 .mmap = ll_file_mmap,
4817 .llseek = ll_file_seek,
4818 .splice_read = ll_file_splice_read,
4821 .flock = ll_file_noflock,
4822 .lock = ll_file_noflock
/* Inode methods for regular files; xattr and ACL entries are gated on
 * the kernel's inode_operations ABI. */
4825 struct inode_operations ll_file_inode_operations = {
4826 .setattr = ll_setattr,
4827 .getattr = ll_getattr,
4828 .permission = ll_inode_permission,
4829 #ifdef HAVE_IOP_XATTR
4830 .setxattr = ll_setxattr,
4831 .getxattr = ll_getxattr,
4832 .removexattr = ll_removexattr,
4834 .listxattr = ll_listxattr,
4835 .fiemap = ll_fiemap,
4836 #ifdef HAVE_IOP_GET_ACL
4837 .get_acl = ll_get_acl,
4839 #ifdef HAVE_IOP_SET_ACL
4840 .set_acl = ll_set_acl,
/*
 * Push a layout configuration @conf down to the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock is only allowed
 * to match after the layout has been applied, and the cached layout
 * generation is refreshed from the object.
 */
4844 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4846 struct ll_inode_info *lli = ll_i2info(inode);
4847 struct cl_object *obj = lli->lli_clob;
4856 env = cl_env_get(&refcheck);
4858 RETURN(PTR_ERR(env));
4860 rc = cl_conf_set(env, lli->lli_clob, conf);
4864 if (conf->coc_opc == OBJECT_CONF_SET) {
4865 struct ldlm_lock *lock = conf->coc_lock;
4866 struct cl_layout cl = {
4870 LASSERT(lock != NULL);
4871 LASSERT(ldlm_has_layout(lock));
4873 /* it can only be allowed to match after layout is
4874 * applied to inode otherwise false layout would be
4875 * seen. Applying layout should happen before dropping
4876 * the intent lock. */
4877 ldlm_lock_allow_match(lock);
4879 rc = cl_object_layout_get(env, obj, &cl);
4884 DFID": layout version change: %u -> %u\n",
4885 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4887 ll_layout_version_set(lli, cl.cl_layout_gen);
4891 cl_env_put(env, &refcheck);
4896 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4897 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4900 struct ll_sb_info *sbi = ll_i2sbi(inode);
4901 struct ptlrpc_request *req;
4902 struct mdt_body *body;
4909 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4910 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4911 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated: nothing to fetch. */
4913 if (lock->l_lvb_data != NULL)
4916 /* if layout lock was granted right away, the layout is returned
4917 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4918 * blocked and then granted via completion ast, we have to fetch
4919 * layout here. Please note that we can't use the LVB buffer in
4920 * completion AST because it doesn't have a large enough buffer */
4921 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Pull the LOV EA (the layout) from the MDS by getxattr. */
4923 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4924 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4928 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4930 GOTO(out, rc = -EPROTO);
4932 lmmsize = body->mbo_eadatasize;
4933 if (lmmsize == 0) /* empty layout */
4936 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4938 GOTO(out, rc = -EFAULT);
4940 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4941 if (lvbdata == NULL)
4942 GOTO(out, rc = -ENOMEM);
/* Install the fetched layout as the lock's LVB, unless someone else
 * raced and installed one while we were fetching. */
4944 memcpy(lvbdata, lmm, lmmsize);
4945 lock_res_and_lock(lock);
4946 if (unlikely(lock->l_lvb_data == NULL)) {
4947 lock->l_lvb_type = LVB_T_LAYOUT;
4948 lock->l_lvb_data = lvbdata;
4949 lock->l_lvb_len = lmmsize;
4952 unlock_res_and_lock(lock);
4955 OBD_FREE_LARGE(lvbdata, lmmsize);
4960 ptlrpc_req_finished(req);
4965 * Apply the layout to the inode. Layout lock is held and will be released
 * \param lockh [IN] handle of the granted layout lock
 * \param mode  [IN] mode the lock is held in (decref'ed on the way out)
 * \param inode [IN] inode to (re)configure with the lock's layout LVB
4968 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4969 struct inode *inode)
4971 struct ll_inode_info *lli = ll_i2info(inode);
4972 struct ll_sb_info *sbi = ll_i2sbi(inode);
4973 struct ldlm_lock *lock;
4974 struct cl_object_conf conf;
4977 bool wait_layout = false;
4980 LASSERT(lustre_handle_is_used(lockh));
4982 lock = ldlm_handle2lock(lockh);
4983 LASSERT(lock != NULL);
4984 LASSERT(ldlm_has_layout(lock));
4986 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4987 PFID(&lli->lli_fid), inode);
4989 /* in case this is a caching lock and reinstate with new inode */
4990 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4992 lock_res_and_lock(lock);
4993 lvb_ready = ldlm_is_lvb_ready(lock);
4994 unlock_res_and_lock(lock);
4996 /* checking lvb_ready is racy but this is okay. The worst case is
4997 * that multi processes may configure the file on the same time. */
/* LVB not ready: fetch the layout from the MDS first. */
5001 rc = ll_layout_fetch(inode, lock);
5005 /* for layout lock, lmm is stored in lock's lvb.
5006 * lvb_data is immutable if the lock is held so it's safe to access it
5009 * set layout to file. Unlikely this will fail as old layout was
5010 * surely eliminated */
5011 memset(&conf, 0, sizeof conf);
5012 conf.coc_opc = OBJECT_CONF_SET;
5013 conf.coc_inode = inode;
5014 conf.coc_lock = lock;
5015 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5016 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5017 rc = ll_layout_conf(inode, &conf);
5019 /* refresh layout failed, need to wait */
5020 wait_layout = rc == -EBUSY;
/* Done with the lock: drop the reference and the mode ref. */
5023 LDLM_LOCK_PUT(lock);
5024 ldlm_lock_decref(lockh, mode);
5026 /* wait for IO to complete if it's still being used. */
5028 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5029 ll_get_fsname(inode->i_sb, NULL, 0),
5030 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO on the old layout
 * finishes so the new layout can be applied. */
5032 memset(&conf, 0, sizeof conf);
5033 conf.coc_opc = OBJECT_CONF_WAIT;
5034 conf.coc_inode = inode;
5035 rc = ll_layout_conf(inode, &conf);
5039 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5040 ll_get_fsname(inode->i_sb, NULL, 0),
5041 PFID(&lli->lli_fid), rc);
5047 * Issue layout intent RPC to MDS.
5048 * \param inode [in] file inode
5049 * \param intent [in] layout intent
5051 * \retval 0 on success
5052 * \retval < 0 error code
5054 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5056 struct ll_inode_info *lli = ll_i2info(inode);
5057 struct ll_sb_info *sbi = ll_i2sbi(inode);
5058 struct md_op_data *op_data;
5059 struct lookup_intent it;
5060 struct ptlrpc_request *req;
5064 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5065 0, 0, LUSTRE_OPC_ANY, NULL);
5066 if (IS_ERR(op_data))
5067 RETURN(PTR_ERR(op_data));
/* The layout intent travels inside the op_data of the intent lock. */
5069 op_data->op_data = intent;
5070 op_data->op_data_size = sizeof(*intent);
5072 memset(&it, 0, sizeof(it));
5073 it.it_op = IT_LAYOUT;
/* Write/truncate intents need the lock in write mode. */
5074 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5075 intent->li_opc == LAYOUT_INTENT_TRUNC)
5076 it.it_flags = FMODE_WRITE;
5078 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5079 ll_get_fsname(inode->i_sb, NULL, 0),
5080 PFID(&lli->lli_fid), inode);
5082 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5083 &ll_md_blocking_ast, 0);
5084 if (it.it_request != NULL)
5085 ptlrpc_req_finished(it.it_request);
5086 it.it_request = NULL;
5088 ll_finish_md_op_data(op_data);
5090 /* set lock data in case this is a new lock */
5092 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5094 ll_intent_drop_lock(&it);
5100 * This function checks if there exists a LAYOUT lock on the client side,
5101 * or enqueues it if it doesn't have one in cache.
5103 * This function will not hold layout lock so it may be revoked any time after
5104 * this function returns. Any operations depend on layout should be redone
5107 * This function should be called before lov_io_init() to get an uptodate
5108 * layout version, the caller should save the version number and after IO
5109 * is finished, this function should be called again to verify that layout
5110 * is not changed during IO time.
 * \param inode [in]  file inode
 * \param gen   [out] current layout generation for the file
5112 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5114 struct ll_inode_info *lli = ll_i2info(inode);
5115 struct ll_sb_info *sbi = ll_i2sbi(inode);
5116 struct lustre_handle lockh;
5117 struct layout_intent intent = {
5118 .li_opc = LAYOUT_INTENT_ACCESS,
5120 enum ldlm_mode mode;
/* Fast path: layout locking disabled, or a layout generation is
 * already cached — report it without any RPC. */
5124 *gen = ll_layout_version_get(lli);
5125 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5129 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5130 LASSERT(S_ISREG(inode->i_mode));
5132 /* take layout lock mutex to enqueue layout lock exclusively. */
5133 mutex_lock(&lli->lli_layout_mutex);
5136 /* mostly layout lock is caching on the local side, so try to
5137 * match it before grabbing layout lock mutex. */
5138 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5139 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5140 if (mode != 0) { /* hit cached lock */
5141 rc = ll_layout_lock_set(&lockh, mode, inode);
/* No cached lock: enqueue a new layout lock via intent RPC. */
5147 rc = ll_layout_intent(inode, &intent);
5153 *gen = ll_layout_version_get(lli);
5154 mutex_unlock(&lli->lli_layout_mutex);
5160 * Issue layout intent RPC indicating where in a file an IO is about to write.
5162 * \param[in] inode file inode.
5163 * \param[in] ext write range with start offset of file in bytes where
5164 * an IO is about to write, and exclusive end offset in
5167 * \retval 0 on success
5168 * \retval < 0 error code
5170 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5171 struct lu_extent *ext)
5173 struct layout_intent intent = {
5175 .li_extent.e_start = ext->e_start,
5176 .li_extent.e_end = ext->e_end,
/* Delegates to ll_layout_intent() with the caller-supplied opcode. */
5181 rc = ll_layout_intent(inode, &intent);
5187 * This function send a restore request to the MDT
5189 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5191 struct hsm_user_request *hur;
5195 len = sizeof(struct hsm_user_request) +
5196 sizeof(struct hsm_user_item);
5197 OBD_ALLOC(hur, len);
5201 hur->hur_request.hr_action = HUA_RESTORE;
5202 hur->hur_request.hr_archive_id = 0;
5203 hur->hur_request.hr_flags = 0;
5204 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5205 sizeof(hur->hur_user_item[0].hui_fid));
5206 hur->hur_user_item[0].hui_extent.offset = offset;
5207 hur->hur_user_item[0].hui_extent.length = length;
5208 hur->hur_request.hr_itemcount = 1;
5209 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,