/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
struct split_param {
        struct inode    *sp_inode;
        __u16            sp_mirror_id;
};

static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken);
static struct ll_file_data *ll_file_data_get(void)
        struct ll_file_data *fd;

        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);

        fd->fd_write_failed = false;

static void ll_file_data_put(struct ll_file_data *fd)
        OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);

/**
 * Packs all the attributes into @op_data for the CLOSE RPC.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
                             struct obd_client_handle *och)
        ll_prep_md_op_data(op_data, inode, NULL, NULL,
                           0, 0, LUSTRE_OPC_ANY, NULL);

        op_data->op_attr.ia_mode = inode->i_mode;
        op_data->op_attr.ia_atime = inode->i_atime;
        op_data->op_attr.ia_mtime = inode->i_mtime;
        op_data->op_attr.ia_ctime = inode->i_ctime;
        op_data->op_attr.ia_size = i_size_read(inode);
        op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
                                     ATTR_MTIME | ATTR_MTIME_SET |
                                     ATTR_CTIME | ATTR_CTIME_SET;
        op_data->op_attr_blocks = inode->i_blocks;
        op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
        op_data->op_handle = och->och_fh;

        if (och->och_flags & FMODE_WRITE &&
            ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
                /* For HSM: if inode data has been modified, pack it so that
                 * the MDT can set the data-dirty flag in the archive. */
                op_data->op_bias |= MDS_DATA_MODIFIED;
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
                                     enum mds_op_bias bias, void *data)
        struct obd_export *md_exp = ll_i2mdexp(inode);
        const struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;

        if (class_exp2obd(md_exp) == NULL) {
                CERROR("%s: invalid MDC connection handle closing "DFID"\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid));

        OBD_ALLOC_PTR(op_data);
        /* We leak the openhandle and request here on error, but there is not
         * much to be done in the OOM case since the application won't retry
         * the close on error either. */
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        case MDS_CLOSE_LAYOUT_MERGE:
                /* merge blocks from the victim inode */
                op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
        case MDS_CLOSE_LAYOUT_SPLIT:
        case MDS_CLOSE_LAYOUT_SWAP: {
                struct split_param *sp = data;

                LASSERT(data != NULL);
                op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
                        op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
                        op_data->op_mirror_id = sp->sp_mirror_id;
                        op_data->op_fid2 = *ll_inode2fid(data);

        case MDS_CLOSE_RESYNC_DONE: {
                struct ll_ioc_lease *ioc = data;

                LASSERT(data != NULL);
                op_data->op_attr_blocks +=
                        ioc->lil_count * op_data->op_attr_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
                op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_data = &ioc->lil_ids[0];
                op_data->op_data_size =
                        ioc->lil_count * sizeof(ioc->lil_ids[0]);

        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;

                LASSERT(data == NULL);

        if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
                op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
        if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
                op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;

        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc != 0 && rc != -EINTR)
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

        if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;

                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

        ll_finish_md_op_data(op_data);

        md_clear_open_replay_data(md_exp, och);
        och->och_fh.cookie = DEAD_HANDLE_MAGIC;

        ptlrpc_req_finished(req); /* This is close request */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct obd_client_handle **och_p;
        struct obd_client_handle *och;

        if (fmode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (fmode & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
                LASSERT(fmode & FMODE_READ);
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;

        mutex_lock(&lli->lli_och_mutex);
        if (*och_usecount > 0) {
                /* There are still users of this handle, so skip
                 * freeing it. */
                mutex_unlock(&lli->lli_och_mutex);

        mutex_unlock(&lli->lli_och_mutex);

        /* There might be a race and this handle may already
         * be closed. */
        rc = ll_close_inode_openhandle(inode, och, 0, NULL);
static int ll_md_close(struct inode *inode, struct file *file)
        union ldlm_policy_data policy = {
                .l_inodebits = { MDS_INODELOCK_OPEN },
        };
        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lustre_handle lockh;
        enum ldlm_mode lockmode;

        /* clear group lock, if present */
        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
                ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

        if (fd->fd_lease_och != NULL) {
                /* Usually the lease is not released when the
                 * application crashes; we need to release it here. */
                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
                       PFID(&lli->lli_fid), rc, lease_broken);

                fd->fd_lease_och = NULL;

        if (fd->fd_och != NULL) {
                rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);

        /* Let's see if we have a good enough OPEN lock on the file and
         * whether we can skip talking to the MDS. */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_omode & FMODE_WRITE) {
                LASSERT(lli->lli_open_fd_write_count);
                lli->lli_open_fd_write_count--;
        } else if (fd->fd_omode & FMODE_EXEC) {
                LASSERT(lli->lli_open_fd_exec_count);
                lli->lli_open_fd_exec_count--;
                LASSERT(lli->lli_open_fd_read_count);
                lli->lli_open_fd_read_count--;

        mutex_unlock(&lli->lli_och_mutex);
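        /* If a matching OPEN ibits lock is still cached (note that the flags
         * above include LDLM_FL_TEST_LOCK, so md_lock_match() only tests for
         * the lock), the close RPC can be deferred: the MDS open handle is
         * kept and released later when the lock is cancelled. Only when no
         * such lock is found do we close the handle now. */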
        if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
                           LDLM_IBITS, &policy, lockmode, &lockh))
                rc = ll_md_real_close(inode, fd->fd_omode);

        LUSTRE_FPRIVATE(file) = NULL;
        ll_file_data_put(fd);
/* While this returns an error code, the caller fput() does not check it,
 * so we need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors and even if an error is returned
 * they will not re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
        struct ll_file_data *fd;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
               PFID(ll_inode2fid(inode)), inode);

        if (inode->i_sb->s_root != file_dentry(file))
                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
        fd = LUSTRE_FPRIVATE(file);

        /* The last ref on @file, maybe not the owner pid of statahead,
         * because parent and child processes can share the same file
         * handle. */
        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = NULL;
                ll_file_data_put(fd);

        if (!S_ISDIR(inode->i_mode)) {
                if (lli->lli_clob != NULL)
                        lov_read_and_clear_async_rc(lli->lli_clob);
                lli->lli_async_rc = 0;

        rc = ll_md_close(inode, file);

        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
                libcfs_debug_dumplog();
static inline int ll_dom_readpage(void *data, struct page *page)
        struct niobuf_local *lnb = data;

        kaddr = ll_kmap_atomic(page, KM_USER0);
        memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
        if (lnb->lnb_len < PAGE_SIZE)
                memset(kaddr + lnb->lnb_len, 0,
                       PAGE_SIZE - lnb->lnb_len);
        flush_dcache_page(page);
        SetPageUptodate(page);
        ll_kunmap_atomic(kaddr, KM_USER0);
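/* Note: ll_dom_readpage() is the filler callback handed to
 * read_cache_page() in ll_dom_finish_open() below. It copies data that
 * arrived inline in the open reply into the page and zero-fills the tail
 * beyond lnb_len, so the page can enter the page cache marked uptodate. */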
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
                        struct lookup_intent *it)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct address_space *mapping = inode->i_mapping;
        struct niobuf_remote *rnb;
        struct lustre_handle lockh;
        struct ldlm_lock *lock;
        unsigned long index, start;
        struct niobuf_local lnb;
        bool dom_lock = false;

        if (it->it_lock_mode != 0) {
                lockh.cookie = it->it_lock_handle;
                lock = ldlm_handle2lock(&lockh);
                dom_lock = ldlm_has_dom(lock);

        env = cl_env_get(&refcheck);

        if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
                                   RCL_SERVER))
                GOTO(out_env, rc = -ENODATA);

        rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
        data = (char *)rnb + sizeof(*rnb);

        if (rnb == NULL || rnb->rnb_len == 0)
                GOTO(out_env, rc = 0);

        CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
               rnb->rnb_len, i_size_read(inode));

        io = vvp_env_thread_io(env);
        io->ci_ignore_layout = 1;
        rc = cl_io_init(env, io, CIT_MISC, obj);

        lnb.lnb_file_offset = rnb->rnb_offset;
        start = lnb.lnb_file_offset / PAGE_SIZE;
        LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
        lnb.lnb_page_offset = 0;
                lnb.lnb_data = data + (index << PAGE_SHIFT);
                lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
                if (lnb.lnb_len > PAGE_SIZE)
                        lnb.lnb_len = PAGE_SIZE;

                vmpage = read_cache_page(mapping, index + start,
                                         ll_dom_readpage, &lnb);
                if (IS_ERR(vmpage)) {
                        CWARN("%s: cannot fill page %lu for "DFID
                              " with data: rc = %li\n",
                              ll_get_fsname(inode->i_sb, NULL, 0),
                              index + start, PFID(lu_object_fid(&obj->co_lu)),

                clp = cl_page_find(env, obj, vmpage->index, vmpage,
                        GOTO(out_io, rc = PTR_ERR(clp));

                cl_page_export(env, clp, 1);
                cl_page_put(env, clp);
        } while (rnb->rnb_len > (index << PAGE_SHIFT));
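        /* The do/while above walks the inline reply buffer one page at a
         * time: each pass copies up to PAGE_SIZE bytes into the page cache
         * at page index (start + index) until all rnb_len bytes are
         * consumed, so reads covered by the DOM lock can then be served
         * locally without further RPCs. */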
505 cl_env_put(env, &refcheck);
508 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
509 struct lookup_intent *itp)
511 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
512 struct dentry *parent = de->d_parent;
513 const char *name = NULL;
515 struct md_op_data *op_data;
516 struct ptlrpc_request *req = NULL;
520 LASSERT(parent != NULL);
521 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
523 /* if server supports open-by-fid, or file name is invalid, don't pack
524 * name in open request */
525 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
526 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
527 name = de->d_name.name;
528 len = de->d_name.len;
531 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
532 name, len, 0, LUSTRE_OPC_ANY, NULL);
534 RETURN(PTR_ERR(op_data));
535 op_data->op_data = lmm;
536 op_data->op_data_size = lmmsize;
538 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
539 &ll_md_blocking_ast, 0);
540 ll_finish_md_op_data(op_data);
542 /* reason for keep own exit path - don`t flood log
543 * with messages with -ESTALE errors.
545 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
546 it_open_error(DISP_OPEN_OPEN, itp))
548 ll_release_openhandle(de, itp);
552 if (it_disposition(itp, DISP_LOOKUP_NEG))
553 GOTO(out, rc = -ENOENT);
555 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
556 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
557 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
561 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
563 if (!rc && itp->it_lock_mode) {
564 ll_dom_finish_open(de->d_inode, req, itp);
565 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
569 ptlrpc_req_finished(req);
570 ll_intent_drop_lock(itp);
572 /* We did open by fid, but by the time we got to the server,
573 * the object disappeared. If this is a create, we cannot really
574 * tell the userspace that the file it was trying to create
575 * does not exist. Instead let's return -ESTALE, and the VFS will
576 * retry the create with LOOKUP_REVAL that we are going to catch
577 * in ll_revalidate_dentry() and use lookup then.
579 if (rc == -ENOENT && itp->it_op & IT_CREAT)
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
                       struct obd_client_handle *och)
        struct mdt_body *body;

        body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
        och->och_fh = body->mbo_handle;
        och->och_fid = body->mbo_fid1;
        och->och_lease_handle.cookie = it->it_lock_handle;
        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
        och->och_flags = it->it_flags;

        return md_set_open_replay_data(md_exp, och, it);

static int ll_local_open(struct file *file, struct lookup_intent *it,
                         struct ll_file_data *fd, struct obd_client_handle *och)
        struct inode *inode = file_inode(file);

        LASSERT(!LUSTRE_FPRIVATE(file));

        rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

        LUSTRE_FPRIVATE(file) = fd;
        ll_readahead_init(inode, &fd->fd_ras);
        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

        /* ll_cl_context initialization */
        rwlock_init(&fd->fd_lock);
        INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
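/* For reference, the delayed-create flow looks roughly like this from
 * userspace (a sketch only; llapi_file_open() in liblustreapi wraps the
 * same sequence):
 *
 *      fd = open(path, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644);
 *      // fill in a struct lov_user_md with the desired striping...
 *      ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
 *
 * Until the LL_IOC_LOV_SETSTRIPE ioctl arrives, no OST objects are
 * allocated for the file.
 */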
int ll_file_open(struct inode *inode, struct file *file)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct obd_client_handle **och_p = NULL;
        __u64 *och_usecount = NULL;
        struct ll_file_data *fd;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
               PFID(ll_inode2fid(inode)), inode, file->f_flags);

        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */

        fd = ll_file_data_get();
                GOTO(out_nofiledata, rc = -ENOMEM);

        if (S_ISDIR(inode->i_mode))
                ll_authorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = fd;

        if (!it || !it->it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but the O_ACCMODE mask was stripped from
                 * it. */
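                /* The conversion relies on the open(2) encoding: O_RDONLY is
                 * 0, O_WRONLY is 1 and O_RDWR is 2, while FMODE_READ is 1 and
                 * FMODE_WRITE is 2, so "(flags + 1) & O_ACCMODE" yields the
                 * matching FMODE_READ/FMODE_WRITE bits (the same trick as the
                 * kernel's OPEN_FMODE() macro). */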
                if ((oit.it_flags + 1) & O_ACCMODE)
                        oit.it_flags++;
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* The kernel only calls f_op->open() from dentry_open().
                 * filp_open() calls dentry_open() after open_namei() has
                 * checked permissions. Only nfsd_open() calls dentry_open()
                 * directly without checking permissions, and because of that
                 * the code below is safe. */
                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                /* bug20584: if "it_flags" contains O_CREAT, the file will be
                 * created if necessary, so "IT_CREAT" should be set to stay
                 * consistent with it. */
                if (oit.it_flags & O_CREAT)
                        oit.it_op |= IT_CREAT;

        /* Let's see if we have the file open on the MDS already. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;

        mutex_lock(&lli->lli_och_mutex);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's an extra open request that we do not
                         * need; let's close it somehow. This will decref the
                         * request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                                mutex_unlock(&lli->lli_och_mutex);
                                GOTO(out_openerr, rc);

                        ll_release_openhandle(file_dentry(file), it);

                rc = ll_local_open(file, it, fd, NULL);
                        mutex_unlock(&lli->lli_och_mutex);
                        GOTO(out_openerr, rc);

                LASSERT(*och_usecount == 0);
                if (!it->it_disposition) {
                        struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);

                        /* We cannot just request the lock handle now, because
                         * the new ELC code means one of the other OPEN locks
                         * for this file could be cancelled, and since the
                         * blocking AST handler would attempt to grab och_mutex
                         * as well, that would result in a deadlock. */
                        mutex_unlock(&lli->lli_och_mutex);

                        /**
                         * Normally called under two situations:
                         * 1. NFS export.
                         * 2. A race/condition on MDS resulting in no open
                         *    handle to be returned from LOOKUP|OPEN request,
                         *    for example if the target entry was a symlink.
                         *
                         * Only fetch MDS_OPEN_LOCK if this is in NFS path,
                         * marked by a bit set in ll_iget_for_nfs. Clear the
                         * bit so that it's not confusing later callers.
                         *
                         * NB: when ldd is NULL, it must have come via the
                         * normal lookup path only, since ll_iget_for_nfs
                         * always calls ll_d_init().
                         */
                        if (ldd && ldd->lld_nfs_dentry) {
                                ldd->lld_nfs_dentry = 0;
                                it->it_flags |= MDS_OPEN_LOCK;

                        /*
                         * Always specify MDS_OPEN_BY_FID because we don't want
                         * to get a file with a different fid.
                         */
                        it->it_flags |= MDS_OPEN_BY_FID;
                        rc = ll_intent_file_open(file_dentry(file), NULL, 0,
                                GOTO(out_openerr, rc);

                OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
                        GOTO(out_och_free, rc = -ENOMEM);

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here. */
                /* XXX (green): Shouldn't we bail out on any error here, not
                 * just an open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                        GOTO(out_och_free, rc);

                LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
                         "inode %p: disposition %x, status %d\n", inode,
                         it_disposition(it, ~0), it->it_status);

                rc = ll_local_open(file, it, fd, *och_p);
                        GOTO(out_och_free, rc);

        mutex_unlock(&lli->lli_och_mutex);

        /* Must do this outside the lli_och_mutex lock to prevent a deadlock
         * where a different kind of OPEN lock for this same inode gets
         * cancelled by ldlm_cancel_lru. */
        if (!S_ISREG(inode->i_mode))
                GOTO(out_och_free, rc);

        cl_lov_delay_create_clear(&file->f_flags);
        GOTO(out_och_free, rc);

        if (och_p && *och_p) {
                OBD_FREE(*och_p, sizeof(struct obd_client_handle));
                *och_p = NULL; /* OBD_FREE writes some magic there */

        mutex_unlock(&lli->lli_och_mutex);

        if (lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);

        ll_file_data_put(fd);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);

        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
                ptlrpc_req_finished(it->it_request);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
                                    struct ldlm_lock_desc *desc,
                                    void *data, int flag)
        struct lustre_handle lockh;

        case LDLM_CB_BLOCKING:
                ldlm_lock2handle(lock, &lockh);
                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);

        case LDLM_CB_CANCELING:
/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
                                struct lustre_handle *old_handle)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;

        /* Get the openhandle of the file */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_lease_och != NULL)
                GOTO(out_unlock, rc = -EBUSY);

        if (fd->fd_och == NULL) {
                if (file->f_mode & FMODE_WRITE) {
                        LASSERT(lli->lli_mds_write_och != NULL);
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                        LASSERT(lli->lli_mds_read_och != NULL);
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;

                if (*och_usecount > 1)
                        GOTO(out_unlock, rc = -EBUSY);

        *old_handle = fd->fd_och->och_fh;

        mutex_unlock(&lli->lli_och_mutex);
/**
 * Release ownership of lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;
        struct obd_client_handle *old_och = NULL;

        mutex_lock(&lli->lli_och_mutex);
        if (file->f_mode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;

        /* The file may have been opened by another process (broken lease), in
         * which case *och_p is not NULL. In that case we should simply
         * increase the usecount and close fd_och.
         */
        if (*och_p != NULL) {
                old_och = fd->fd_och;

        mutex_unlock(&lli->lli_och_mutex);

        rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/**
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
              __u64 open_flags)
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_handle = { 0 };
        struct obd_client_handle *och = NULL;

        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                RETURN(ERR_PTR(-EPERM));

        rc = ll_lease_och_acquire(inode, file, &old_handle);

                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this openhandle is from the same owner */
        op_data->op_handle = old_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: To not put the lease lock into the LRU list,
         * otherwise it can be cancelled, which may mislead applications
         * into thinking the lease is broken.
         * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
         * open in ll_md_blocking_ast(). Otherwise, as
         * ll_md_blocking_lease_ast doesn't deal with the openhandle, a
         * normal openhandle would be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        ll_och_fill(sbi->ll_md_exp, &it, och);

        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* lease already acquired; now handle the lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* An open lock must be returned for a lease. */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                       PFID(ll_inode2fid(inode)), it.it_lock_mode,

                GOTO(out_close, rc = -EPROTO);

        ll_intent_release(&it);

        /* Cancel the open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;

        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */

        ll_intent_release(&it);

        RETURN(ERR_PTR(rc));
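/* Typical entry point for the above (a sketch; the ioctl dispatch in
 * ll_file_ioctl() is authoritative): userspace requests a lease on an
 * already-open fd via the LL_IOC_SET_LEASE ioctl, which reaches
 * ll_lease_open() with FMODE_READ or FMODE_WRITE, and later checks or
 * releases the lease via LL_IOC_GET_LEASE / a further LL_IOC_SET_LEASE
 * call. */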
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1  First inode to check
 * \param[in] inode2  Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
                                          struct inode *inode2)
        if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

        if (inode_permission(inode1, MAY_WRITE) ||
            inode_permission(inode2, MAY_WRITE))

        if (inode1->i_sb != inode2->i_sb)

static int ll_swap_layouts_close(struct obd_client_handle *och,
                                 struct inode *inode, struct inode *inode2)
        const struct lu_fid *fid1 = ll_inode2fid(inode);
        const struct lu_fid *fid2;

        CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
               ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

        rc = ll_check_swap_layouts_validity(inode, inode2);
                GOTO(out_free_och, rc);

        /* We now know that inode2 is a Lustre inode. */
        fid2 = ll_inode2fid(inode2);

        rc = lu_fid_cmp(fid1, fid2);
                GOTO(out_free_och, rc = -EINVAL);

        /* Close the file and {swap,merge} layouts between inode and inode2.
         * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
         * because we still need it to pack l_remote_handle to the MDT. */
        rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,

        och = NULL; /* freed in ll_close_inode_openhandle() */
/**
 * Release the lease and close the file.
 * It will check whether the lease was ever broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
                                 struct inode *inode,
                                 bool *lease_broken, enum mds_op_bias bias,
                                 void *data)
        struct ldlm_lock *lock;
        bool cancelled = true;

        lock = ldlm_handle2lock(&och->och_lease_handle);
                lock_res_and_lock(lock);
                cancelled = ldlm_is_cancel(lock);
                unlock_res_and_lock(lock);
                LDLM_LOCK_PUT(lock);

        CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
               PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

        if (lease_broken != NULL)
                *lease_broken = cancelled;

        if (!cancelled && !bias)
                ldlm_cli_cancel(&och->och_lease_handle, 0);

        if (cancelled) { /* no need to execute the intent */

        rc = ll_close_inode_openhandle(inode, och, bias, data);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken)
        return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
/**
 * After a lease is taken, send the RPC MDS_REINT_RESYNC to the MDT.
 */
static int ll_lease_file_resync(struct obd_client_handle *och,
                                struct inode *inode)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        __u64 data_version_unused;

        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        /* Before starting file resync, it's necessary to clean up the page
         * cache in client memory, otherwise once the layout version is
         * increased, writing back cached data will be denied by the OSTs. */
        rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);

        op_data->op_handle = och->och_lease_handle;
        rc = md_file_resync(sbi->ll_md_exp, op_data);

        ll_finish_md_op_data(op_data);
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct cl_attr *attr = vvp_env_thread_attr(env);

        ll_inode_size_lock(inode);

        /* Merge the timestamps most recently obtained from the MDS with
         * the timestamps obtained from the OSTs.
         *
         * Do not overwrite atime of the inode, because it may be refreshed
         * by file_accessed(). If a read was served from cached data, no RPC
         * is sent, so atime may never be transferred to the OSTs at all.
         * The MDT only updates atime at close time if it's at least
         * 'mdd.*.atime_diff' older.
         * All in all, atime in Lustre does not strictly comply with POSIX:
         * solving this would require sending an RPC to the MDT for each
         * read, which would hurt performance. */
        if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
                LTIME_S(inode->i_atime) = lli->lli_atime;
                lli->lli_update_atime = 0;

        LTIME_S(inode->i_mtime) = lli->lli_mtime;
        LTIME_S(inode->i_ctime) = lli->lli_ctime;

        atime = LTIME_S(inode->i_atime);
        mtime = LTIME_S(inode->i_mtime);
        ctime = LTIME_S(inode->i_ctime);

        cl_object_attr_lock(obj);
        if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))

        rc = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);

                GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

        if (atime < attr->cat_atime)
                atime = attr->cat_atime;

        if (ctime < attr->cat_ctime)
                ctime = attr->cat_ctime;

        if (mtime < attr->cat_mtime)
                mtime = attr->cat_mtime;

        CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
               PFID(&lli->lli_fid), attr->cat_size);

        i_size_write(inode, attr->cat_size);
        inode->i_blocks = attr->cat_blocks;

        LTIME_S(inode->i_atime) = atime;
        LTIME_S(inode->i_mtime) = mtime;
        LTIME_S(inode->i_ctime) = ctime;

        ll_inode_size_unlock(inode);
/**
 * Set the designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a designated
 * mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        /* Clear the layout version for generic (non-resync) I/O in case it
         * carries a stale layout version due to an I/O restart. */
        io->ci_layout_version = 0;

        /* FLR: disable non-delay for designated mirror I/O because obviously
         * only one mirror is available. */
        if (fd->fd_designated_mirror > 0) {
                io->ci_designated_mirror = fd->fd_designated_mirror;
                io->ci_layout_version = fd->fd_layout_version;
                io->ci_pio = 0; /* doesn't have a mechanism to pass the mirror
                                 * designation to a parallel task */

        CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
               file->f_path.dentry->d_name.name, io->ci_designated_mirror);
static bool file_is_noatime(const struct file *file)
        const struct vfsmount *mnt = file->f_path.mnt;
        const struct inode *inode = file_inode((struct file *)file);

        /* Adapted from file_accessed() and touch_atime(). */
        if (file->f_flags & O_NOATIME)

        if (inode->i_flags & S_NOATIME)

        if (IS_NOATIME(inode))

        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))

static int ll_file_io_ptask(struct cfs_ptask *ptask);
static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
        struct inode *inode = file_inode(file);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
        init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
        io->u.ci_rw.rw_file = file;
        io->u.ci_rw.rw_ptask = ll_file_io_ptask;
        io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
        io->ci_lock_no_expand = fd->ll_lock_no_expand;

        if (iot == CIT_WRITE) {
                io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
                io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
                                         file->f_flags & O_DIRECT ||

        io->ci_obj = ll_i2info(inode)->lli_clob;
        io->ci_lockreq = CILR_MAYBE;
        if (ll_file_nolock(file)) {
                io->ci_lockreq = CILR_NEVER;
                io->ci_no_srvlock = 1;
        } else if (file->f_flags & O_APPEND) {
                io->ci_lockreq = CILR_MANDATORY;

        io->ci_noatime = file_is_noatime(file);
        if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
                io->ci_pio = !io->u.ci_rw.rw_append;

        /* FLR: only use non-delay I/O for reads, as there is only one
         * available mirror for writes. */
        io->ci_ndelay = !(iot == CIT_WRITE);

        ll_io_set_mirror(io, file);
static int ll_file_io_ptask(struct cfs_ptask *ptask)
        struct cl_io_pt *pt = ptask->pt_cbdata;
        struct file *file = pt->cip_file;
        loff_t pos = pt->cip_pos;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
               file_dentry(file)->d_name.name,
               pt->cip_iot == CIT_READ ? "read" : "write",
               pos, pos + pt->cip_count);

        env = cl_env_get(&refcheck);
                RETURN(PTR_ERR(env));

        io = vvp_env_thread_io(env);
        ll_io_init(io, file, pt->cip_iot);
        io->u.ci_rw.rw_iter = pt->cip_iter;
        io->u.ci_rw.rw_iocb = pt->cip_iocb;
        io->ci_pio = 0; /* it's already in a parallel task */

        rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
                           pt->cip_count - pt->cip_result);
                struct vvp_io *vio = vvp_env_io(env);

                vio->vui_io_subtype = IO_NORMAL;
                vio->vui_fd = LUSTRE_FPRIVATE(file);

                ll_cl_add(file, env, io, LCC_RW);
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);

                /* cl_io_rw_init() handled the IO */

        if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {

        if (io->ci_nob > 0) {
                pt->cip_result += io->ci_nob;
                iov_iter_advance(&pt->cip_iter, io->ci_nob);

        pt->cip_iocb.ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
        pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
#elif defined(HAVE_KI_NBYTES)
        pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
#endif

        cl_io_fini(env, io);
        cl_env_put(env, &refcheck);

        pt->cip_need_restart = io->ci_need_restart;

        CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
               file_dentry(file)->d_name.name,
               pt->cip_iot == CIT_READ ? "read" : "write",
               pt->cip_result, rc);

        RETURN(pt->cip_result > 0 ? 0 : rc);
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)
        struct range_lock range;
        struct vvp_io *vio = vvp_env_io(env);
        struct inode *inode = file_inode(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        unsigned retried = 0;
        bool restarted = false;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", pos, pos + count);

        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot);
        if (args->via_io_subtype == IO_NORMAL) {
                io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
                io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
        if (args->via_io_subtype != IO_NORMAL || restarted)
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
                bool range_locked = false;

                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
                        range_lock_init(&range, pos, pos + count - 1);
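                /* Rationale for the two ranges above: an O_APPEND write does
                 * not know its final file offset until the size is known, so
                 * it takes the range lock over the whole file [0, LUSTRE_EOF];
                 * other I/O only locks [pos, pos + count - 1], so writers to
                 * non-overlapping ranges can proceed in parallel. */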
                vio->vui_fd = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                        /* Direct I/O reads must also take the range lock, or
                         * multiple reads will try to work on the same pages.
                         * See LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                       RL_PARA(&range));
                                rc = range_lock(&lli->lli_write_tree, &range);

                                range_locked = true;

                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;

                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

                ll_cl_add(file, env, io, LCC_RW);
                if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
                    !lli->lli_inode_locked) {
                        inode_lock(inode);
                        lli->lli_inode_locked = 1;

                rc = cl_io_loop(env, io);
                if (lli->lli_inode_locked) {
                        lli->lli_inode_locked = 0;
                        inode_unlock(inode);

                ll_cl_remove(file, env);

                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                        range_unlock(&lli->lli_write_tree, &range);

                /* cl_io_rw_init() handled the IO */

        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count -= io->ci_nob;

                if (args->via_io_subtype == IO_NORMAL) {
                        iov_iter_advance(args->u.normal.via_iter, io->ci_nob);

                        /* CLIO is too complicated. See LU-11069. */
                        if (cl_io_is_append(io))
                                pos = io->u.ci_rw.rw_iocb.ki_pos;

                        args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                        args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
                        args->u.normal.via_iocb->ki_nbytes = count;
#endif
                        pos = io->u.ci_rw.rw_range.cir_pos;

        cl_io_fini(env, io);

        CDEBUG(D_VFSTRACE,
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                       "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
                       file_dentry(file)->d_name.name,
                       iot == CIT_READ ? "read" : "write",
                       pos, pos + count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;

        if (iot == CIT_READ) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                        rc = io->ci_result;
                        if (rc < 0)
                                fd->fd_write_failed = true;
                        else
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        fd->fd_write_failed = true;

        CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

        RETURN(result > 0 ? result : rc);
/**
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of read
 * ahead, most of the pages being read are already in the memory cache and we
 * can read those pages directly: if the pages exist, the corresponding DLM
 * lock must exist as well, so the page content must be valid.
 *
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the
 *     data and CLIO won't intervene;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and read ahead parameters will be updated;
 *   - Otherwise the page is not in memory and we can't do fast read.
 *     Therefore, we go back and invoke normal read, i.e., a cl_io will be
 *     created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param env - lu_env
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes read, or error code if an error occurred.
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
        if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))

        /* NB: we can't do direct IO for fast read because it will need a lock
         * to make the IO engine happy. */
        if (iocb->ki_filp->f_flags & O_DIRECT)

        result = generic_file_read_iter(iocb, iter);

        /* If the first page is not in cache, generic_file_read_iter() will
         * return -ENODATA. See the corresponding code in ll_readpage(). */
        if (result == -ENODATA)

        ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
                           LPROC_LL_READ_BYTES, result);
/**
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
        struct vvp_io_args *args;

        result = ll_do_fast_read(iocb, to);
        if (result < 0 || iov_iter_count(to) == 0)

        env = cl_env_get(&refcheck);
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = to;
        args->u.normal.via_iocb = iocb;

        rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
                                 &iocb->ki_pos, iov_iter_count(to));
        else if (result == 0)

        cl_env_put(env, &refcheck);
/**
 * Similar trick to ll_do_fast_read, this improves write speed for tiny
 * writes. If a page is already in the page cache and dirty (and some other
 * things - see ll_tiny_write_begin for the instantiation of these rules),
 * then we can write to it without doing a full I/O, because Lustre already
 * knows about it and will write it out. This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
 * common and are unlikely to land in already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
 */
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
        ssize_t count = iov_iter_count(iter);
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);

        /* Restrict writes to a single page and < PAGE_SIZE. See the comment
         * at the top of this function for why. */
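        /* The second clause of the check below computes the offset within
         * the page (ki_pos & (PAGE_SIZE - 1)); if offset + count exceeds
         * PAGE_SIZE, the write would cross a page boundary and touch a
         * second page, so it is not a tiny write. */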
        if (count >= PAGE_SIZE ||
            (iocb->ki_pos & (PAGE_SIZE - 1)) + count > PAGE_SIZE)

        result = __generic_file_write_iter(iocb, iter);

        /* If the page is not already dirty, ll_tiny_write_begin returns
         * -ENODATA. We continue on to normal write.
         */
        if (result == -ENODATA)

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
        ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);

        CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

/**
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct vvp_io_args *args;
        ssize_t rc_tiny = 0, rc_normal;

        /* NB: we can't do direct IO for tiny writes because they use the page
         * cache, we can't do sync writes because tiny writes can't flush
         * pages, and we can't do append writes because we can't guarantee the
         * required DLM locks are held to protect file size.
         */
        if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
            !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
                rc_tiny = ll_do_tiny_write(iocb, from);

        /* In case of error, go on and try the normal write - only stop if
         * the tiny write completed the I/O.
         */
        if (iov_iter_count(from) == 0)
                GOTO(out, rc_normal = rc_tiny);

        env = cl_env_get(&refcheck);
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = from;
        args->u.normal.via_iocb = iocb;

        rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
                                       &iocb->ki_pos, iov_iter_count(from));

        /* On success, combine bytes written. */
        if (rc_tiny >= 0 && rc_normal > 0)
                rc_normal += rc_tiny;
        /* On error, only return the error from the normal write if the tiny
         * write did not write any bytes. Otherwise return the bytes written
         * by the tiny write.
         */
        else if (rc_tiny > 0)
                rc_normal = rc_tiny;

        cl_env_put(env, &refcheck);
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
                                 unsigned long *nr_segs, size_t *count)
        for (seg = 0; seg < *nr_segs; seg++) {
                const struct iovec *iv = &iov[seg];

                /*
                 * If any segment has a negative length, or the cumulative
                 * length ever wraps negative then return -EINVAL.
                 */
                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
                        continue;
                cnt -= iv->iov_len;     /* This segment is no good */

static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos)
        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
        iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
        iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

        result = ll_file_read_iter(iocb, &to);

static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
        struct iovec iov = { .iov_base = buf, .iov_len = count };

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
        kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
        kiocb.ki_nbytes = count;
#endif

        result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
        *ppos = kiocb.ki_pos;
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t pos)
        struct iov_iter from;

        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
        iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
        iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

        result = ll_file_write_iter(iocb, &from);

static ssize_t ll_file_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
        struct iovec iov = { .iov_base = (void __user *)buf,
                             .iov_len = count };

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
        kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
        kiocb.ki_nbytes = count;
#endif

        result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
        *ppos = kiocb.ki_pos;

#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/*
 * Send file content (through the page cache) somewhere with a helper.
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
                                   struct pipe_inode_info *pipe, size_t count,
                                   unsigned int flags)
        struct vvp_io_args *args;

        env = cl_env_get(&refcheck);
                RETURN(PTR_ERR(env));

        args = ll_env_args(env, IO_SPLICE);
        args->u.splice.via_pipe = pipe;
        args->u.splice.via_flags = flags;

        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
        cl_env_put(env, &refcheck);

int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
                             __u64 flags, struct lov_user_md *lum, int lum_size)
        struct lookup_intent oit = {
                .it_op = IT_OPEN,
                .it_flags = flags | MDS_OPEN_BY_FID,
        };

        ll_inode_size_lock(inode);
        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
                GOTO(out_unlock, rc);

        ll_release_openhandle(dentry, &oit);

        ll_inode_size_unlock(inode);
        ll_intent_release(&oit);
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data;

        rc = ll_get_default_mdsize(sbi, &lmmsize);

        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
                                     strlen(filename), lmmsize,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        lmmsize = body->mbo_eadatasize;

        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                GOTO(out, rc = -ENODATA);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
                GOTO(out, rc = -EPROTO);

        /*
         * This is coming from the MDS, so it is probably in
         * little endian. We convert it to host endian before
         * passing it to userspace.
         */
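        /* The check below is a cheap endianness test evaluated at compile
         * time: LOV_MAGIC != cpu_to_le32(LOV_MAGIC) can only be true on a
         * big-endian host, so on little-endian machines the whole swab
         * block is skipped. */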
        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {

                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
                    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)

                /* if the function is called for a directory, we should
                 * avoid swabbing non-existent lsm objects */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
                        lustre_swab_lov_user_md_v1(
                                (struct lov_user_md_v1 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                        ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        lustre_swab_lov_user_md_v3(
                                (struct lov_user_md_v3 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                        ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                } else if (lmm->lmm_magic ==
                           cpu_to_le32(LOV_MAGIC_COMP_V1)) {
                        lustre_swab_lov_comp_md_v1(
                                (struct lov_comp_md_v1 *)lmm);

        *lmm_size = lmmsize;
static int ll_lov_setea(struct inode *inode, struct file *file,
                        void __user *arg)
        __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
        struct lov_user_md *lump;
        int lum_size = sizeof(struct lov_user_md) +
                       sizeof(struct lov_user_ost_data);
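        /* lum_size above is the lov_user_md header plus room for exactly one
         * lov_user_ost_data entry, i.e. the smallest EA that can describe a
         * single pre-created OST object for this setea call. */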
        if (!cfs_capable(CFS_CAP_SYS_ADMIN))

        OBD_ALLOC_LARGE(lump, lum_size);

        if (copy_from_user(lump, arg, lum_size))
                GOTO(out_lump, rc = -EFAULT);

        rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
        cl_lov_delay_create_clear(&file->f_flags);

        OBD_FREE_LARGE(lump, lum_size);

static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
        env = cl_env_get(&refcheck);
                RETURN(PTR_ERR(env));

        rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
        cl_env_put(env, &refcheck);

static int ll_lov_setstripe(struct inode *inode, struct file *file,
                            unsigned long arg)
        struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
        struct lov_user_md *klum;
        __u64 flags = FMODE_WRITE;

        rc = ll_copy_user_md(lum, &klum);

        rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,

        rc = put_user(0, &lum->lmm_stripe_count);

        rc = ll_layout_refresh(inode, &gen);

        rc = ll_file_getstripe(inode, arg, lum_size);

        cl_lov_delay_create_clear(&file->f_flags);

        OBD_FREE(klum, lum_size);
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_grouplock grouplock;

                CWARN("group id for group lock must not be 0\n");

        if (ll_file_nolock(file))
                RETURN(-EOPNOTSUPP);

        spin_lock(&lli->lli_lock);
        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                CWARN("group lock already exists with gid %lu\n",
                      fd->fd_grouplock.lg_gid);
                spin_unlock(&lli->lli_lock);

        LASSERT(fd->fd_grouplock.lg_lock == NULL);
        spin_unlock(&lli->lli_lock);

        /**
         * XXX: the group lock needs to protect all OST objects, while PFL
         * can add new OST objects during the IO, so we instantiate all OST
         * objects before getting the group lock.
         */
                struct cl_layout cl = {
                        .cl_is_composite = false,
                };
                struct lu_extent ext = {
                        .e_end = OBD_OBJECT_EOF,
                };

                env = cl_env_get(&refcheck);
                        RETURN(PTR_ERR(env));

                rc = cl_object_layout_get(env, obj, &cl);
                if (!rc && cl.cl_is_composite)
                        rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
                                                    &ext);

                cl_env_put(env, &refcheck);

        rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
                              arg, (file->f_flags & O_NONBLOCK), &grouplock);

        spin_lock(&lli->lli_lock);
        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                spin_unlock(&lli->lli_lock);
                CERROR("another thread just won the race\n");
                cl_put_grouplock(&grouplock);

        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
        fd->fd_grouplock = grouplock;
        spin_unlock(&lli->lli_lock);

        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);

static int ll_put_grouplock(struct inode *inode, struct file *file,
                            unsigned long arg)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_grouplock grouplock;

        spin_lock(&lli->lli_lock);
        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                spin_unlock(&lli->lli_lock);
                CWARN("no group lock held\n");

        LASSERT(fd->fd_grouplock.lg_lock != NULL);

        if (fd->fd_grouplock.lg_gid != arg) {
                CWARN("group lock %lu doesn't match current id %lu\n",
                      arg, fd->fd_grouplock.lg_gid);
                spin_unlock(&lli->lli_lock);

        grouplock = fd->fd_grouplock;
        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
        spin_unlock(&lli->lli_lock);

        cl_put_grouplock(&grouplock);
        CDEBUG(D_INFO, "group lock %lu released\n", arg);
2277 * Close inode open handle
2279 * \param dentry [in] dentry which contains the inode
2280 * \param it [in,out] intent which contains open info and result
2283 * \retval <0 failure
2285 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2287 struct inode *inode = dentry->d_inode;
2288 struct obd_client_handle *och;
2294 /* Root? Do nothing. */
2295 if (dentry->d_inode->i_sb->s_root == dentry)
2298 /* No open handle to close? Nothing to do. */
2299 if (!it_disposition(it, DISP_OPEN_OPEN))
2302 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2304 OBD_ALLOC(och, sizeof(*och));
2306 GOTO(out, rc = -ENOMEM);
2308 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2310 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2312 /* this one is in place of ll_file_open */
2313 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2314 ptlrpc_req_finished(it->it_request);
2315 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
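/*
 * Illustrative sketch only: a hypothetical caller that performed an
 * intent open it no longer needs would close the handle like this
 * (the follow-up ll_intent_release() is an assumption of this sketch):
 *
 *	rc = ll_release_openhandle(dentry, &it);
 *	ll_intent_release(&it);
 */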
2321 * Get the size of the inode for which the FIEMAP mapping is requested.
2322 * Make the FIEMAP get_info call and return the result.
2323 * \param fiemap kernel buffer to hold extents
2324 * \param num_bytes kernel buffer size
2326 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2332 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2335 /* Checks for fiemap flags */
2336 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2337 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2341 /* Check for FIEMAP_FLAG_SYNC */
2342 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2343 rc = filemap_fdatawrite(inode->i_mapping);
2348 env = cl_env_get(&refcheck);
2350 RETURN(PTR_ERR(env));
2352 if (i_size_read(inode) == 0) {
2353 rc = ll_glimpse_size(inode);
2358 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2359 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2360 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2362 /* If the file size is 0, there are no objects to map */
2363 if (fmkey.lfik_oa.o_size == 0) {
2364 fiemap->fm_mapped_extents = 0;
2368 fmkey.lfik_fiemap = *fiemap;
2370 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2371 &fmkey, fiemap, &num_bytes);
2373 cl_env_put(env, &refcheck);
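/*
 * Illustrative sketch only: the caller sizes the buffer for the number
 * of extents it wants back, mirroring what ll_fiemap() further below
 * does ("count" is assumed known to the caller):
 *
 *	size_t num_bytes = sizeof(struct fiemap) +
 *			   count * sizeof(struct fiemap_extent);
 *	fiemap->fm_extent_count = count;
 *	rc = ll_do_fiemap(inode, fiemap, num_bytes);
 */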
2377 int ll_fid2path(struct inode *inode, void __user *arg)
2379 struct obd_export *exp = ll_i2mdexp(inode);
2380 const struct getinfo_fid2path __user *gfin = arg;
2382 struct getinfo_fid2path *gfout;
2388 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2389 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2392 /* Only need to get the buflen */
2393 if (get_user(pathlen, &gfin->gf_pathlen))
2396 if (pathlen > PATH_MAX)
2399 outsize = sizeof(*gfout) + pathlen;
2400 OBD_ALLOC(gfout, outsize);
2404 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2405 GOTO(gf_free, rc = -EFAULT);
2406 /* Append the root FID after gfout so the MDT knows the root FID and
2407 * can look up the correct path; this is mainly for filesets.
2408 * Old servers without fileset mount support will ignore it. */
2409 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2411 /* Call mdc_iocontrol */
2412 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2416 if (copy_to_user(arg, gfout, outsize))
2420 OBD_FREE(gfout, outsize);
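/*
 * Illustrative sketch only: userspace reaches this through the
 * OBD_IOC_FID2PATH ioctl with a struct getinfo_fid2path sized for the
 * path it wants back. The FID is assumed known; gf_recno/gf_linkno
 * initial values here are assumptions of this sketch:
 *
 *	struct getinfo_fid2path *gf = malloc(sizeof(*gf) + PATH_MAX);
 *	gf->gf_fid = fid;
 *	gf->gf_recno = -1;
 *	gf->gf_linkno = 0;
 *	gf->gf_pathlen = PATH_MAX;
 *	ioctl(fd, OBD_IOC_FID2PATH, gf);
 */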
2425 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2427 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2435 ioc->idv_version = 0;
2436 ioc->idv_layout_version = UINT_MAX;
2438 /* If no file object has been initialized, consider its version to be 0. */
2442 env = cl_env_get(&refcheck);
2444 RETURN(PTR_ERR(env));
2446 io = vvp_env_thread_io(env);
2448 io->u.ci_data_version.dv_data_version = 0;
2449 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2450 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2453 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2454 result = cl_io_loop(env, io);
2456 result = io->ci_result;
2458 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2459 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2461 cl_io_fini(env, io);
2463 if (unlikely(io->ci_need_restart))
2466 cl_env_put(env, &refcheck);
2472 * Read the data_version for inode.
2474 * This value is computed using the stripe object versions on the OSTs.
2475 * The version is computed using server-side locking.
2477 * @param flags whether to sync on the OST side:
2479 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2480 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2482 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2484 struct ioc_data_version ioc = { .idv_flags = flags };
2487 rc = ll_ioc_data_version(inode, &ioc);
2489 *data_version = ioc.idv_version;
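/*
 * Illustrative sketch only: a caller that wants a version stable
 * against cached writes flushes dirty pages first:
 *
 *	__u64 dv = 0;
 *	rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
 *	if (rc == 0)
 *		CDEBUG(D_INODE, "data version %llu\n", dv);
 */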
2495 * Trigger an HSM release request for the provided inode.
2497 int ll_hsm_release(struct inode *inode)
2500 struct obd_client_handle *och = NULL;
2501 __u64 data_version = 0;
2506 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2507 ll_get_fsname(inode->i_sb, NULL, 0),
2508 PFID(&ll_i2info(inode)->lli_fid));
2510 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2512 GOTO(out, rc = PTR_ERR(och));
2514 /* Grab latest data_version and [am]time values */
2515 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2519 env = cl_env_get(&refcheck);
2521 GOTO(out, rc = PTR_ERR(env));
2523 rc = ll_merge_attr(env, inode);
2524 cl_env_put(env, &refcheck);
2526 /* If an error occurs, we have the wrong size for the file.
2532 /* Release the file.
2533 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2534 * we still need it to pack l_remote_handle to MDT. */
2535 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2541 if (och != NULL && !IS_ERR(och)) /* close the file */
2542 ll_lease_close(och, inode, NULL);
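/*
 * Illustrative sketch only: the release sequence above is
 * lease-open -> flush and sample data_version -> close with the
 * MDS_HSM_RELEASE bias; a hypothetical in-kernel caller simply does:
 *
 *	rc = ll_hsm_release(inode);
 *	if (rc)
 *		CDEBUG(D_HSM, "release failed: rc = %d\n", rc);
 */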
2547 struct ll_swap_stack {
2550 struct inode *inode1;
2551 struct inode *inode2;
2556 static int ll_swap_layouts(struct file *file1, struct file *file2,
2557 struct lustre_swap_layouts *lsl)
2559 struct mdc_swap_layouts msl;
2560 struct md_op_data *op_data;
2563 struct ll_swap_stack *llss = NULL;
2566 OBD_ALLOC_PTR(llss);
2570 llss->inode1 = file_inode(file1);
2571 llss->inode2 = file_inode(file2);
2573 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2577 /* we use two bools because they are easier to swap than two bits */
2578 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2579 llss->check_dv1 = true;
2581 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2582 llss->check_dv2 = true;
2584 /* we cannot use lsl->sl_dvX directly because we may swap them */
2585 llss->dv1 = lsl->sl_dv1;
2586 llss->dv2 = lsl->sl_dv2;
2588 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2589 if (rc == 0) /* same file, done! */
2592 if (rc < 0) { /* sequentialize it */
2593 swap(llss->inode1, llss->inode2);
2595 swap(llss->dv1, llss->dv2);
2596 swap(llss->check_dv1, llss->check_dv2);
2600 if (gid != 0) { /* application asks to flush dirty cache */
2601 rc = ll_get_grouplock(llss->inode1, file1, gid);
2605 rc = ll_get_grouplock(llss->inode2, file2, gid);
2607 ll_put_grouplock(llss->inode1, file1, gid);
2612 /* final check: before swapping the layouts we verify that the
2613 * data version has not changed (if requested) */
2614 if (llss->check_dv1) {
2615 rc = ll_data_version(llss->inode1, &dv, 0);
2618 if (dv != llss->dv1)
2619 GOTO(putgl, rc = -EAGAIN);
2622 if (llss->check_dv2) {
2623 rc = ll_data_version(llss->inode2, &dv, 0);
2626 if (dv != llss->dv2)
2627 GOTO(putgl, rc = -EAGAIN);
2630 /* struct md_op_data is used to send the swap args to the MDT;
2631 * only flags is missing, so we use struct mdc_swap_layouts
2632 * through md_op_data->op_data */
2633 /* flags from user space have to be converted before they are sent to
2634 * the server; no flag is sent today, they are only used on the client */
2637 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2638 0, LUSTRE_OPC_ANY, &msl);
2639 if (IS_ERR(op_data))
2640 GOTO(free, rc = PTR_ERR(op_data));
2642 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2643 sizeof(*op_data), op_data, NULL);
2644 ll_finish_md_op_data(op_data);
2651 ll_put_grouplock(llss->inode2, file2, gid);
2652 ll_put_grouplock(llss->inode1, file1, gid);
2662 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2664 struct md_op_data *op_data;
2668 /* Detect out-of-range masks */
2669 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2672 /* Non-root users are forbidden to set or clear flags which are
2673 * NOT defined in HSM_USER_MASK. */
2674 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2675 !cfs_capable(CFS_CAP_SYS_ADMIN))
2678 /* Detect out-of-range archive id */
2679 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2680 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2683 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2684 LUSTRE_OPC_ANY, hss);
2685 if (IS_ERR(op_data))
2686 RETURN(PTR_ERR(op_data));
2688 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2689 sizeof(*op_data), op_data, NULL);
2691 ll_finish_md_op_data(op_data);
2696 static int ll_hsm_import(struct inode *inode, struct file *file,
2697 struct hsm_user_import *hui)
2699 struct hsm_state_set *hss = NULL;
2700 struct iattr *attr = NULL;
2704 if (!S_ISREG(inode->i_mode))
2710 GOTO(out, rc = -ENOMEM);
2712 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2713 hss->hss_archive_id = hui->hui_archive_id;
2714 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2715 rc = ll_hsm_state_set(inode, hss);
2719 OBD_ALLOC_PTR(attr);
2721 GOTO(out, rc = -ENOMEM);
2723 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2724 attr->ia_mode |= S_IFREG;
2725 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2726 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2727 attr->ia_size = hui->hui_size;
2728 attr->ia_mtime.tv_sec = hui->hui_mtime;
2729 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2730 attr->ia_atime.tv_sec = hui->hui_atime;
2731 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2733 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2734 ATTR_UID | ATTR_GID |
2735 ATTR_MTIME | ATTR_MTIME_SET |
2736 ATTR_ATIME | ATTR_ATIME_SET;
2740 rc = ll_setattr_raw(file_dentry(file), attr, true);
2744 inode_unlock(inode);
2756 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2758 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2759 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2762 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2764 struct inode *inode = file_inode(file);
2766 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2767 ATTR_MTIME | ATTR_MTIME_SET |
2768 ATTR_CTIME | ATTR_CTIME_SET,
2770 .tv_sec = lfu->lfu_atime_sec,
2771 .tv_nsec = lfu->lfu_atime_nsec,
2774 .tv_sec = lfu->lfu_mtime_sec,
2775 .tv_nsec = lfu->lfu_mtime_nsec,
2778 .tv_sec = lfu->lfu_ctime_sec,
2779 .tv_nsec = lfu->lfu_ctime_nsec,
2785 if (!capable(CAP_SYS_ADMIN))
2788 if (!S_ISREG(inode->i_mode))
2792 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2793 inode_unlock(inode);
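/*
 * Illustrative sketch only: userspace fills all three timestamps and
 * issues LL_IOC_FUTIMES_3 (CAP_SYS_ADMIN required, regular files only;
 * "ts" is an arbitrary epoch value):
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = ts, .lfu_atime_nsec = 0,
 *		.lfu_mtime_sec = ts, .lfu_mtime_nsec = 0,
 *		.lfu_ctime_sec = ts, .lfu_ctime_nsec = 0,
 *	};
 *	ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */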
2798 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2801 case MODE_READ_USER:
2803 case MODE_WRITE_USER:
2810 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2812 /* Used to allow the upper layers of the client to request an LDLM lock
2813 * without doing an actual read or write.
2815 * Used for ladvise lockahead to manually request specific locks.
2817 * \param[in] file file this ladvise lock request is on
2818 * \param[in] ladvise ladvise struct describing this lock request
2820 * \retval 0 success, no detailed result available (sync requests
2821 * and requests sent to the server [not handled locally]
2822 * cannot return detailed results)
2823 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2824 * see definitions for details.
2825 * \retval negative negative errno on error
2827 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2829 struct lu_env *env = NULL;
2830 struct cl_io *io = NULL;
2831 struct cl_lock *lock = NULL;
2832 struct cl_lock_descr *descr = NULL;
2833 struct dentry *dentry = file->f_path.dentry;
2834 struct inode *inode = dentry->d_inode;
2835 enum cl_lock_mode cl_mode;
2836 off_t start = ladvise->lla_start;
2837 off_t end = ladvise->lla_end;
2843 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2844 "start=%llu, end=%llu\n", dentry->d_name.len,
2845 dentry->d_name.name, dentry->d_inode,
2846 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2849 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2851 GOTO(out, result = cl_mode);
2853 /* Get IO environment */
2854 result = cl_io_get(inode, &env, &io, &refcheck);
2858 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2861 * nothing to do for this io. This currently happens when
2862 * stripe sub-objects are not yet created.
2864 result = io->ci_result;
2865 } else if (result == 0) {
2866 lock = vvp_env_lock(env);
2867 descr = &lock->cll_descr;
2869 descr->cld_obj = io->ci_obj;
2870 /* Convert byte offsets to pages */
2871 descr->cld_start = cl_index(io->ci_obj, start);
2872 descr->cld_end = cl_index(io->ci_obj, end);
2873 descr->cld_mode = cl_mode;
2874 /* CEF_MUST is used because we do not want to convert a
2875 * lockahead request to a lockless lock */
2876 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2879 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2880 descr->cld_enq_flags |= CEF_SPECULATIVE;
2882 result = cl_lock_request(env, io, lock);
2884 /* On success, we need to release the lock */
2886 cl_lock_release(env, lock);
2888 cl_io_fini(env, io);
2889 cl_env_put(env, &refcheck);
2891 /* -ECANCELED indicates a matching lock with a different extent
2892 * was already present, and -EEXIST indicates a matching lock
2893 * on exactly the same extent was already present.
2894 * We convert them to positive values for userspace to make
2895 * recognizing true errors easier.
2896 * Note we can only return these detailed results on async requests,
2897 * as sync requests look the same as i/o requests for locking. */
2898 if (result == -ECANCELED)
2899 result = LLA_RESULT_DIFFERENT;
2900 else if (result == -EEXIST)
2901 result = LLA_RESULT_SAME;
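/*
 * Illustrative sketch only: a lockahead request arrives as one advice
 * of the LL_IOC_LADVISE ioctl handled below; these are the fields this
 * function reads (values are arbitrary examples):
 *
 *	struct llapi_lu_ladvise la = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,	// detailed results
 *		.lla_start           = 0,		// byte range
 *		.lla_end             = 1048576,
 *	};
 */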
2906 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2908 static int ll_ladvise_sanity(struct inode *inode,
2909 struct llapi_lu_ladvise *ladvise)
2911 enum lu_ladvise_type advice = ladvise->lla_advice;
2912 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags
2913 * must fit in the first 32 bits of enum ladvise_flags */
2914 __u32 flags = ladvise->lla_peradvice_flags;
2915 /* 3 lines at 80 characters per line, should be plenty */
2918 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2920 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2921 "last supported advice is %s (value '%d'): rc = %d\n",
2922 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2923 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2927 /* Per-advice checks */
2929 case LU_LADVISE_LOCKNOEXPAND:
2930 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2932 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2934 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2935 ladvise_names[advice], rc);
2939 case LU_LADVISE_LOCKAHEAD:
2940 /* Currently only READ and WRITE modes can be requested */
2941 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2942 ladvise->lla_lockahead_mode == 0) {
2944 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2946 ll_get_fsname(inode->i_sb, NULL, 0),
2947 ladvise->lla_lockahead_mode,
2948 ladvise_names[advice], rc);
2951 case LU_LADVISE_WILLREAD:
2952 case LU_LADVISE_DONTNEED:
2954 /* Note fall through above - These checks apply to all advices
2955 * except LOCKNOEXPAND */
2956 if (flags & ~LF_DEFAULT_MASK) {
2958 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2960 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2961 ladvise_names[advice], rc);
2964 if (ladvise->lla_start >= ladvise->lla_end) {
2966 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2967 "for %s: rc = %d\n",
2968 ll_get_fsname(inode->i_sb, NULL, 0),
2969 ladvise->lla_start, ladvise->lla_end,
2970 ladvise_names[advice], rc);
2982 * Give file access advices
2984 * The ladvise interface is similar to the Linux fadvise() system call,
2985 * except it forwards the advice directly from the Lustre client to the
2986 * server. The server-side code applies appropriate read-ahead and
2987 * caching techniques for the corresponding files.
2989 * A typical ladvise workload: many different clients do small random
2990 * reads of a file, so prefetching pages into the OSS cache with big
2991 * linear reads before the random IO is a net benefit. Fetching all
2992 * that data into each client cache with fadvise() may not be, due to
2993 * much more data being sent to the client.
2995 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2996 struct llapi_lu_ladvise *ladvise)
3000 struct cl_ladvise_io *lio;
3005 env = cl_env_get(&refcheck);
3007 RETURN(PTR_ERR(env));
3009 io = vvp_env_thread_io(env);
3010 io->ci_obj = ll_i2info(inode)->lli_clob;
3012 /* initialize parameters for ladvise */
3013 lio = &io->u.ci_ladvise;
3014 lio->li_start = ladvise->lla_start;
3015 lio->li_end = ladvise->lla_end;
3016 lio->li_fid = ll_inode2fid(inode);
3017 lio->li_advice = ladvise->lla_advice;
3018 lio->li_flags = flags;
3020 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3021 rc = cl_io_loop(env, io);
3025 cl_io_fini(env, io);
3026 cl_env_put(env, &refcheck);
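/*
 * Illustrative sketch only: userspace sends a header followed by
 * lah_count advices; a single WILLREAD advice would look roughly like
 * this ("len" is assumed known to the caller):
 *
 *	struct llapi_ladvise_hdr *hdr =
 *		malloc(offsetof(struct llapi_ladvise_hdr, lah_advise[1]));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_flags = 0;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = len;
 *	ioctl(fd, LL_IOC_LADVISE, hdr);
 */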
3030 static int ll_lock_noexpand(struct file *file, int flags)
3032 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3034 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3039 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3042 struct fsxattr fsxattr;
3044 if (copy_from_user(&fsxattr,
3045 (const struct fsxattr __user *)arg,
3049 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
3050 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3051 if (copy_to_user((struct fsxattr __user *)arg,
3052 &fsxattr, sizeof(fsxattr)))
3058 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3062 struct md_op_data *op_data;
3063 struct ptlrpc_request *req = NULL;
3065 struct fsxattr fsxattr;
3066 struct cl_object *obj;
3068 /* only root can change the project ID */
3069 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3072 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3073 LUSTRE_OPC_ANY, NULL);
3074 if (IS_ERR(op_data))
3075 RETURN(PTR_ERR(op_data));
3077 if (copy_from_user(&fsxattr,
3078 (const struct fsxattr __user *)arg,
3080 GOTO(out_fsxattr1, rc = -EFAULT);
3082 op_data->op_attr_flags = fsxattr.fsx_xflags;
3083 op_data->op_projid = fsxattr.fsx_projid;
3084 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3085 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3087 ptlrpc_req_finished(req);
3089 obj = ll_i2info(inode)->lli_clob;
3093 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
3094 OBD_ALLOC_PTR(attr);
3096 GOTO(out_fsxattr1, rc = -ENOMEM);
3097 attr->ia_valid = ATTR_ATTR_FLAG;
3098 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3103 ll_finish_md_op_data(op_data);
3107 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3110 struct inode *inode = file_inode(file);
3111 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3112 struct ll_inode_info *lli = ll_i2info(inode);
3113 struct obd_client_handle *och = NULL;
3114 struct split_param sp;
3117 enum mds_op_bias bias = 0;
3118 struct file *layout_file = NULL;
3120 size_t data_size = 0;
3124 mutex_lock(&lli->lli_och_mutex);
3125 if (fd->fd_lease_och != NULL) {
3126 och = fd->fd_lease_och;
3127 fd->fd_lease_och = NULL;
3129 mutex_unlock(&lli->lli_och_mutex);
3132 GOTO(out, rc = -ENOLCK);
3134 fmode = och->och_flags;
3136 switch (ioc->lil_flags) {
3137 case LL_LEASE_RESYNC_DONE:
3138 if (ioc->lil_count > IOC_IDS_MAX)
3139 GOTO(out, rc = -EINVAL);
3141 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3142 OBD_ALLOC(data, data_size);
3144 GOTO(out, rc = -ENOMEM);
3146 if (copy_from_user(data, (void __user *)arg, data_size))
3147 GOTO(out, rc = -EFAULT);
3149 bias = MDS_CLOSE_RESYNC_DONE;
3151 case LL_LEASE_LAYOUT_MERGE: {
3154 if (ioc->lil_count != 1)
3155 GOTO(out, rc = -EINVAL);
3157 arg += sizeof(*ioc);
3158 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3159 GOTO(out, rc = -EFAULT);
3161 layout_file = fget(fd);
3163 GOTO(out, rc = -EBADF);
3165 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3166 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3167 GOTO(out, rc = -EPERM);
3169 data = file_inode(layout_file);
3170 bias = MDS_CLOSE_LAYOUT_MERGE;
3173 case LL_LEASE_LAYOUT_SPLIT: {
3177 if (ioc->lil_count != 2)
3178 GOTO(out, rc = -EINVAL);
3180 arg += sizeof(*ioc);
3181 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3182 GOTO(out, rc = -EFAULT);
3184 arg += sizeof(__u32);
3185 if (copy_from_user(&mirror_id, (void __user *)arg,
3187 GOTO(out, rc = -EFAULT);
3189 layout_file = fget(fdv);
3191 GOTO(out, rc = -EBADF);
3193 sp.sp_inode = file_inode(layout_file);
3194 sp.sp_mirror_id = (__u16)mirror_id;
3196 bias = MDS_CLOSE_LAYOUT_SPLIT;
3200 /* without close intent */
3204 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3208 rc = ll_lease_och_release(inode, file);
3217 switch (ioc->lil_flags) {
3218 case LL_LEASE_RESYNC_DONE:
3220 OBD_FREE(data, data_size);
3222 case LL_LEASE_LAYOUT_MERGE:
3223 case LL_LEASE_LAYOUT_SPLIT:
3230 rc = ll_lease_type_from_fmode(fmode);
3234 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3237 struct inode *inode = file_inode(file);
3238 struct ll_inode_info *lli = ll_i2info(inode);
3239 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3240 struct obd_client_handle *och = NULL;
3241 __u64 open_flags = 0;
3247 switch (ioc->lil_mode) {
3248 case LL_LEASE_WRLCK:
3249 if (!(file->f_mode & FMODE_WRITE))
3251 fmode = FMODE_WRITE;
3253 case LL_LEASE_RDLCK:
3254 if (!(file->f_mode & FMODE_READ))
3258 case LL_LEASE_UNLCK:
3259 RETURN(ll_file_unlock_lease(file, ioc, arg));
3264 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3266 /* apply for lease */
3267 if (ioc->lil_flags & LL_LEASE_RESYNC)
3268 open_flags = MDS_OPEN_RESYNC;
3269 och = ll_lease_open(inode, file, fmode, open_flags);
3271 RETURN(PTR_ERR(och));
3273 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3274 rc = ll_lease_file_resync(och, inode);
3276 ll_lease_close(och, inode, NULL);
3279 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3281 ll_lease_close(och, inode, NULL);
3287 mutex_lock(&lli->lli_och_mutex);
3288 if (fd->fd_lease_och == NULL) {
3289 fd->fd_lease_och = och;
3292 mutex_unlock(&lli->lli_och_mutex);
3294 /* should not happen, since only exclusive leases are supported for now */
3295 ll_lease_close(och, inode, &lease_broken);
3302 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3304 struct inode *inode = file_inode(file);
3305 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3309 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3310 PFID(ll_inode2fid(inode)), inode, cmd);
3311 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3313 /* asm-ppc{,64} declares TCGETS et al. as type 't', not 'T' */
3314 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3318 case LL_IOC_GETFLAGS:
3319 /* Get the current value of the file flags */
3320 return put_user(fd->fd_flags, (int __user *)arg);
3321 case LL_IOC_SETFLAGS:
3322 case LL_IOC_CLRFLAGS:
3323 /* Set or clear specific file flags */
3324 /* XXX This probably needs checks to ensure the flags are
3325 * not abused, and to handle any flag side effects.
3327 if (get_user(flags, (int __user *) arg))
3330 if (cmd == LL_IOC_SETFLAGS) {
3331 if ((flags & LL_FILE_IGNORE_LOCK) &&
3332 !(file->f_flags & O_DIRECT)) {
3333 CERROR("%s: unable to disable locking on "
3334 "non-O_DIRECT file\n", current->comm);
3338 fd->fd_flags |= flags;
3340 fd->fd_flags &= ~flags;
3343 case LL_IOC_LOV_SETSTRIPE:
3344 case LL_IOC_LOV_SETSTRIPE_NEW:
3345 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3346 case LL_IOC_LOV_SETEA:
3347 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3348 case LL_IOC_LOV_SWAP_LAYOUTS: {
3350 struct lustre_swap_layouts lsl;
3352 if (copy_from_user(&lsl, (char __user *)arg,
3353 sizeof(struct lustre_swap_layouts)))
3356 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3359 file2 = fget(lsl.sl_fd);
3363 /* O_WRONLY or O_RDWR */
3364 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3365 GOTO(out, rc = -EPERM);
3367 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3368 struct inode *inode2;
3369 struct ll_inode_info *lli;
3370 struct obd_client_handle *och = NULL;
3372 lli = ll_i2info(inode);
3373 mutex_lock(&lli->lli_och_mutex);
3374 if (fd->fd_lease_och != NULL) {
3375 och = fd->fd_lease_och;
3376 fd->fd_lease_och = NULL;
3378 mutex_unlock(&lli->lli_och_mutex);
3380 GOTO(out, rc = -ENOLCK);
3381 inode2 = file_inode(file2);
3382 rc = ll_swap_layouts_close(och, inode, inode2);
3384 rc = ll_swap_layouts(file, file2, &lsl);
3390 case LL_IOC_LOV_GETSTRIPE:
3391 case LL_IOC_LOV_GETSTRIPE_NEW:
3392 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3393 case FS_IOC_GETFLAGS:
3394 case FS_IOC_SETFLAGS:
3395 RETURN(ll_iocontrol(inode, file, cmd, arg));
3396 case FSFILT_IOC_GETVERSION:
3397 case FS_IOC_GETVERSION:
3398 RETURN(put_user(inode->i_generation, (int __user *)arg));
3399 /* We need to special-case any other ioctls we want to handle,
3400 * to send them to the MDS/OST as appropriate and to properly
3401 * network-encode the arg field. */
3402 case FS_IOC_SETVERSION:
3405 case LL_IOC_GROUP_LOCK:
3406 RETURN(ll_get_grouplock(inode, file, arg));
3407 case LL_IOC_GROUP_UNLOCK:
3408 RETURN(ll_put_grouplock(inode, file, arg));
3409 case IOC_OBD_STATFS:
3410 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3412 case LL_IOC_FLUSHCTX:
3413 RETURN(ll_flush_ctx(inode));
3414 case LL_IOC_PATH2FID: {
3415 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3416 sizeof(struct lu_fid)))
3421 case LL_IOC_GETPARENT:
3422 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3424 case OBD_IOC_FID2PATH:
3425 RETURN(ll_fid2path(inode, (void __user *)arg));
3426 case LL_IOC_DATA_VERSION: {
3427 struct ioc_data_version idv;
3430 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3433 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3434 rc = ll_ioc_data_version(inode, &idv);
3437 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3443 case LL_IOC_GET_MDTIDX: {
3446 mdtidx = ll_get_mdt_idx(inode);
3450 if (put_user((int)mdtidx, (int __user *)arg))
3455 case OBD_IOC_GETDTNAME:
3456 case OBD_IOC_GETMDNAME:
3457 RETURN(ll_get_obd_name(inode, cmd, arg));
3458 case LL_IOC_HSM_STATE_GET: {
3459 struct md_op_data *op_data;
3460 struct hsm_user_state *hus;
3467 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3468 LUSTRE_OPC_ANY, hus);
3469 if (IS_ERR(op_data)) {
3471 RETURN(PTR_ERR(op_data));
3474 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3477 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3480 ll_finish_md_op_data(op_data);
3484 case LL_IOC_HSM_STATE_SET: {
3485 struct hsm_state_set *hss;
3492 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3497 rc = ll_hsm_state_set(inode, hss);
3502 case LL_IOC_HSM_ACTION: {
3503 struct md_op_data *op_data;
3504 struct hsm_current_action *hca;
3511 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3512 LUSTRE_OPC_ANY, hca);
3513 if (IS_ERR(op_data)) {
3515 RETURN(PTR_ERR(op_data));
3518 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3521 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3524 ll_finish_md_op_data(op_data);
3528 case LL_IOC_SET_LEASE_OLD: {
3529 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3531 RETURN(ll_file_set_lease(file, &ioc, 0));
3533 case LL_IOC_SET_LEASE: {
3534 struct ll_ioc_lease ioc;
3536 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3539 RETURN(ll_file_set_lease(file, &ioc, arg));
3541 case LL_IOC_GET_LEASE: {
3542 struct ll_inode_info *lli = ll_i2info(inode);
3543 struct ldlm_lock *lock = NULL;
3546 mutex_lock(&lli->lli_och_mutex);
3547 if (fd->fd_lease_och != NULL) {
3548 struct obd_client_handle *och = fd->fd_lease_och;
3550 lock = ldlm_handle2lock(&och->och_lease_handle);
3552 lock_res_and_lock(lock);
3553 if (!ldlm_is_cancel(lock))
3554 fmode = och->och_flags;
3556 unlock_res_and_lock(lock);
3557 LDLM_LOCK_PUT(lock);
3560 mutex_unlock(&lli->lli_och_mutex);
3562 RETURN(ll_lease_type_from_fmode(fmode));
3564 case LL_IOC_HSM_IMPORT: {
3565 struct hsm_user_import *hui;
3571 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3576 rc = ll_hsm_import(inode, file, hui);
3581 case LL_IOC_FUTIMES_3: {
3582 struct ll_futimes_3 lfu;
3584 if (copy_from_user(&lfu,
3585 (const struct ll_futimes_3 __user *)arg,
3589 RETURN(ll_file_futimes_3(file, &lfu));
3591 case LL_IOC_LADVISE: {
3592 struct llapi_ladvise_hdr *k_ladvise_hdr;
3593 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3596 int alloc_size = sizeof(*k_ladvise_hdr);
3599 u_ladvise_hdr = (void __user *)arg;
3600 OBD_ALLOC_PTR(k_ladvise_hdr);
3601 if (k_ladvise_hdr == NULL)
3604 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3605 GOTO(out_ladvise, rc = -EFAULT);
3607 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3608 k_ladvise_hdr->lah_count < 1)
3609 GOTO(out_ladvise, rc = -EINVAL);
3611 num_advise = k_ladvise_hdr->lah_count;
3612 if (num_advise >= LAH_COUNT_MAX)
3613 GOTO(out_ladvise, rc = -EFBIG);
3615 OBD_FREE_PTR(k_ladvise_hdr);
3616 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3617 lah_advise[num_advise]);
3618 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3619 if (k_ladvise_hdr == NULL)
3623 * TODO: submit multiple advices to one server in a single RPC
3625 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3626 GOTO(out_ladvise, rc = -EFAULT);
3628 for (i = 0; i < num_advise; i++) {
3629 struct llapi_lu_ladvise *k_ladvise =
3630 &k_ladvise_hdr->lah_advise[i];
3631 struct llapi_lu_ladvise __user *u_ladvise =
3632 &u_ladvise_hdr->lah_advise[i];
3634 rc = ll_ladvise_sanity(inode, k_ladvise);
3636 GOTO(out_ladvise, rc);
3638 switch (k_ladvise->lla_advice) {
3639 case LU_LADVISE_LOCKNOEXPAND:
3640 rc = ll_lock_noexpand(file,
3641 k_ladvise->lla_peradvice_flags);
3642 GOTO(out_ladvise, rc);
3643 case LU_LADVISE_LOCKAHEAD:
3645 rc = ll_file_lock_ahead(file, k_ladvise);
3648 GOTO(out_ladvise, rc);
3651 &u_ladvise->lla_lockahead_result))
3652 GOTO(out_ladvise, rc = -EFAULT);
3655 rc = ll_ladvise(inode, file,
3656 k_ladvise_hdr->lah_flags,
3659 GOTO(out_ladvise, rc);
3666 OBD_FREE(k_ladvise_hdr, alloc_size);
3669 case LL_IOC_FLR_SET_MIRROR: {
3670 /* mirror I/O must be direct to avoid polluting page cache
3672 if (!(file->f_flags & O_DIRECT))
3675 fd->fd_designated_mirror = (__u32)arg;
3678 case LL_IOC_FSGETXATTR:
3679 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3680 case LL_IOC_FSSETXATTR:
3681 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3683 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3685 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3686 (void __user *)arg));
3690 #ifndef HAVE_FILE_LLSEEK_SIZE
3691 static inline loff_t
3692 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3694 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3696 if (offset > maxsize)
3699 if (offset != file->f_pos) {
3700 file->f_pos = offset;
3701 file->f_version = 0;
3707 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3708 loff_t maxsize, loff_t eof)
3710 struct inode *inode = file_inode(file);
3718 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3719 * position-querying operation. Avoid rewriting the "same"
3720 * f_pos value back to the file because a concurrent read(),
3721 * write() or lseek() might have altered it
3726 * f_lock protects against read/modify/write race with other
3727 * SEEK_CURs. Note that parallel writes and reads behave
3731 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3732 inode_unlock(inode);
3736 * In the generic case the entire file is data, so as long as
3737 * offset isn't at the end of the file then the offset is data.
3744 * There is a virtual hole at the end of the file, so as long as
3745 * offset isn't i_size or larger, return i_size.
3753 return llseek_execute(file, offset, maxsize);
3757 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3759 struct inode *inode = file_inode(file);
3760 loff_t retval, eof = 0;
3763 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3764 (origin == SEEK_CUR) ? file->f_pos : 0);
3765 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3766 PFID(ll_inode2fid(inode)), inode, retval, retval,
3768 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3770 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3771 retval = ll_glimpse_size(inode);
3774 eof = i_size_read(inode);
3777 retval = ll_generic_file_llseek_size(file, offset, origin,
3778 ll_file_maxbytes(inode), eof);
3782 static int ll_flush(struct file *file, fl_owner_t id)
3784 struct inode *inode = file_inode(file);
3785 struct ll_inode_info *lli = ll_i2info(inode);
3786 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3789 LASSERT(!S_ISDIR(inode->i_mode));
3791 /* catch async errors that were recorded back when async writeback
3792 * failed for pages in this mapping. */
3793 rc = lli->lli_async_rc;
3794 lli->lli_async_rc = 0;
3795 if (lli->lli_clob != NULL) {
3796 err = lov_read_and_clear_async_rc(lli->lli_clob);
3801 /* The application has already been told about the write failure.
3802 * Do not report it again. */
3803 if (fd->fd_write_failed)
3805 return rc ? -EIO : 0;
3809 * Called to make sure a portion of the file has been written out.
3810 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3812 * Return how many pages have been written.
3814 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3815 enum cl_fsync_mode mode, int ignore_layout)
3819 struct cl_fsync_io *fio;
3824 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3825 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3828 env = cl_env_get(&refcheck);
3830 RETURN(PTR_ERR(env));
3832 io = vvp_env_thread_io(env);
3833 io->ci_obj = ll_i2info(inode)->lli_clob;
3834 io->ci_ignore_layout = ignore_layout;
3836 /* initialize parameters for sync */
3837 fio = &io->u.ci_fsync;
3838 fio->fi_start = start;
3840 fio->fi_fid = ll_inode2fid(inode);
3841 fio->fi_mode = mode;
3842 fio->fi_nr_written = 0;
3844 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3845 result = cl_io_loop(env, io);
3847 result = io->ci_result;
3849 result = fio->fi_nr_written;
3850 cl_io_fini(env, io);
3851 cl_env_put(env, &refcheck);
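/*
 * Illustrative sketch only: a whole-file flush that also forces the
 * OSTs to commit, as ll_fsync() below does for regular files:
 *
 *	int nr = cl_sync_file_range(inode, 0, LLONG_MAX, CL_FSYNC_ALL, 0);
 *	if (nr < 0)
 *		CERROR("sync failed: rc = %d\n", nr);
 *	// on success, nr is the number of pages written
 */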
3857 * When dentry is provided (the 'else' case), file_dentry() may be
3858 * null and dentry must be used directly rather than pulled from
3859 * file_dentry() as is done otherwise.
3862 #ifdef HAVE_FILE_FSYNC_4ARGS
3863 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3865 struct dentry *dentry = file_dentry(file);
3867 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3868 int ll_fsync(struct file *file, int datasync)
3870 struct dentry *dentry = file_dentry(file);
3872 loff_t end = LLONG_MAX;
3874 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3877 loff_t end = LLONG_MAX;
3879 struct inode *inode = dentry->d_inode;
3880 struct ll_inode_info *lli = ll_i2info(inode);
3881 struct ptlrpc_request *req;
3885 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3886 PFID(ll_inode2fid(inode)), inode);
3887 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3889 #ifdef HAVE_FILE_FSYNC_4ARGS
3890 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3891 lock_inode = !lli->lli_inode_locked;
3895 /* fsync's caller has already called _fdata{sync,write}; we want
3896 * that I/O to finish before calling the OSC and MDC sync methods */
3897 rc = filemap_fdatawait(inode->i_mapping);
3900 /* catch async errors that were recorded back when async writeback
3901 * failed for pages in this mapping. */
3902 if (!S_ISDIR(inode->i_mode)) {
3903 err = lli->lli_async_rc;
3904 lli->lli_async_rc = 0;
3907 if (lli->lli_clob != NULL) {
3908 err = lov_read_and_clear_async_rc(lli->lli_clob);
3914 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3918 ptlrpc_req_finished(req);
3920 if (S_ISREG(inode->i_mode)) {
3921 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3923 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3924 if (rc == 0 && err < 0)
3927 fd->fd_write_failed = true;
3929 fd->fd_write_failed = false;
3932 #ifdef HAVE_FILE_FSYNC_4ARGS
3934 inode_unlock(inode);
3940 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3942 struct inode *inode = file_inode(file);
3943 struct ll_sb_info *sbi = ll_i2sbi(inode);
3944 struct ldlm_enqueue_info einfo = {
3945 .ei_type = LDLM_FLOCK,
3946 .ei_cb_cp = ldlm_flock_completion_ast,
3947 .ei_cbdata = file_lock,
3949 struct md_op_data *op_data;
3950 struct lustre_handle lockh = { 0 };
3951 union ldlm_policy_data flock = { { 0 } };
3952 int fl_type = file_lock->fl_type;
3958 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3959 PFID(ll_inode2fid(inode)), file_lock);
3961 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3963 if (file_lock->fl_flags & FL_FLOCK) {
3964 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3965 /* flocks are whole-file locks */
3966 flock.l_flock.end = OFFSET_MAX;
3967 /* For flocks the owner is determined by the local file descriptor */
3968 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3969 } else if (file_lock->fl_flags & FL_POSIX) {
3970 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3971 flock.l_flock.start = file_lock->fl_start;
3972 flock.l_flock.end = file_lock->fl_end;
3976 flock.l_flock.pid = file_lock->fl_pid;
3978 /* Somewhat ugly workaround for svc lockd.
3979 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3980 * that the fl_owner is the same (which it always is on the local node,
3981 * I guess, between lockd processes) and then compares the pid.
3982 * As such we assign the pid to the owner field to make it all work;
3983 * conflict with normal locks is unlikely since the pid space and the
3984 * pointer space for current->files do not intersect */
3985 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3986 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3990 einfo.ei_mode = LCK_PR;
3993 /* An unlock request may or may not have any relation to
3994 * existing locks so we may not be able to pass a lock handle
3995 * via a normal ldlm_lock_cancel() request. The request may even
3996 * unlock a byte range in the middle of an existing lock. In
3997 * order to process an unlock request we need all of the same
3998 * information that is given with a normal read or write record
3999 * lock request. To avoid creating another ldlm unlock (cancel)
4000 * message we'll treat a LCK_NL flock request as an unlock. */
4001 einfo.ei_mode = LCK_NL;
4004 einfo.ei_mode = LCK_PW;
4007 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4022 flags = LDLM_FL_BLOCK_NOWAIT;
4028 flags = LDLM_FL_TEST_LOCK;
4031 CERROR("unknown fcntl lock command: %d\n", cmd);
4035 /* Save the old mode so that if the mode in the lock changes we
4036 * can decrement the appropriate reader or writer refcount. */
4037 file_lock->fl_type = einfo.ei_mode;
4039 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4040 LUSTRE_OPC_ANY, NULL);
4041 if (IS_ERR(op_data))
4042 RETURN(PTR_ERR(op_data));
4044 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4045 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4046 flock.l_flock.pid, flags, einfo.ei_mode,
4047 flock.l_flock.start, flock.l_flock.end);
4049 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4052 /* Restore the file lock type if not TEST lock. */
4053 if (!(flags & LDLM_FL_TEST_LOCK))
4054 file_lock->fl_type = fl_type;
4056 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4057 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4058 !(flags & LDLM_FL_TEST_LOCK))
4059 rc2 = locks_lock_file_wait(file, file_lock);
4061 if ((file_lock->fl_flags & FL_FLOCK) &&
4062 (rc == 0 || file_lock->fl_type == F_UNLCK))
4063 rc2 = flock_lock_file_wait(file, file_lock);
4064 if ((file_lock->fl_flags & FL_POSIX) &&
4065 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4066 !(flags & LDLM_FL_TEST_LOCK))
4067 rc2 = posix_lock_file_wait(file, file_lock);
4068 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4070 if (rc2 && file_lock->fl_type != F_UNLCK) {
4071 einfo.ei_mode = LCK_NL;
4072 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4077 ll_finish_md_op_data(op_data);
4082 int ll_get_fid_by_name(struct inode *parent, const char *name,
4083 int namelen, struct lu_fid *fid,
4084 struct inode **inode)
4086 struct md_op_data *op_data = NULL;
4087 struct mdt_body *body;
4088 struct ptlrpc_request *req;
4092 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4093 LUSTRE_OPC_ANY, NULL);
4094 if (IS_ERR(op_data))
4095 RETURN(PTR_ERR(op_data));
4097 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4098 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4099 ll_finish_md_op_data(op_data);
4103 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4105 GOTO(out_req, rc = -EFAULT);
4107 *fid = body->mbo_fid1;
4110 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4112 ptlrpc_req_finished(req);
4116 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4119 struct dentry *dchild = NULL;
4120 struct inode *child_inode = NULL;
4121 struct md_op_data *op_data;
4122 struct ptlrpc_request *request = NULL;
4123 struct obd_client_handle *och = NULL;
4125 struct mdt_body *body;
4126 __u64 data_version = 0;
4127 size_t namelen = strlen(name);
4128 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4132 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4133 PFID(ll_inode2fid(parent)), name,
4134 lum->lum_stripe_offset, lum->lum_stripe_count);
4136 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4137 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4138 lustre_swab_lmv_user_md(lum);
4140 /* Get child FID first */
4141 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4144 dchild = d_lookup(file_dentry(file), &qstr);
4146 if (dchild->d_inode)
4147 child_inode = igrab(dchild->d_inode);
4152 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4162 * lfs migrate command needs to be blocked on the client
4163 * by checking the migrate FID against the FID of the filesystem root.
4166 if (child_inode == parent->i_sb->s_root->d_inode)
4167 GOTO(out_iput, rc = -EINVAL);
4169 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4170 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4171 if (IS_ERR(op_data))
4172 GOTO(out_iput, rc = PTR_ERR(op_data));
4174 inode_lock(child_inode);
4175 op_data->op_fid3 = *ll_inode2fid(child_inode);
4176 if (!fid_is_sane(&op_data->op_fid3)) {
4177 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4178 ll_get_fsname(parent->i_sb, NULL, 0), name,
4179 PFID(&op_data->op_fid3));
4180 GOTO(out_unlock, rc = -EINVAL);
4183 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4184 op_data->op_data = lum;
4185 op_data->op_data_size = lumlen;
4188 if (S_ISREG(child_inode->i_mode)) {
4189 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4193 GOTO(out_unlock, rc);
4196 rc = ll_data_version(child_inode, &data_version,
4199 GOTO(out_close, rc);
4201 op_data->op_handle = och->och_fh;
4202 op_data->op_data_version = data_version;
4203 op_data->op_lease_handle = och->och_lease_handle;
4204 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4206 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4207 och->och_mod->mod_open_req->rq_replay = 0;
4208 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4211 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4212 name, namelen, &request);
4214 LASSERT(request != NULL);
4215 ll_update_times(request, parent);
4217 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4218 LASSERT(body != NULL);
4220 /* If the server did release the layout lock, then we clean up
4221 * the client och here; otherwise release it at out_close: */
4222 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4223 obd_mod_put(och->och_mod);
4224 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4226 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4232 if (request != NULL) {
4233 ptlrpc_req_finished(request);
4237 /* Try again if the file layout has changed. */
4238 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4243 ll_lease_close(och, child_inode, NULL);
4245 clear_nlink(child_inode);
4247 inode_unlock(child_inode);
4248 ll_finish_md_op_data(op_data);
4255 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4263 * test if some locks matching bits and l_req_mode are acquired
4264 * - bits can be in different locks
4265 * - if found, clear the common lock bits in *bits
4266 * - the bits not found are kept in *bits
4268 * \param bits [IN] searched lock bits
4269 * \param l_req_mode [IN] searched lock mode
4270 * \retval boolean, true iff all bits are found
4272 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4274 struct lustre_handle lockh;
4275 union ldlm_policy_data policy;
4276 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4277 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4286 fid = &ll_i2info(inode)->lli_fid;
4287 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4288 ldlm_lockname[mode]);
4290 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4291 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4292 policy.l_inodebits.bits = *bits & (1 << i);
4293 if (policy.l_inodebits.bits == 0)
4296 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4297 &policy, mode, &lockh)) {
4298 struct ldlm_lock *lock;
4300 lock = ldlm_handle2lock(&lockh);
4303 ~(lock->l_policy_data.l_inodebits.bits);
4304 LDLM_LOCK_PUT(lock);
4306 *bits &= ~policy.l_inodebits.bits;
4313 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4314 struct lustre_handle *lockh, __u64 flags,
4315 enum ldlm_mode mode)
4317 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4322 fid = &ll_i2info(inode)->lli_fid;
4323 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4325 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4326 fid, LDLM_IBITS, &policy, mode, lockh);
4331 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4333 /* Already unlinked. Just update nlink and return success */
4334 if (rc == -ENOENT) {
4336 /* If it is a striped directory and there is a bad stripe,
4337 * let's revalidate the dentry again instead of returning an error
4339 if (S_ISDIR(inode->i_mode) &&
4340 ll_i2info(inode)->lli_lsm_md != NULL)
4343 /* This path cannot be hit for regular files unless in
4344 * case of obscure races, so no need to validate
4346 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4348 } else if (rc != 0) {
4349 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4350 "%s: revalidate FID "DFID" error: rc = %d\n",
4351 ll_get_fsname(inode->i_sb, NULL, 0),
4352 PFID(ll_inode2fid(inode)), rc);
4358 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4360 struct inode *inode = dentry->d_inode;
4361 struct obd_export *exp = ll_i2mdexp(inode);
4362 struct lookup_intent oit = {
4365 struct ptlrpc_request *req = NULL;
4366 struct md_op_data *op_data;
4370 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4371 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4373 /* Call getattr by fid, so do not provide name at all. */
4374 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4375 LUSTRE_OPC_ANY, NULL);
4376 if (IS_ERR(op_data))
4377 RETURN(PTR_ERR(op_data));
4379 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4380 ll_finish_md_op_data(op_data);
4382 rc = ll_inode_revalidate_fini(inode, rc);
4386 rc = ll_revalidate_it_finish(req, &oit, dentry);
4388 ll_intent_release(&oit);
4392 /* Unlinked? Unhash dentry, so it is not picked up later by
4393 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4394 * here to preserve get_cwd functionality on 2.6.
4396 if (!dentry->d_inode->i_nlink) {
4397 ll_lock_dcache(inode);
4398 d_lustre_invalidate(dentry, 0);
4399 ll_unlock_dcache(inode);
4402 ll_lookup_finish_locks(&oit, dentry);
4404 ptlrpc_req_finished(req);
4409 static int ll_merge_md_attr(struct inode *inode)
4411 struct cl_attr attr = { 0 };
4414 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4415 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4416 &attr, ll_md_blocking_ast);
4420 set_nlink(inode, attr.cat_nlink);
4421 inode->i_blocks = attr.cat_blocks;
4422 i_size_write(inode, attr.cat_size);
4424 ll_i2info(inode)->lli_atime = attr.cat_atime;
4425 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4426 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4431 static inline dev_t ll_compat_encode_dev(dev_t dev)
4433 /* The compat_sys_*stat*() syscalls will fail unless the
4434 * device majors and minors are both less than 256. Note that
4435 * the value returned here will be passed through
4436 * old_encode_dev() in cp_compat_stat(). And so we are not
4437 * trying to return a valid compat (u16) device number, just
4438 * one that will pass the old_valid_dev() check. */
4440 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
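/*
 * Illustrative example: MKDEV(0x123, 0x456) encodes to MKDEV(0x23, 0x56);
 * the compat value is not the real device number, it merely passes the
 * old_valid_dev() check as described above.
 */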
4443 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4444 int ll_getattr(const struct path *path, struct kstat *stat,
4445 u32 request_mask, unsigned int flags)
4447 struct dentry *de = path->dentry;
4449 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4452 struct inode *inode = de->d_inode;
4453 struct ll_sb_info *sbi = ll_i2sbi(inode);
4454 struct ll_inode_info *lli = ll_i2info(inode);
4457 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4459 rc = ll_inode_revalidate(de, IT_GETATTR);
4463 if (S_ISREG(inode->i_mode)) {
4464 /* In case of restore, the MDT has the right size and has
4465 * already sent it back without granting the layout lock;
4466 * the inode is up-to-date, so a glimpse is useless.
4467 * Also, to glimpse we need the layout; in case of a running
4468 * restore the MDT holds the layout lock, so the glimpse will
4469 * block until the end of restore (getattr will block)
4471 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4472 rc = ll_glimpse_size(inode);
4477 /* If the object isn't a regular file, don't validate its size. */
4478 if (S_ISDIR(inode->i_mode) &&
4479 lli->lli_lsm_md != NULL) {
4480 rc = ll_merge_md_attr(inode);
4485 LTIME_S(inode->i_atime) = lli->lli_atime;
4486 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4487 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4490 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4492 if (ll_need_32bit_api(sbi)) {
4493 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4494 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4495 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4497 stat->ino = inode->i_ino;
4498 stat->dev = inode->i_sb->s_dev;
4499 stat->rdev = inode->i_rdev;
4502 stat->mode = inode->i_mode;
4503 stat->uid = inode->i_uid;
4504 stat->gid = inode->i_gid;
4505 stat->atime = inode->i_atime;
4506 stat->mtime = inode->i_mtime;
4507 stat->ctime = inode->i_ctime;
4508 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4510 stat->nlink = inode->i_nlink;
4511 stat->size = i_size_read(inode);
4512 stat->blocks = inode->i_blocks;
4517 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4518 __u64 start, __u64 len)
4522 struct fiemap *fiemap;
4523 unsigned int extent_count = fieinfo->fi_extents_max;
4525 num_bytes = sizeof(*fiemap) + (extent_count *
4526 sizeof(struct fiemap_extent));
4527 OBD_ALLOC_LARGE(fiemap, num_bytes);
4532 fiemap->fm_flags = fieinfo->fi_flags;
4533 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4534 fiemap->fm_start = start;
4535 fiemap->fm_length = len;
4536 if (extent_count > 0 &&
4537 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4538 sizeof(struct fiemap_extent)) != 0)
4539 GOTO(out, rc = -EFAULT);
4541 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4543 fieinfo->fi_flags = fiemap->fm_flags;
4544 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4545 if (extent_count > 0 &&
4546 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4547 fiemap->fm_mapped_extents *
4548 sizeof(struct fiemap_extent)) != 0)
4549 GOTO(out, rc = -EFAULT);
4551 OBD_FREE_LARGE(fiemap, num_bytes);
4555 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4557 struct ll_inode_info *lli = ll_i2info(inode);
4558 struct posix_acl *acl = NULL;
4561 spin_lock(&lli->lli_lock);
4562 /* VFS' acl_permission_check->check_acl will release the refcount */
4563 acl = posix_acl_dup(lli->lli_posix_acl);
4564 spin_unlock(&lli->lli_lock);
4569 #ifdef HAVE_IOP_SET_ACL
4570 #ifdef CONFIG_FS_POSIX_ACL
4571 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4573 struct ll_sb_info *sbi = ll_i2sbi(inode);
4574 struct ptlrpc_request *req = NULL;
4575 const char *name = NULL;
4577 size_t value_size = 0;
4582 case ACL_TYPE_ACCESS:
4583 name = XATTR_NAME_POSIX_ACL_ACCESS;
4585 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4588 case ACL_TYPE_DEFAULT:
4589 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4590 if (!S_ISDIR(inode->i_mode))
4591 rc = acl ? -EACCES : 0;
4602 value_size = posix_acl_xattr_size(acl->a_count);
4603 value = kmalloc(value_size, GFP_NOFS);
4605 GOTO(out, rc = -ENOMEM);
4607 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4609 GOTO(out_value, rc);
4612 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4613 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4614 name, value, value_size, 0, 0, &req);
4616 ptlrpc_req_finished(req);
4621 forget_cached_acl(inode, type);
4623 set_cached_acl(inode, type, acl);
4626 #endif /* CONFIG_FS_POSIX_ACL */
4627 #endif /* HAVE_IOP_SET_ACL */
4629 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4631 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4632 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4634 ll_check_acl(struct inode *inode, int mask)
4637 # ifdef CONFIG_FS_POSIX_ACL
4638 struct posix_acl *acl;
4642 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4643 if (flags & IPERM_FLAG_RCU)
4646 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4651 rc = posix_acl_permission(inode, acl, mask);
4652 posix_acl_release(acl);
4655 # else /* !CONFIG_FS_POSIX_ACL */
4657 # endif /* CONFIG_FS_POSIX_ACL */
4659 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4661 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4662 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4664 # ifdef HAVE_INODE_PERMISION_2ARGS
4665 int ll_inode_permission(struct inode *inode, int mask)
4667 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4672 struct ll_sb_info *sbi;
4673 struct root_squash_info *squash;
4674 struct cred *cred = NULL;
4675 const struct cred *old_cred = NULL;
4677 bool squash_id = false;
4680 #ifdef MAY_NOT_BLOCK
4681 if (mask & MAY_NOT_BLOCK)
4683 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4684 if (flags & IPERM_FLAG_RCU)
4688 /* as the root inode is NOT validated by the lookup operation,
4689 * we need to do it before the permission check. */
4691 if (inode == inode->i_sb->s_root->d_inode) {
4692 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4697 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4698 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4700 /* squash fsuid/fsgid if needed */
4701 sbi = ll_i2sbi(inode);
4702 squash = &sbi->ll_squash;
4703 if (unlikely(squash->rsi_uid != 0 &&
4704 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4705 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4709 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4710 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4711 squash->rsi_uid, squash->rsi_gid);
4713 /* update current process's credentials
4714 * and FS capability */
4715 cred = prepare_creds();
4719 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4720 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4721 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4722 if ((1 << cap) & CFS_CAP_FS_MASK)
4723 cap_lower(cred->cap_effective, cap);
4725 old_cred = override_creds(cred);
4728 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4729 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4730 /* restore current process's credentials and FS capability */
4732 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_flock,
	.lock		= ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_noflock,
	.lock		= ll_file_noflock
};
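/*
 * Illustrative mapping (a sketch; the actual selection happens in the mount
 * code, not in this file): the flock-related mount option determines which
 * operations table a client installs:
 *
 *	mount -t lustre -o flock      mgs:/fs /mnt  -> ll_file_operations_flock
 *	mount -t lustre -o localflock mgs:/fs /mnt  -> ll_file_operations
 *	mount -t lustre -o noflock    mgs:/fs /mnt  -> ll_file_operations_noflock
 */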
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.removexattr	= ll_removexattr,
#endif
	.listxattr	= ll_listxattr,
	.fiemap		= ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl	= ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
	.set_acl	= ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct lu_env *env;
	int rc;
	__u16 refcheck;
	ENTRY;

	if (obj == NULL)
		RETURN(0);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);
	if (rc < 0)
		GOTO(out, rc);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* matching can only be allowed after the layout has been
		 * applied to the inode, otherwise a stale layout could be
		 * seen. Applying the layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);
		if (rc < 0)
			GOTO(out, rc);

		CDEBUG(D_VFSTRACE,
		       DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}

out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
/* Fetch the layout from the MDT with a getxattr request if it is not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* if the layout lock was granted right away, the layout is returned
	 * within the DLM_LVB of the DLM reply; otherwise, if the lock was
	 * ever blocked and then granted via completion AST, we have to fetch
	 * the layout here. Note that we can't use the LVB buffer in the
	 * completion AST because it isn't large enough. */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
				 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
	if (rc < 0)
		RETURN(rc);

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->mbo_eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);

	if (lvbdata)
		OBD_FREE_LARGE(lvbdata, lmmsize);

	EXIT;
out:
	ptlrpc_req_finished(req);
	return rc;
}
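/*
 * Illustrative note (not from the original file): the locked double-check
 * in ll_layout_fetch() resolves two threads racing to install the layout
 * LVB on the same lock:
 *
 *	thread A: sees l_lvb_data == NULL, installs lvbdata, sets lvbdata=NULL
 *	thread B: sees l_lvb_data != NULL, skips install, frees its own copy
 *
 * so exactly one buffer ends up attached to the lock and nothing leaks.
 */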
/**
 * Apply the layout to the inode. The layout lock is held and will be
 * released in this function.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;
	ENTRY;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a caching lock, reinstate it with the new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy, but that is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
	if (lvb_ready)
		GOTO(out, rc = 0);

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for a layout lock, lmm is stored in the lock's LVB.
	 * lvb_data is immutable while the lock is held, so it's safe to
	 * access it without the res lock.
	 *
	 * set the layout on the file. This is unlikely to fail, as the old
	 * layout has surely been eliminated. */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refreshing the layout failed, need to wait */
	wait_layout = rc == -EBUSY;
	EXIT;
out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if the layout is still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
	}
	RETURN(rc);
}
/**
 * Issue a layout intent RPC to the MDS.
 * \param inode	 [in]	file inode
 * \param intent [in]	layout intent
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set the lock data in case this is a new lock */
	if (rc == 0)
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);

	RETURN(rc);
}
/**
 * This function checks whether a LAYOUT lock exists on the client side,
 * and enqueues one if none is cached.
 *
 * This function does not hold the layout lock, so the lock may be revoked
 * any time after this function returns. Any operation that depends on the
 * layout should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after the
 * IO has finished, call this function again to verify that the layout did
 * not change during the IO.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;
	int rc;
	ENTRY;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take the layout lock mutex to enqueue the layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* the layout lock is usually cached on the local side, so
		 * try to match it first. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	if (rc == 0)
		*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
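/*
 * Illustrative caller pattern for ll_layout_refresh(), a sketch following
 * the comment above (variable names are hypothetical):
 *
 *	__u32 gen_before, gen_after;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc == 0) {
 *		... initialize and run the IO (e.g. via lov_io_init()) ...
 *		rc = ll_layout_refresh(inode, &gen_after);
 *		if (rc == 0 && gen_before != gen_after)
 *			... layout changed during IO, redo the operation ...
 *	}
 */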
/**
 * Issue a layout intent RPC indicating where in a file an IO is about to
 * write.
 *
 * \param[in] inode	file inode.
 * \param[in] ext	write range with start offset of file in bytes where
 *			an IO is about to write, and exclusive end offset in
 *			bytes.
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};
	int rc;
	ENTRY;

	rc = ll_layout_intent(inode, &intent);

	RETURN(rc);
}
/**
 * This function sends a restore request to the MDT
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;
	int len, rc;
	ENTRY;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
			   len, hur, NULL);
	OBD_FREE(hur, len);
	RETURN(rc);
}
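/*
 * Illustrative note (not from the original file): ll_layout_restore() builds
 * the same kind of single-item HUA_RESTORE request that a user can trigger
 * from the command line, e.g.:
 *
 *	lfs hsm_restore /mnt/lustre/file
 *
 * except that here the extent is set to [offset, offset + length) as
 * requested by the caller rather than covering the whole file.
 */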