4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to swap layouts with.
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak the openhandle and request here on error, but there is not much
155 * to be done in the OOM case since the app won't retry the close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
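/* fallthrough: MDS_CLOSE_LAYOUT_SPLIT and MDS_CLOSE_LAYOUT_SWAP share the lease handle setup below */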
166 case MDS_CLOSE_LAYOUT_SPLIT:
167 case MDS_CLOSE_LAYOUT_SWAP: {
168 struct split_param *sp = data;
170 LASSERT(data != NULL);
171 op_data->op_bias |= bias;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
174 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
175 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
176 op_data->op_mirror_id = sp->sp_mirror_id;
178 op_data->op_fid2 = *ll_inode2fid(data);
183 case MDS_CLOSE_RESYNC_DONE: {
184 struct ll_ioc_lease *ioc = data;
186 LASSERT(data != NULL);
187 op_data->op_attr_blocks +=
188 ioc->lil_count * op_data->op_attr_blocks;
189 op_data->op_attr.ia_valid |= ATTR_SIZE;
190 op_data->op_xvalid |= OP_XVALID_BLOCKS;
191 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_data = &ioc->lil_ids[0];
195 op_data->op_data_size =
196 ioc->lil_count * sizeof(ioc->lil_ids[0]);
200 case MDS_PCC_ATTACH: {
201 struct pcc_param *param = data;
203 LASSERT(data != NULL);
204 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
205 op_data->op_archive_id = param->pa_archive_id;
206 op_data->op_data_version = param->pa_data_version;
207 op_data->op_lease_handle = och->och_lease_handle;
211 case MDS_HSM_RELEASE:
212 LASSERT(data != NULL);
213 op_data->op_bias |= MDS_HSM_RELEASE;
214 op_data->op_data_version = *(__u64 *)data;
215 op_data->op_lease_handle = och->och_lease_handle;
216 op_data->op_attr.ia_valid |= ATTR_SIZE;
217 op_data->op_xvalid |= OP_XVALID_BLOCKS;
221 LASSERT(data == NULL);
225 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
226 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
227 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
228 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
230 rc = md_close(md_exp, op_data, och->och_mod, &req);
231 if (rc != 0 && rc != -EINTR)
232 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
233 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
235 if (rc == 0 && op_data->op_bias & bias) {
236 struct mdt_body *body;
238 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
239 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 if (bias & MDS_PCC_ATTACH) {
243 struct pcc_param *param = data;
245 param->pa_layout_gen = body->mbo_layout_gen;
249 ll_finish_md_op_data(op_data);
253 md_clear_open_replay_data(md_exp, och);
254 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
257 ptlrpc_req_finished(req); /* This is close request */
261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
263 struct ll_inode_info *lli = ll_i2info(inode);
264 struct obd_client_handle **och_p;
265 struct obd_client_handle *och;
270 if (fmode & FMODE_WRITE) {
271 och_p = &lli->lli_mds_write_och;
272 och_usecount = &lli->lli_open_fd_write_count;
273 } else if (fmode & FMODE_EXEC) {
274 och_p = &lli->lli_mds_exec_och;
275 och_usecount = &lli->lli_open_fd_exec_count;
277 LASSERT(fmode & FMODE_READ);
278 och_p = &lli->lli_mds_read_och;
279 och_usecount = &lli->lli_open_fd_read_count;
282 mutex_lock(&lli->lli_och_mutex);
283 if (*och_usecount > 0) {
284 /* There are still users of this handle, so skip closing it. */
286 mutex_unlock(&lli->lli_och_mutex);
292 mutex_unlock(&lli->lli_och_mutex);
295 /* There might be a race and this handle may already be closed. */
297 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
303 static int ll_md_close(struct inode *inode, struct file *file)
305 union ldlm_policy_data policy = {
306 .l_inodebits = { MDS_INODELOCK_OPEN },
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310 struct ll_inode_info *lli = ll_i2info(inode);
311 struct lustre_handle lockh;
312 enum ldlm_mode lockmode;
316 /* clear group lock, if present */
317 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
318 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
320 if (fd->fd_lease_och != NULL) {
323 /* Usually the lease is not released when the
324 * application crashes, so we need to release it here. */
325 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
326 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
327 PFID(&lli->lli_fid), rc, lease_broken);
329 fd->fd_lease_och = NULL;
332 if (fd->fd_och != NULL) {
333 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
338 /* Let's see if we have a good enough OPEN lock on the file and whether
339 we can skip talking to the MDS */
340 mutex_lock(&lli->lli_och_mutex);
341 if (fd->fd_omode & FMODE_WRITE) {
343 LASSERT(lli->lli_open_fd_write_count);
344 lli->lli_open_fd_write_count--;
345 } else if (fd->fd_omode & FMODE_EXEC) {
347 LASSERT(lli->lli_open_fd_exec_count);
348 lli->lli_open_fd_exec_count--;
351 LASSERT(lli->lli_open_fd_read_count);
352 lli->lli_open_fd_read_count--;
354 mutex_unlock(&lli->lli_och_mutex);
356 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
357 LDLM_IBITS, &policy, lockmode, &lockh))
358 rc = ll_md_real_close(inode, fd->fd_omode);
361 LUSTRE_FPRIVATE(file) = NULL;
362 ll_file_data_put(fd);
367 /* While this returns an error code, fput(), the caller, does not, so we need
368 * to make every effort to clean up all of our state here. Also, applications
369 * rarely check close errors, and even if an error is returned they will not
370 * re-try the close call.
372 int ll_file_release(struct inode *inode, struct file *file)
374 struct ll_file_data *fd;
375 struct ll_sb_info *sbi = ll_i2sbi(inode);
376 struct ll_inode_info *lli = ll_i2info(inode);
380 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
381 PFID(ll_inode2fid(inode)), inode);
383 if (inode->i_sb->s_root != file_dentry(file))
384 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
385 fd = LUSTRE_FPRIVATE(file);
388 /* The last ref on @file, maybe not the owner pid of statahead,
389 * because parent and child processes can share the same file handle. */
390 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
391 ll_deauthorize_statahead(inode, fd);
393 if (inode->i_sb->s_root == file_dentry(file)) {
394 LUSTRE_FPRIVATE(file) = NULL;
395 ll_file_data_put(fd);
399 pcc_file_release(inode, file);
401 if (!S_ISDIR(inode->i_mode)) {
402 if (lli->lli_clob != NULL)
403 lov_read_and_clear_async_rc(lli->lli_clob);
404 lli->lli_async_rc = 0;
407 rc = ll_md_close(inode, file);
409 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
410 libcfs_debug_dumplog();
415 static inline int ll_dom_readpage(void *data, struct page *page)
417 struct niobuf_local *lnb = data;
420 kaddr = ll_kmap_atomic(page, KM_USER0);
421 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
422 if (lnb->lnb_len < PAGE_SIZE)
423 memset(kaddr + lnb->lnb_len, 0,
424 PAGE_SIZE - lnb->lnb_len);
425 flush_dcache_page(page);
426 SetPageUptodate(page);
427 ll_kunmap_atomic(kaddr, KM_USER0);
433 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
434 struct lookup_intent *it)
436 struct ll_inode_info *lli = ll_i2info(inode);
437 struct cl_object *obj = lli->lli_clob;
438 struct address_space *mapping = inode->i_mapping;
440 struct niobuf_remote *rnb;
442 unsigned long index, start;
443 struct niobuf_local lnb;
450 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
454 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
455 if (rnb == NULL || rnb->rnb_len == 0)
458 /* LU-11595: The server may return the whole file (which is always OK) or
459 * it may return just the file tail, whose offset must be aligned with the
460 * client PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE
461 * is smaller, then the offset may not be aligned and that data is just ignored.
463 if (rnb->rnb_offset % PAGE_SIZE)
466 /* The server returns the whole file, or just the file tail if the whole file does not fit into the reply buffer;
467 * in both cases the returned range should end at the inode size.
469 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
470 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
471 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
472 rnb->rnb_len, i_size_read(inode));
476 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
477 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
479 data = (char *)rnb + sizeof(*rnb);
481 lnb.lnb_file_offset = rnb->rnb_offset;
482 start = lnb.lnb_file_offset / PAGE_SIZE;
484 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
485 lnb.lnb_page_offset = 0;
487 lnb.lnb_data = data + (index << PAGE_SHIFT);
488 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
489 if (lnb.lnb_len > PAGE_SIZE)
490 lnb.lnb_len = PAGE_SIZE;
492 vmpage = read_cache_page(mapping, index + start,
493 ll_dom_readpage, &lnb);
494 if (IS_ERR(vmpage)) {
495 CWARN("%s: cannot fill page %lu for "DFID
496 " with data: rc = %li\n",
497 ll_i2sbi(inode)->ll_fsname, index + start,
498 PFID(lu_object_fid(&obj->co_lu)),
504 } while (rnb->rnb_len > (index << PAGE_SHIFT));
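/*
 * Worked example of the copy loop above (assuming a 4096-byte PAGE_SIZE):
 * with rnb_offset == 0 and rnb_len == 9000, the loop fills page 0 with
 * bytes 0..4095, page 1 with bytes 4096..8191, and page 2 with the
 * remaining 808 bytes; ll_dom_readpage() zero-fills the rest of that
 * last page before marking it uptodate.
 */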
508 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
509 struct lookup_intent *itp)
511 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
512 struct dentry *parent = de->d_parent;
515 struct md_op_data *op_data;
516 struct ptlrpc_request *req = NULL;
520 LASSERT(parent != NULL);
521 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
523 /* if the server supports open-by-fid, or the file name is invalid, don't
524 * pack the name in the open request */
525 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
526 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
528 len = de->d_name.len;
529 name = kmalloc(len + 1, GFP_NOFS);
534 spin_lock(&de->d_lock);
535 if (len != de->d_name.len) {
536 spin_unlock(&de->d_lock);
540 memcpy(name, de->d_name.name, len);
542 spin_unlock(&de->d_lock);
544 if (!lu_name_is_valid_2(name, len)) {
550 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
551 name, len, 0, LUSTRE_OPC_ANY, NULL);
552 if (IS_ERR(op_data)) {
554 RETURN(PTR_ERR(op_data));
556 op_data->op_data = lmm;
557 op_data->op_data_size = lmmsize;
559 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
560 &ll_md_blocking_ast, 0);
562 ll_finish_md_op_data(op_data);
564 /* The reason for keeping our own exit path is to avoid flooding the log
565 * with -ESTALE error messages.
567 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
568 it_open_error(DISP_OPEN_OPEN, itp))
570 ll_release_openhandle(de, itp);
574 if (it_disposition(itp, DISP_LOOKUP_NEG))
575 GOTO(out, rc = -ENOENT);
577 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
578 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
579 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
583 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
585 if (!rc && itp->it_lock_mode) {
586 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
587 struct ldlm_lock *lock;
588 bool has_dom_bit = false;
590 /* If we got a lock back and it has a LOOKUP bit set,
591 * make sure the dentry is marked as valid so we can find it.
592 * We don't need to care about actual hashing since other parts
593 * of the kernel will deal with that later.
595 lock = ldlm_handle2lock(&handle);
597 has_dom_bit = ldlm_has_dom(lock);
598 if (lock->l_policy_data.l_inodebits.bits &
599 MDS_INODELOCK_LOOKUP)
600 d_lustre_revalidate(de);
604 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
606 ll_dom_finish_open(de->d_inode, req, itp);
610 ptlrpc_req_finished(req);
611 ll_intent_drop_lock(itp);
613 /* We did open by fid, but by the time we got to the server,
614 * the object disappeared. If this is a create, we cannot really
615 * tell userspace that the file it was trying to create
616 * does not exist. Instead let's return -ESTALE, and the VFS will
617 * retry the create with LOOKUP_REVAL, which we are going to catch
618 * in ll_revalidate_dentry() and then fall back to lookup.
620 if (rc == -ENOENT && itp->it_op & IT_CREAT)
626 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
627 struct obd_client_handle *och)
629 struct mdt_body *body;
631 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
632 och->och_open_handle = body->mbo_open_handle;
633 och->och_fid = body->mbo_fid1;
634 och->och_lease_handle.cookie = it->it_lock_handle;
635 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
636 och->och_flags = it->it_flags;
638 return md_set_open_replay_data(md_exp, och, it);
641 static int ll_local_open(struct file *file, struct lookup_intent *it,
642 struct ll_file_data *fd, struct obd_client_handle *och)
644 struct inode *inode = file_inode(file);
647 LASSERT(!LUSTRE_FPRIVATE(file));
654 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
659 LUSTRE_FPRIVATE(file) = fd;
660 ll_readahead_init(inode, &fd->fd_ras);
661 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
663 /* ll_cl_context initialize */
664 rwlock_init(&fd->fd_lock);
665 INIT_LIST_HEAD(&fd->fd_lccs);
670 /* Open a file, and (for the very first open) create objects on the OSTs at
671 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
672 * creation or open until ll_lov_setstripe() ioctl is called.
674 * If we already have the stripe MD locally then we don't request it in
675 * md_open(), by passing a lmm_size = 0.
677 * It is up to the application to ensure no other processes open this file
678 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
679 * used. We might be able to avoid races of that sort by getting lli_open_sem
680 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
681 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
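 */

/*
 * For illustration only (not part of the original file): a minimal
 * user-space sketch of the O_LOV_DELAY_CREATE flow described above.
 * The application opens the file with O_LOV_DELAY_CREATE so that no OST
 * objects are created, then sets the striping itself with
 * LL_IOC_LOV_SETSTRIPE before the first write; error handling is
 * omitted and the stripe values are arbitrary. The uapi header path may
 * differ between Lustre releases.
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/lustre/lustre_user.h>
 *
 *	int fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
 *	struct lov_user_md_v1 lum = {
 *		.lmm_magic	   = LOV_USER_MAGIC_V1,
 *		.lmm_pattern	   = LOV_PATTERN_RAID0,
 *		.lmm_stripe_size   = 1048576,		// 1 MiB stripes
 *		.lmm_stripe_count  = 4,
 *		.lmm_stripe_offset = (__u16)-1,		// let the MDS choose
 *	};
 *	ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);		// objects created here
 */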
683 int ll_file_open(struct inode *inode, struct file *file)
685 struct ll_inode_info *lli = ll_i2info(inode);
686 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
687 .it_flags = file->f_flags };
688 struct obd_client_handle **och_p = NULL;
689 __u64 *och_usecount = NULL;
690 struct ll_file_data *fd;
694 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
695 PFID(ll_inode2fid(inode)), inode, file->f_flags);
697 it = file->private_data; /* XXX: compat macro */
698 file->private_data = NULL; /* prevent ll_local_open assertion */
700 fd = ll_file_data_get();
702 GOTO(out_nofiledata, rc = -ENOMEM);
705 if (S_ISDIR(inode->i_mode))
706 ll_authorize_statahead(inode, fd);
708 if (inode->i_sb->s_root == file_dentry(file)) {
709 LUSTRE_FPRIVATE(file) = fd;
713 if (!it || !it->it_disposition) {
714 /* Convert f_flags into access mode. We cannot use file->f_mode,
715 * because everything but the O_ACCMODE mask was stripped from there. */
717 if ((oit.it_flags + 1) & O_ACCMODE)
719 if (file->f_flags & O_TRUNC)
720 oit.it_flags |= FMODE_WRITE;
722 /* The kernel only calls f_op->open in dentry_open. filp_open calls
723 * dentry_open after a call to open_namei that checks permissions.
724 * Only nfsd_open calls dentry_open directly without checking
725 * permissions, and because of that the code below is safe.
727 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
728 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
730 /* We do not want O_EXCL here, presumably we opened the file
731 * already? XXX - NFS implications? */
732 oit.it_flags &= ~O_EXCL;
734 /* bug20584: if "it_flags" contains O_CREAT, the file will be
735 * created if necessary, so "IT_CREAT" should be set to keep
736 * consistent with it */
737 if (oit.it_flags & O_CREAT)
738 oit.it_op |= IT_CREAT;
744 /* Let's see if we have file open on MDS already. */
745 if (it->it_flags & FMODE_WRITE) {
746 och_p = &lli->lli_mds_write_och;
747 och_usecount = &lli->lli_open_fd_write_count;
748 } else if (it->it_flags & FMODE_EXEC) {
749 och_p = &lli->lli_mds_exec_och;
750 och_usecount = &lli->lli_open_fd_exec_count;
752 och_p = &lli->lli_mds_read_och;
753 och_usecount = &lli->lli_open_fd_read_count;
756 mutex_lock(&lli->lli_och_mutex);
757 if (*och_p) { /* Open handle is present */
758 if (it_disposition(it, DISP_OPEN_OPEN)) {
759 /* Well, there's an extra open request that we do not need,
760 so let's close it somehow. This will decref the request. */
761 rc = it_open_error(DISP_OPEN_OPEN, it);
763 mutex_unlock(&lli->lli_och_mutex);
764 GOTO(out_openerr, rc);
767 ll_release_openhandle(file_dentry(file), it);
771 rc = ll_local_open(file, it, fd, NULL);
774 mutex_unlock(&lli->lli_och_mutex);
775 GOTO(out_openerr, rc);
778 LASSERT(*och_usecount == 0);
779 if (!it->it_disposition) {
780 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
781 /* We cannot just request a lock handle now; the new ELC code
782 means that one of the other OPEN locks for this file
783 could be cancelled, and since the blocking AST handler
784 would attempt to grab och_mutex as well, that would
785 result in a deadlock */
786 mutex_unlock(&lli->lli_och_mutex);
788 * Normally called under two situations:
790 * 2. A race/condition on MDS resulting in no open
791 * handle to be returned from LOOKUP|OPEN request,
792 * for example if the target entry was a symlink.
794 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
795 * marked by a bit set in ll_iget_for_nfs. Clear the
796 * bit so that it's not confusing later callers.
798 * NB: when ldd is NULL, it must have come via the normal
799 * lookup path only, since ll_iget_for_nfs always calls
802 if (ldd && ldd->lld_nfs_dentry) {
803 ldd->lld_nfs_dentry = 0;
804 it->it_flags |= MDS_OPEN_LOCK;
808 * Always specify MDS_OPEN_BY_FID because we don't want
809 * to get a file with a different fid.
811 it->it_flags |= MDS_OPEN_BY_FID;
812 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
815 GOTO(out_openerr, rc);
819 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
821 GOTO(out_och_free, rc = -ENOMEM);
825 /* md_intent_lock() didn't get a request ref if there was an
826 * open error, so don't do cleanup on the request here
828 /* XXX (green): Shouldn't we bail out on any error here, not
829 * just an open error? */
830 rc = it_open_error(DISP_OPEN_OPEN, it);
832 GOTO(out_och_free, rc);
834 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
835 "inode %p: disposition %x, status %d\n", inode,
836 it_disposition(it, ~0), it->it_status);
838 rc = ll_local_open(file, it, fd, *och_p);
840 GOTO(out_och_free, rc);
843 rc = pcc_file_open(inode, file);
845 GOTO(out_och_free, rc);
847 mutex_unlock(&lli->lli_och_mutex);
850 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where
851 a different kind of OPEN lock for this same inode gets cancelled
852 by ldlm_cancel_lru */
853 if (!S_ISREG(inode->i_mode))
854 GOTO(out_och_free, rc);
856 cl_lov_delay_create_clear(&file->f_flags);
857 GOTO(out_och_free, rc);
861 if (och_p && *och_p) {
862 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
863 *och_p = NULL; /* OBD_FREE writes some magic there */
866 mutex_unlock(&lli->lli_och_mutex);
869 if (lli->lli_opendir_key == fd)
870 ll_deauthorize_statahead(inode, fd);
873 ll_file_data_put(fd);
875 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
879 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
880 ptlrpc_req_finished(it->it_request);
881 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
887 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
888 struct ldlm_lock_desc *desc, void *data, int flag)
891 struct lustre_handle lockh;
895 case LDLM_CB_BLOCKING:
896 ldlm_lock2handle(lock, &lockh);
897 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
899 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
903 case LDLM_CB_CANCELING:
911 * When setting a lease on a file, we take ownership of the lli_mds_*_och
912 * and save it as fd->fd_och so as to force client to reopen the file even
913 * if it has an open lock in cache already.
915 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
916 struct lustre_handle *old_open_handle)
918 struct ll_inode_info *lli = ll_i2info(inode);
919 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
920 struct obd_client_handle **och_p;
925 /* Get the openhandle of the file */
926 mutex_lock(&lli->lli_och_mutex);
927 if (fd->fd_lease_och != NULL)
928 GOTO(out_unlock, rc = -EBUSY);
930 if (fd->fd_och == NULL) {
931 if (file->f_mode & FMODE_WRITE) {
932 LASSERT(lli->lli_mds_write_och != NULL);
933 och_p = &lli->lli_mds_write_och;
934 och_usecount = &lli->lli_open_fd_write_count;
936 LASSERT(lli->lli_mds_read_och != NULL);
937 och_p = &lli->lli_mds_read_och;
938 och_usecount = &lli->lli_open_fd_read_count;
941 if (*och_usecount > 1)
942 GOTO(out_unlock, rc = -EBUSY);
949 *old_open_handle = fd->fd_och->och_open_handle;
953 mutex_unlock(&lli->lli_och_mutex);
958 * Release ownership on lli_mds_*_och when putting back a file lease.
960 static int ll_lease_och_release(struct inode *inode, struct file *file)
962 struct ll_inode_info *lli = ll_i2info(inode);
963 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
964 struct obd_client_handle **och_p;
965 struct obd_client_handle *old_och = NULL;
970 mutex_lock(&lli->lli_och_mutex);
971 if (file->f_mode & FMODE_WRITE) {
972 och_p = &lli->lli_mds_write_och;
973 och_usecount = &lli->lli_open_fd_write_count;
975 och_p = &lli->lli_mds_read_och;
976 och_usecount = &lli->lli_open_fd_read_count;
979 /* The file may have been opened by another process (broken lease) so
980 * *och_p is not NULL. In this case we should simply increase the usecount
983 if (*och_p != NULL) {
984 old_och = fd->fd_och;
991 mutex_unlock(&lli->lli_och_mutex);
994 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1000 * Acquire a lease and open the file.
1002 static struct obd_client_handle *
1003 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1006 struct lookup_intent it = { .it_op = IT_OPEN };
1007 struct ll_sb_info *sbi = ll_i2sbi(inode);
1008 struct md_op_data *op_data;
1009 struct ptlrpc_request *req = NULL;
1010 struct lustre_handle old_open_handle = { 0 };
1011 struct obd_client_handle *och = NULL;
1016 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1017 RETURN(ERR_PTR(-EINVAL));
1020 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1021 RETURN(ERR_PTR(-EPERM));
1023 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1025 RETURN(ERR_PTR(rc));
1030 RETURN(ERR_PTR(-ENOMEM));
1032 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1033 LUSTRE_OPC_ANY, NULL);
1034 if (IS_ERR(op_data))
1035 GOTO(out, rc = PTR_ERR(op_data));
1037 /* To tell the MDT this openhandle is from the same owner */
1038 op_data->op_open_handle = old_open_handle;
1040 it.it_flags = fmode | open_flags;
1041 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1042 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1043 &ll_md_blocking_lease_ast,
1044 /* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
1045 * it can be cancelled, which may mislead applications into thinking the lease is broken;
1047 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
1048 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast()
1049 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1050 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1051 ll_finish_md_op_data(op_data);
1052 ptlrpc_req_finished(req);
1054 GOTO(out_release_it, rc);
1056 if (it_disposition(&it, DISP_LOOKUP_NEG))
1057 GOTO(out_release_it, rc = -ENOENT);
1059 rc = it_open_error(DISP_OPEN_OPEN, &it);
1061 GOTO(out_release_it, rc);
1063 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1064 ll_och_fill(sbi->ll_md_exp, &it, och);
1066 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1067 GOTO(out_close, rc = -EOPNOTSUPP);
1069 /* lease already granted, handle the lease lock */
1070 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1071 if (it.it_lock_mode == 0 ||
1072 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1073 /* an open lock must be returned for a lease */
1074 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1075 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1077 GOTO(out_close, rc = -EPROTO);
1080 ll_intent_release(&it);
1084 /* Cancel open lock */
1085 if (it.it_lock_mode != 0) {
1086 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1088 it.it_lock_mode = 0;
1089 och->och_lease_handle.cookie = 0ULL;
1091 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1093 CERROR("%s: error closing file "DFID": %d\n",
1094 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1095 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1097 ll_intent_release(&it);
1101 RETURN(ERR_PTR(rc));
1105 * Check whether a layout swap can be done between two inodes.
1107 * \param[in] inode1 First inode to check
1108 * \param[in] inode2 Second inode to check
1110 * \retval 0 on success, layout swap can be performed between both inodes
1111 * \retval negative error code if requirements are not met
1113 static int ll_check_swap_layouts_validity(struct inode *inode1,
1114 struct inode *inode2)
1116 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1119 if (inode_permission(inode1, MAY_WRITE) ||
1120 inode_permission(inode2, MAY_WRITE))
1123 if (inode1->i_sb != inode2->i_sb)
1129 static int ll_swap_layouts_close(struct obd_client_handle *och,
1130 struct inode *inode, struct inode *inode2)
1132 const struct lu_fid *fid1 = ll_inode2fid(inode);
1133 const struct lu_fid *fid2;
1137 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1138 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1140 rc = ll_check_swap_layouts_validity(inode, inode2);
1142 GOTO(out_free_och, rc);
1144 /* We now know that inode2 is a lustre inode */
1145 fid2 = ll_inode2fid(inode2);
1147 rc = lu_fid_cmp(fid1, fid2);
1149 GOTO(out_free_och, rc = -EINVAL);
1151 /* Close the file and {swap,merge} layouts between inode & inode2.
1152 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1153 * because we still need it to pack l_remote_handle to MDT. */
1154 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1157 och = NULL; /* freed in ll_close_inode_openhandle() */
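/*
 * For illustration only: a hedged user-space sketch of requesting a layout
 * swap between two Lustre files through liblustreapi (assumed interface,
 * not defined in this file; depending on flags it may or may not route
 * through the lease-based close path above). Error handling is omitted.
 *
 *	#include <lustre/lustreapi.h>
 *
 *	int rc = llapi_swap_layouts("/mnt/lustre/file_a",
 *				    "/mnt/lustre/file_b",
 *				    0, 0, 0);
 */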
1167 * Release the lease and close the file.
1168 * It will check whether the lease has ever been broken.
1170 static int ll_lease_close_intent(struct obd_client_handle *och,
1171 struct inode *inode,
1172 bool *lease_broken, enum mds_op_bias bias,
1175 struct ldlm_lock *lock;
1176 bool cancelled = true;
1180 lock = ldlm_handle2lock(&och->och_lease_handle);
1182 lock_res_and_lock(lock);
1183 cancelled = ldlm_is_cancel(lock);
1184 unlock_res_and_lock(lock);
1185 LDLM_LOCK_PUT(lock);
1188 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1189 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1191 if (lease_broken != NULL)
1192 *lease_broken = cancelled;
1194 if (!cancelled && !bias)
1195 ldlm_cli_cancel(&och->och_lease_handle, 0);
1197 if (cancelled) { /* no need to execute the intent */
1202 rc = ll_close_inode_openhandle(inode, och, bias, data);
1206 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1209 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1213 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1215 static int ll_lease_file_resync(struct obd_client_handle *och,
1216 struct inode *inode, unsigned long arg)
1218 struct ll_sb_info *sbi = ll_i2sbi(inode);
1219 struct md_op_data *op_data;
1220 struct ll_ioc_lease_id ioc;
1221 __u64 data_version_unused;
1225 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1226 LUSTRE_OPC_ANY, NULL);
1227 if (IS_ERR(op_data))
1228 RETURN(PTR_ERR(op_data));
1230 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1234 /* before starting file resync, it's necessary to clean up the page cache
1235 * in client memory, otherwise once the layout version is increased,
1236 * writing back cached data will be denied by the OSTs. */
1237 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1241 op_data->op_lease_handle = och->och_lease_handle;
1242 op_data->op_mirror_id = ioc.lil_mirror_id;
1243 rc = md_file_resync(sbi->ll_md_exp, op_data);
1249 ll_finish_md_op_data(op_data);
1253 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1255 struct ll_inode_info *lli = ll_i2info(inode);
1256 struct cl_object *obj = lli->lli_clob;
1257 struct cl_attr *attr = vvp_env_thread_attr(env);
1265 ll_inode_size_lock(inode);
1267 /* Merge the timestamps most recently obtained from the MDS with the
1268 * timestamps obtained from the OSTs.
1270 * Do not overwrite atime of inode because it may be refreshed
1271 * by file_accessed() function. If the read was served by cache
1272 * data, there is no RPC to be sent so that atime may not be
1273 * transferred to OSTs at all. MDT only updates atime at close time
1274 * if it's at least 'mdd.*.atime_diff' older.
1275 * All in all, the atime in Lustre does not strictly comply with
1276 * POSIX. Solving this problem would require sending an RPC to the MDT for
1277 * each read, which would hurt performance.
1279 if (inode->i_atime.tv_sec < lli->lli_atime ||
1280 lli->lli_update_atime) {
1281 inode->i_atime.tv_sec = lli->lli_atime;
1282 lli->lli_update_atime = 0;
1284 inode->i_mtime.tv_sec = lli->lli_mtime;
1285 inode->i_ctime.tv_sec = lli->lli_ctime;
1287 mtime = inode->i_mtime.tv_sec;
1288 atime = inode->i_atime.tv_sec;
1289 ctime = inode->i_ctime.tv_sec;
1291 cl_object_attr_lock(obj);
1292 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1295 rc = cl_object_attr_get(env, obj, attr);
1296 cl_object_attr_unlock(obj);
1299 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1301 if (atime < attr->cat_atime)
1302 atime = attr->cat_atime;
1304 if (ctime < attr->cat_ctime)
1305 ctime = attr->cat_ctime;
1307 if (mtime < attr->cat_mtime)
1308 mtime = attr->cat_mtime;
1310 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1311 PFID(&lli->lli_fid), attr->cat_size);
1313 i_size_write(inode, attr->cat_size);
1314 inode->i_blocks = attr->cat_blocks;
1316 inode->i_mtime.tv_sec = mtime;
1317 inode->i_atime.tv_sec = atime;
1318 inode->i_ctime.tv_sec = ctime;
1321 ll_inode_size_unlock(inode);
1327 * Set designated mirror for I/O.
1329 * So far only read, write, and truncate can issue I/O to a
1330 * designated mirror.
1332 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1334 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1336 /* clear the layout version for generic (non-resync) I/O in case it carries
1337 * a stale layout version due to I/O restart */
1338 io->ci_layout_version = 0;
1340 /* FLR: disable non-delay for designated mirror I/O because obviously
1341 * only one mirror is available */
1342 if (fd->fd_designated_mirror > 0) {
1344 io->ci_designated_mirror = fd->fd_designated_mirror;
1345 io->ci_layout_version = fd->fd_layout_version;
1348 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1349 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1352 static bool file_is_noatime(const struct file *file)
1354 const struct vfsmount *mnt = file->f_path.mnt;
1355 const struct inode *inode = file_inode((struct file *)file);
1357 /* Adapted from file_accessed() and touch_atime().*/
1358 if (file->f_flags & O_NOATIME)
1361 if (inode->i_flags & S_NOATIME)
1364 if (IS_NOATIME(inode))
1367 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1370 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1373 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1379 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1381 struct inode *inode = file_inode(file);
1382 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1384 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1385 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1387 if (iot == CIT_WRITE) {
1388 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1389 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1390 file->f_flags & O_DIRECT ||
1393 io->ci_obj = ll_i2info(inode)->lli_clob;
1394 io->ci_lockreq = CILR_MAYBE;
1395 if (ll_file_nolock(file)) {
1396 io->ci_lockreq = CILR_NEVER;
1397 io->ci_no_srvlock = 1;
1398 } else if (file->f_flags & O_APPEND) {
1399 io->ci_lockreq = CILR_MANDATORY;
1401 io->ci_noatime = file_is_noatime(file);
1403 /* FLR: only use non-delay I/O for read as there is only one
1404 * available mirror for write. */
1405 io->ci_ndelay = !(iot == CIT_WRITE);
1407 ll_io_set_mirror(io, file);
1410 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1413 struct ll_inode_info *lli = ll_i2info(inode);
1414 struct ll_sb_info *sbi = ll_i2sbi(inode);
1415 enum obd_heat_type sample_type;
1416 enum obd_heat_type iobyte_type;
1417 __u64 now = ktime_get_real_seconds();
1419 if (!ll_sbi_has_file_heat(sbi) ||
1420 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1423 if (iot == CIT_READ) {
1424 sample_type = OBD_HEAT_READSAMPLE;
1425 iobyte_type = OBD_HEAT_READBYTE;
1426 } else if (iot == CIT_WRITE) {
1427 sample_type = OBD_HEAT_WRITESAMPLE;
1428 iobyte_type = OBD_HEAT_WRITEBYTE;
1433 spin_lock(&lli->lli_heat_lock);
1434 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1435 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1436 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1437 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1438 spin_unlock(&lli->lli_heat_lock);
1442 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1443 struct file *file, enum cl_io_type iot,
1444 loff_t *ppos, size_t count)
1446 struct vvp_io *vio = vvp_env_io(env);
1447 struct inode *inode = file_inode(file);
1448 struct ll_inode_info *lli = ll_i2info(inode);
1449 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1450 struct range_lock range;
1454 unsigned retried = 0;
1455 bool restarted = false;
1459 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1460 file_dentry(file)->d_name.name,
1461 iot == CIT_READ ? "read" : "write", *ppos, count);
1464 io = vvp_env_thread_io(env);
1465 ll_io_init(io, file, iot);
1466 io->ci_ndelay_tried = retried;
1468 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1469 bool range_locked = false;
1471 if (file->f_flags & O_APPEND)
1472 range_lock_init(&range, 0, LUSTRE_EOF);
1474 range_lock_init(&range, *ppos, *ppos + count - 1);
1476 vio->vui_fd = LUSTRE_FPRIVATE(file);
1477 vio->vui_io_subtype = args->via_io_subtype;
1479 switch (vio->vui_io_subtype) {
1481 vio->vui_iter = args->u.normal.via_iter;
1482 vio->vui_iocb = args->u.normal.via_iocb;
1483 /* Direct IO reads must also take the range lock,
1484 * or multiple reads will try to work on the same pages.
1485 * See LU-6227 for details. */
1486 if (((iot == CIT_WRITE) ||
1487 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1488 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1489 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1491 rc = range_lock(&lli->lli_write_tree, &range);
1495 range_locked = true;
1499 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1500 vio->u.splice.vui_flags = args->u.splice.via_flags;
1503 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1507 ll_cl_add(file, env, io, LCC_RW);
1508 rc = cl_io_loop(env, io);
1509 ll_cl_remove(file, env);
1512 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1514 range_unlock(&lli->lli_write_tree, &range);
1517 /* cl_io_rw_init() handled IO */
1521 if (io->ci_nob > 0) {
1522 result += io->ci_nob;
1523 count -= io->ci_nob;
1524 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1526 /* prepare IO restart */
1527 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1528 args->u.normal.via_iter = vio->vui_iter;
1531 cl_io_fini(env, io);
1534 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1535 file->f_path.dentry->d_name.name,
1536 iot, rc, result, io->ci_need_restart);
1538 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1540 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1541 file_dentry(file)->d_name.name,
1542 iot == CIT_READ ? "read" : "write",
1543 *ppos, count, result, rc);
1544 /* preserve the tried count for FLR */
1545 retried = io->ci_ndelay_tried;
1550 if (iot == CIT_READ) {
1552 ll_stats_ops_tally(ll_i2sbi(inode),
1553 LPROC_LL_READ_BYTES, result);
1554 } else if (iot == CIT_WRITE) {
1556 ll_stats_ops_tally(ll_i2sbi(inode),
1557 LPROC_LL_WRITE_BYTES, result);
1558 fd->fd_write_failed = false;
1559 } else if (result == 0 && rc == 0) {
1562 fd->fd_write_failed = true;
1564 fd->fd_write_failed = false;
1565 } else if (rc != -ERESTARTSYS) {
1566 fd->fd_write_failed = true;
1570 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1572 ll_heat_add(inode, iot, result);
1574 RETURN(result > 0 ? result : rc);
1578 * The purpose of fast read is to overcome per-I/O overhead and improve IOPS,
1579 * especially for small I/O.
1581 * To serve a read request, CLIO has to create and initialize a cl_io and
1582 * then request a DLM lock. This has turned out to have significant overhead
1583 * and affects the performance of small I/O dramatically.
1585 * It's not necessary to create a cl_io for each I/O. With the help of read
1586 * ahead, most of the pages being read are already in the memory cache and we can
1587 * read those pages directly because if the pages exist, the corresponding DLM
1588 * lock must exist, so the page content must be valid.
1590 * In the fast read implementation, llite speculatively finds and reads pages
1591 * in the memory cache. There are three scenarios for fast read:
1592 * - If the page exists and is uptodate, kernel VM will provide the data and
1593 * CLIO won't be intervened;
1594 * - If the page was brought into memory by read ahead, it will be exported
1595 * and read ahead parameters will be updated;
1596 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1597 * it will go back and invoke normal read, i.e., a cl_io will be created
1598 * and DLM lock will be requested.
1600 * POSIX compliance: the POSIX standard states that read is intended to be atomic.
1601 * The Lustre read implementation is in line with the Linux kernel read implementation,
1602 * and neither of them complies with the POSIX standard in this matter. Fast read
1603 * doesn't make the situation worse on a single node, but it may interleave write
1604 * results from multiple nodes due to short read handling in ll_file_aio_read().
1606 * \param env - lu_env
1607 * \param iocb - kiocb from kernel
1608 * \param iter - user space buffers where the data will be copied
1610 * \retval - number of bytes that have been read, or an error code if an error occurred.
1613 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1617 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1620 /* NB: we can't do direct IO for fast read because it will need a lock
1621 * to make IO engine happy. */
1622 if (iocb->ki_filp->f_flags & O_DIRECT)
1625 result = generic_file_read_iter(iocb, iter);
1627 /* If the first page is not in the cache, generic_file_read_iter() will
1628 * return -ENODATA.
1629 * See the corresponding code in ll_readpage(). */
1630 if (result == -ENODATA)
1634 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1635 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1636 LPROC_LL_READ_BYTES, result);
1643 * Read from a file (through the page cache).
1645 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1648 struct vvp_io_args *args;
1655 * Currently, when a PCC read fails, we do not fall back to the
1656 * normal read path; we just return the error.
1657 * The reason is that for RW-PCC, the file data may be modified
1658 * in the PCC and be inconsistent with the data on the OSTs (or the file
1659 * data may have been removed from the Lustre file system); in this
1660 * case, falling back to the normal read path may read the wrong data.
1662 * TODO: for RO-PCC (readonly PCC), fall back to the normal read
1663 * path: read data from the data copy on the OSTs.
1665 result = pcc_file_read_iter(iocb, to, &cached);
1669 ll_ras_enter(iocb->ki_filp);
1671 result = ll_do_fast_read(iocb, to);
1672 if (result < 0 || iov_iter_count(to) == 0)
1675 env = cl_env_get(&refcheck);
1677 return PTR_ERR(env);
1679 args = ll_env_args(env, IO_NORMAL);
1680 args->u.normal.via_iter = to;
1681 args->u.normal.via_iocb = iocb;
1683 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1684 &iocb->ki_pos, iov_iter_count(to));
1687 else if (result == 0)
1690 cl_env_put(env, &refcheck);
1696 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1697 * If a page is already in the page cache and dirty (and some other things -
1698 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1699 * write to it without doing a full I/O, because Lustre already knows about it
1700 * and will write it out. This saves a lot of processing time.
1702 * All writes here are within one page, so exclusion is handled by the page
1703 * lock on the vm page. We do not do tiny writes for writes which touch
1704 * multiple pages because it's very unlikely that multiple sequential pages
1705 * are already dirty.
1707 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1708 * and are unlikely to be to already-dirty pages.
1710 * Attribute updates are important here, we do them in ll_tiny_write_end.
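/*
 * Worked example of the limits above (assuming a 4096-byte PAGE_SIZE):
 * a 300-byte write at file offset 100 stays inside a single page and may
 * be handled as a tiny write; a 300-byte write at offset 4000 crosses a
 * page boundary, and a 4096-byte write fails the count check, so both of
 * those take the normal write path.
 */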
1712 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1714 ssize_t count = iov_iter_count(iter);
1715 struct file *file = iocb->ki_filp;
1716 struct inode *inode = file_inode(file);
1717 bool lock_inode = !IS_NOSEC(inode);
1722 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1723 * of function for why.
1725 if (count >= PAGE_SIZE ||
1726 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1729 if (unlikely(lock_inode))
1731 result = __generic_file_write_iter(iocb, iter);
1733 if (unlikely(lock_inode))
1734 inode_unlock(inode);
1736 /* If the page is not already dirty, ll_tiny_write_begin returns
1737 * -ENODATA. We continue on to normal write.
1739 if (result == -ENODATA)
1743 ll_heat_add(inode, CIT_WRITE, result);
1744 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1746 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1749 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1755 * Write to a file (through the page cache).
1757 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1759 struct vvp_io_args *args;
1761 ssize_t rc_tiny = 0, rc_normal;
1769 * When a PCC write fails, we usually do not fall back to the normal
1770 * write path; we just return the error. But there is a special case when
1771 * the returned error code is -ENOSPC due to running out of space on the PCC HSM
1772 * backend. In that case, it will fall back to the normal I/O path and
1773 * retry the I/O. As the file is in the HSM released state, it will restore
1774 * the file data to the OSTs first and redo the write again. The
1775 * restore process will revoke the layout lock and detach the file
1776 * from the PCC cache automatically.
1778 result = pcc_file_write_iter(iocb, from, &cached);
1779 if (cached && result != -ENOSPC && result != -EDQUOT)
1782 /* NB: we can't do direct IO for tiny writes because they use the page
1783 * cache, we can't do sync writes because tiny writes can't flush
1784 * pages, and we can't do append writes because we can't guarantee the
1785 * required DLM locks are held to protect file size.
1787 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1788 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1789 rc_tiny = ll_do_tiny_write(iocb, from);
1791 /* In case of error, go on and try the normal write; only stop if the tiny
1792 * write completed the I/O.
1794 if (iov_iter_count(from) == 0)
1795 GOTO(out, rc_normal = rc_tiny);
1797 env = cl_env_get(&refcheck);
1799 return PTR_ERR(env);
1801 args = ll_env_args(env, IO_NORMAL);
1802 args->u.normal.via_iter = from;
1803 args->u.normal.via_iocb = iocb;
1805 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1806 &iocb->ki_pos, iov_iter_count(from));
1808 /* On success, combine bytes written. */
1809 if (rc_tiny >= 0 && rc_normal > 0)
1810 rc_normal += rc_tiny;
1811 /* On error, only return error from normal write if tiny write did not
1812 * write any bytes. Otherwise return bytes written by tiny write.
1814 else if (rc_tiny > 0)
1815 rc_normal = rc_tiny;
1817 cl_env_put(env, &refcheck);
1822 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1824 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1826 static int ll_file_get_iov_count(const struct iovec *iov,
1827 unsigned long *nr_segs, size_t *count)
1832 for (seg = 0; seg < *nr_segs; seg++) {
1833 const struct iovec *iv = &iov[seg];
1836 * If any segment has a negative length, or the cumulative
1837 * length ever wraps negative then return -EINVAL.
1840 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1842 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1847 cnt -= iv->iov_len; /* This segment is no good */
1854 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1855 unsigned long nr_segs, loff_t pos)
1862 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1866 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1867 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1868 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1869 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1870 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1872 result = ll_file_read_iter(iocb, &to);
1877 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1880 struct iovec iov = { .iov_base = buf, .iov_len = count };
1885 init_sync_kiocb(&kiocb, file);
1886 kiocb.ki_pos = *ppos;
1887 #ifdef HAVE_KIOCB_KI_LEFT
1888 kiocb.ki_left = count;
1889 #elif defined(HAVE_KI_NBYTES)
1890 kiocb.ki_nbytes = count;
1893 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1894 *ppos = kiocb.ki_pos;
1900 * Write to a file (through the page cache).
1903 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1904 unsigned long nr_segs, loff_t pos)
1906 struct iov_iter from;
1911 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1915 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1916 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1917 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1918 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1919 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1921 result = ll_file_write_iter(iocb, &from);
1926 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1927 size_t count, loff_t *ppos)
1929 struct iovec iov = { .iov_base = (void __user *)buf,
1936 init_sync_kiocb(&kiocb, file);
1937 kiocb.ki_pos = *ppos;
1938 #ifdef HAVE_KIOCB_KI_LEFT
1939 kiocb.ki_left = count;
1940 #elif defined(HAVE_KI_NBYTES)
1941 kiocb.ki_nbytes = count;
1944 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1945 *ppos = kiocb.ki_pos;
1949 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1952 * Send file content (through pagecache) somewhere with helper
1954 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1955 struct pipe_inode_info *pipe, size_t count,
1959 struct vvp_io_args *args;
1966 result = pcc_file_splice_read(in_file, ppos, pipe,
1967 count, flags, &cached);
1971 ll_ras_enter(in_file);
1973 env = cl_env_get(&refcheck);
1975 RETURN(PTR_ERR(env));
1977 args = ll_env_args(env, IO_SPLICE);
1978 args->u.splice.via_pipe = pipe;
1979 args->u.splice.via_flags = flags;
1981 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1982 cl_env_put(env, &refcheck);
1986 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1987 __u64 flags, struct lov_user_md *lum, int lum_size)
1989 struct lookup_intent oit = {
1991 .it_flags = flags | MDS_OPEN_BY_FID,
1996 ll_inode_size_lock(inode);
1997 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1999 GOTO(out_unlock, rc);
2001 ll_release_openhandle(dentry, &oit);
2004 ll_inode_size_unlock(inode);
2005 ll_intent_release(&oit);
2010 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2011 struct lov_mds_md **lmmp, int *lmm_size,
2012 struct ptlrpc_request **request)
2014 struct ll_sb_info *sbi = ll_i2sbi(inode);
2015 struct mdt_body *body;
2016 struct lov_mds_md *lmm = NULL;
2017 struct ptlrpc_request *req = NULL;
2018 struct md_op_data *op_data;
2021 rc = ll_get_default_mdsize(sbi, &lmmsize);
2025 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2026 strlen(filename), lmmsize,
2027 LUSTRE_OPC_ANY, NULL);
2028 if (IS_ERR(op_data))
2029 RETURN(PTR_ERR(op_data));
2031 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2032 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2033 ll_finish_md_op_data(op_data);
2035 CDEBUG(D_INFO, "md_getattr_name failed "
2036 "on %s: rc %d\n", filename, rc);
2040 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2041 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2043 lmmsize = body->mbo_eadatasize;
2045 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2047 GOTO(out, rc = -ENODATA);
2050 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2051 LASSERT(lmm != NULL);
2053 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2054 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2055 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2056 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2057 GOTO(out, rc = -EPROTO);
2060 * This is coming from the MDS, so is probably in
2061 * little endian. We convert it to host endian before
2062 * passing it to userspace.
2064 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2067 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2068 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2069 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2070 if (le32_to_cpu(lmm->lmm_pattern) &
2071 LOV_PATTERN_F_RELEASED)
2075 /* if the function was called for a directory, we should
2076 * avoid swabbing non-existent lsm objects */
2077 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2078 lustre_swab_lov_user_md_v1(
2079 (struct lov_user_md_v1 *)lmm);
2080 if (S_ISREG(body->mbo_mode))
2081 lustre_swab_lov_user_md_objects(
2082 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2084 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2085 lustre_swab_lov_user_md_v3(
2086 (struct lov_user_md_v3 *)lmm);
2087 if (S_ISREG(body->mbo_mode))
2088 lustre_swab_lov_user_md_objects(
2089 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2091 } else if (lmm->lmm_magic ==
2092 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2093 lustre_swab_lov_comp_md_v1(
2094 (struct lov_comp_md_v1 *)lmm);
2095 } else if (lmm->lmm_magic ==
2096 cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2097 struct lov_foreign_md *lfm;
2099 lfm = (struct lov_foreign_md *)lmm;
2100 __swab32s(&lfm->lfm_magic);
2101 __swab32s(&lfm->lfm_length);
2102 __swab32s(&lfm->lfm_type);
2103 __swab32s(&lfm->lfm_flags);
2109 *lmm_size = lmmsize;
2114 static int ll_lov_setea(struct inode *inode, struct file *file,
2117 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2118 struct lov_user_md *lump;
2119 int lum_size = sizeof(struct lov_user_md) +
2120 sizeof(struct lov_user_ost_data);
2124 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2127 OBD_ALLOC_LARGE(lump, lum_size);
2131 if (copy_from_user(lump, arg, lum_size))
2132 GOTO(out_lump, rc = -EFAULT);
2134 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2136 cl_lov_delay_create_clear(&file->f_flags);
2139 OBD_FREE_LARGE(lump, lum_size);
2143 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2150 env = cl_env_get(&refcheck);
2152 RETURN(PTR_ERR(env));
2154 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2155 cl_env_put(env, &refcheck);
2159 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2162 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2163 struct lov_user_md *klum;
2165 __u64 flags = FMODE_WRITE;
2168 rc = ll_copy_user_md(lum, &klum);
2173 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2178 rc = put_user(0, &lum->lmm_stripe_count);
2182 rc = ll_layout_refresh(inode, &gen);
2186 rc = ll_file_getstripe(inode, arg, lum_size);
2188 cl_lov_delay_create_clear(&file->f_flags);
2191 OBD_FREE(klum, lum_size);
2196 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2198 struct ll_inode_info *lli = ll_i2info(inode);
2199 struct cl_object *obj = lli->lli_clob;
2200 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2201 struct ll_grouplock grouplock;
2206 CWARN("group id for group lock must not be 0\n");
2210 if (ll_file_nolock(file))
2211 RETURN(-EOPNOTSUPP);
2213 spin_lock(&lli->lli_lock);
2214 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2215 CWARN("group lock already existed with gid %lu\n",
2216 fd->fd_grouplock.lg_gid);
2217 spin_unlock(&lli->lli_lock);
2220 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2221 spin_unlock(&lli->lli_lock);
2224 * XXX: the group lock needs to protect all OST objects while PFL
2225 * can add new OST objects during the IO, so we need to instantiate
2226 * all OST objects before taking the group lock.
2231 struct cl_layout cl = {
2232 .cl_is_composite = false,
2234 struct lu_extent ext = {
2236 .e_end = OBD_OBJECT_EOF,
2239 env = cl_env_get(&refcheck);
2241 RETURN(PTR_ERR(env));
2243 rc = cl_object_layout_get(env, obj, &cl);
2244 if (!rc && cl.cl_is_composite)
2245 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2248 cl_env_put(env, &refcheck);
2253 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2254 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2258 spin_lock(&lli->lli_lock);
2259 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2260 spin_unlock(&lli->lli_lock);
2261 CERROR("another thread just won the race\n");
2262 cl_put_grouplock(&grouplock);
2266 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2267 fd->fd_grouplock = grouplock;
2268 spin_unlock(&lli->lli_lock);
2270 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2274 static int ll_put_grouplock(struct inode *inode, struct file *file,
2277 struct ll_inode_info *lli = ll_i2info(inode);
2278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2279 struct ll_grouplock grouplock;
2282 spin_lock(&lli->lli_lock);
2283 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2284 spin_unlock(&lli->lli_lock);
2285 CWARN("no group lock held\n");
2289 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2291 if (fd->fd_grouplock.lg_gid != arg) {
2292 CWARN("group lock %lu doesn't match current id %lu\n",
2293 arg, fd->fd_grouplock.lg_gid);
2294 spin_unlock(&lli->lli_lock);
2298 grouplock = fd->fd_grouplock;
2299 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2300 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2301 spin_unlock(&lli->lli_lock);
2303 cl_put_grouplock(&grouplock);
2304 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2309 * Close inode open handle
2311 * \param dentry [in] dentry which contains the inode
2312 * \param it [in,out] intent which contains open info and result
2315 * \retval <0 failure
2317 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2319 struct inode *inode = dentry->d_inode;
2320 struct obd_client_handle *och;
2326 /* Root ? Do nothing. */
2327 if (dentry->d_inode->i_sb->s_root == dentry)
2330 /* No open handle to close? Move away */
2331 if (!it_disposition(it, DISP_OPEN_OPEN))
2334 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2336 OBD_ALLOC(och, sizeof(*och));
2338 GOTO(out, rc = -ENOMEM);
2340 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2342 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2344 /* this one is in place of ll_file_open */
2345 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2346 ptlrpc_req_finished(it->it_request);
2347 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2353 * Get size for inode for which FIEMAP mapping is requested.
2354 * Make the FIEMAP get_info call and return the result.
2355 * \param fiemap kernel buffer to hold extents
2356 * \param num_bytes kernel buffer size
2358 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2364 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2367 /* Checks for fiemap flags */
2368 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2369 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2373 /* Check for FIEMAP_FLAG_SYNC */
2374 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2375 rc = filemap_fdatawrite(inode->i_mapping);
2380 env = cl_env_get(&refcheck);
2382 RETURN(PTR_ERR(env));
2384 if (i_size_read(inode) == 0) {
2385 rc = ll_glimpse_size(inode);
2390 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2391 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2392 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2394 /* If filesize is 0, then there would be no objects for mapping */
2395 if (fmkey.lfik_oa.o_size == 0) {
2396 fiemap->fm_mapped_extents = 0;
2400 fmkey.lfik_fiemap = *fiemap;
2402 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2403 &fmkey, fiemap, &num_bytes);
2405 cl_env_put(env, &refcheck);
2409 int ll_fid2path(struct inode *inode, void __user *arg)
2411 struct obd_export *exp = ll_i2mdexp(inode);
2412 const struct getinfo_fid2path __user *gfin = arg;
2414 struct getinfo_fid2path *gfout;
2420 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2421 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2424 /* Only need to get the buflen */
2425 if (get_user(pathlen, &gfin->gf_pathlen))
2428 if (pathlen > PATH_MAX)
2431 outsize = sizeof(*gfout) + pathlen;
2432 OBD_ALLOC(gfout, outsize);
2436 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2437 GOTO(gf_free, rc = -EFAULT);
2438 /* append root FID after gfout to let MDT know the root FID so that it
2439 * can look up the correct path; this is mainly for filesets.
2440 * Old servers without fileset mount support will ignore this. */
2441 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2443 /* Call mdc_iocontrol */
2444 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2448 if (copy_to_user(arg, gfout, outsize))
2452 OBD_FREE(gfout, outsize);
2457 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2459 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2467 ioc->idv_version = 0;
2468 ioc->idv_layout_version = UINT_MAX;
2470 /* If no file object is initialized, we consider its version to be 0. */
2474 env = cl_env_get(&refcheck);
2476 RETURN(PTR_ERR(env));
2478 io = vvp_env_thread_io(env);
2480 io->u.ci_data_version.dv_data_version = 0;
2481 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2482 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2485 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2486 result = cl_io_loop(env, io);
2488 result = io->ci_result;
2490 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2491 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2493 cl_io_fini(env, io);
2495 if (unlikely(io->ci_need_restart))
2498 cl_env_put(env, &refcheck);
2504 * Read the data_version for inode.
2506 * This value is computed using stripe object versions on the OSTs.
2507 * The version is computed using server-side locking.
2509 * @param flags whether to sync on the OST side:
2511 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2512 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
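 *
 * Example (sketch): ll_hsm_release() below samples the version with
 *
 *	__u64 dv;
 *	rc = ll_data_version(inode, &dv, LL_DV_WR_FLUSH);
 *
 * so the cached pages are dropped and the returned version reflects what
 * is actually on the OSTs.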
2514 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2516 struct ioc_data_version ioc = { .idv_flags = flags };
2519 rc = ll_ioc_data_version(inode, &ioc);
2521 *data_version = ioc.idv_version;
2527 * Trigger an HSM release request for the provided inode.
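 *
 * The sequence below is: take a write lease with MDS_OPEN_RELEASE, flush
 * and sample the data version (LL_DV_WR_FLUSH), merge the up-to-date
 * attributes, then close the open handle with the MDS_HSM_RELEASE bias.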
2529 int ll_hsm_release(struct inode *inode)
2532 struct obd_client_handle *och = NULL;
2533 __u64 data_version = 0;
2538 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2539 ll_i2sbi(inode)->ll_fsname,
2540 PFID(&ll_i2info(inode)->lli_fid));
2542 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2544 GOTO(out, rc = PTR_ERR(och));
2546 /* Grab latest data_version and [am]time values */
2547 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2551 env = cl_env_get(&refcheck);
2553 GOTO(out, rc = PTR_ERR(env));
2555 rc = ll_merge_attr(env, inode);
2556 cl_env_put(env, &refcheck);
2558 /* If an error happens, we have the wrong size for the file.
2564 /* Release the file.
2565 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2566 * we still need it to pack l_remote_handle to MDT. */
2567 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2573 if (och != NULL && !IS_ERR(och)) /* close the file */
2574 ll_lease_close(och, inode, NULL);
2579 struct ll_swap_stack {
2582 struct inode *inode1;
2583 struct inode *inode2;
2588 static int ll_swap_layouts(struct file *file1, struct file *file2,
2589 struct lustre_swap_layouts *lsl)
2591 struct mdc_swap_layouts msl;
2592 struct md_op_data *op_data;
2595 struct ll_swap_stack *llss = NULL;
2598 OBD_ALLOC_PTR(llss);
2602 llss->inode1 = file_inode(file1);
2603 llss->inode2 = file_inode(file2);
2605 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2609 /* we use 2 bools because they are easier to swap than 2 bits */
2610 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2611 llss->check_dv1 = true;
2613 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2614 llss->check_dv2 = true;
2616 /* we cannot use lsl->sl_dvX directly because we may swap them */
2617 llss->dv1 = lsl->sl_dv1;
2618 llss->dv2 = lsl->sl_dv2;
2620 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2621 if (rc == 0) /* same file, done! */
2624 if (rc < 0) { /* sequentialize it */
2625 swap(llss->inode1, llss->inode2);
2627 swap(llss->dv1, llss->dv2);
2628 swap(llss->check_dv1, llss->check_dv2);
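/* from here on the two inodes are in a canonical order (fixed by the FID
 * comparison above), so the group locks and data-version checks below are
 * always performed in the same order for any given pair of files */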
2632 if (gid != 0) { /* application asks to flush dirty cache */
2633 rc = ll_get_grouplock(llss->inode1, file1, gid);
2637 rc = ll_get_grouplock(llss->inode2, file2, gid);
2639 ll_put_grouplock(llss->inode1, file1, gid);
2644 /* ultimate check: before swapping the layouts we check whether the
2645 * data version has changed (if requested) */
2646 if (llss->check_dv1) {
2647 rc = ll_data_version(llss->inode1, &dv, 0);
2650 if (dv != llss->dv1)
2651 GOTO(putgl, rc = -EAGAIN);
2654 if (llss->check_dv2) {
2655 rc = ll_data_version(llss->inode2, &dv, 0);
2658 if (dv != llss->dv2)
2659 GOTO(putgl, rc = -EAGAIN);
2662 /* struct md_op_data is used to send the swap args to the MDT;
2663 * only the flags are missing, so we pass struct mdc_swap_layouts
2664 * through md_op_data->op_data */
2665 /* flags from user space have to be converted before they are sent to
2666 * the server; no flag is sent today, they are only used on the client */
2669 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2670 0, LUSTRE_OPC_ANY, &msl);
2671 if (IS_ERR(op_data))
2672 GOTO(free, rc = PTR_ERR(op_data));
2674 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2675 sizeof(*op_data), op_data, NULL);
2676 ll_finish_md_op_data(op_data);
2683 ll_put_grouplock(llss->inode2, file2, gid);
2684 ll_put_grouplock(llss->inode1, file1, gid);
2694 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2696 struct obd_export *exp = ll_i2mdexp(inode);
2697 struct md_op_data *op_data;
2701 /* Detect out-of-range masks */
2702 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2705 /* Non-root users are forbidden to set or clear flags which are
2706 * NOT defined in HSM_USER_MASK. */
2707 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2708 !cfs_capable(CFS_CAP_SYS_ADMIN))
2711 if (!exp_connect_archive_id_array(exp)) {
2712 /* Detect out-of-range archive id */
2713 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2714 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2718 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2719 LUSTRE_OPC_ANY, hss);
2720 if (IS_ERR(op_data))
2721 RETURN(PTR_ERR(op_data));
2723 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2726 ll_finish_md_op_data(op_data);
2731 static int ll_hsm_import(struct inode *inode, struct file *file,
2732 struct hsm_user_import *hui)
2734 struct hsm_state_set *hss = NULL;
2735 struct iattr *attr = NULL;
2739 if (!S_ISREG(inode->i_mode))
2745 GOTO(out, rc = -ENOMEM);
2747 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2748 hss->hss_archive_id = hui->hui_archive_id;
2749 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2750 rc = ll_hsm_state_set(inode, hss);
2754 OBD_ALLOC_PTR(attr);
2756 GOTO(out, rc = -ENOMEM);
2758 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2759 attr->ia_mode |= S_IFREG;
2760 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2761 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2762 attr->ia_size = hui->hui_size;
2763 attr->ia_mtime.tv_sec = hui->hui_mtime;
2764 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2765 attr->ia_atime.tv_sec = hui->hui_atime;
2766 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2768 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2769 ATTR_UID | ATTR_GID |
2770 ATTR_MTIME | ATTR_MTIME_SET |
2771 ATTR_ATIME | ATTR_ATIME_SET;
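/* apply the mode, ownership, size and timestamps recorded in the
 * hsm_user_import request to the freshly imported (HS_RELEASED) file */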
2775 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2779 inode_unlock(inode);
2791 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2793 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2794 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2797 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2799 struct inode *inode = file_inode(file);
2801 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2802 ATTR_MTIME | ATTR_MTIME_SET |
2805 .tv_sec = lfu->lfu_atime_sec,
2806 .tv_nsec = lfu->lfu_atime_nsec,
2809 .tv_sec = lfu->lfu_mtime_sec,
2810 .tv_nsec = lfu->lfu_mtime_nsec,
2813 .tv_sec = lfu->lfu_ctime_sec,
2814 .tv_nsec = lfu->lfu_ctime_nsec,
2820 if (!capable(CAP_SYS_ADMIN))
2823 if (!S_ISREG(inode->i_mode))
2827 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2829 inode_unlock(inode);
2834 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2837 case MODE_READ_USER:
2839 case MODE_WRITE_USER:
2846 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2848 /* Used to allow the upper layers of the client to request an LDLM lock
2849 * without doing an actual read or write.
2851 * Used for ladvise lockahead to manually request specific locks.
2853 * \param[in] file file this ladvise lock request is on
2854 * \param[in] ladvise ladvise struct describing this lock request
2856 * \retval 0 success, no detailed result available (sync requests
2857 * and requests sent to the server [not handled locally]
2858 * cannot return detailed results)
2859 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2860 * see definitions for details.
2861 * \retval negative negative errno on error
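 *
 * Example (sketch; 'advise' and 'stripe_size' are illustrative names, only
 * the fields consumed below are shown):
 *
 *	advise.lla_advice          = LU_LADVISE_LOCKAHEAD;
 *	advise.lla_lockahead_mode  = MODE_WRITE_USER;
 *	advise.lla_start           = 0;
 *	advise.lla_end             = stripe_size - 1;
 *	advise.lla_peradvice_flags = LF_ASYNC;   (optional: speculative lock)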
2863 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2865 struct lu_env *env = NULL;
2866 struct cl_io *io = NULL;
2867 struct cl_lock *lock = NULL;
2868 struct cl_lock_descr *descr = NULL;
2869 struct dentry *dentry = file->f_path.dentry;
2870 struct inode *inode = dentry->d_inode;
2871 enum cl_lock_mode cl_mode;
2872 off_t start = ladvise->lla_start;
2873 off_t end = ladvise->lla_end;
2879 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2880 "start=%llu, end=%llu\n", dentry->d_name.len,
2881 dentry->d_name.name, dentry->d_inode,
2882 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2885 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2887 GOTO(out, result = cl_mode);
2889 /* Get IO environment */
2890 result = cl_io_get(inode, &env, &io, &refcheck);
2894 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2897 * nothing to do for this io. This currently happens when
2898 * stripe sub-objects are not yet created.
2900 result = io->ci_result;
2901 } else if (result == 0) {
2902 lock = vvp_env_lock(env);
2903 descr = &lock->cll_descr;
2905 descr->cld_obj = io->ci_obj;
2906 /* Convert byte offsets to pages */
2907 descr->cld_start = cl_index(io->ci_obj, start);
2908 descr->cld_end = cl_index(io->ci_obj, end);
2909 descr->cld_mode = cl_mode;
2910 /* CEF_MUST is used because we do not want to convert a
2911 * lockahead request to a lockless lock */
2912 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2915 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2916 descr->cld_enq_flags |= CEF_SPECULATIVE;
2918 result = cl_lock_request(env, io, lock);
2920 /* On success, we need to release the lock */
2922 cl_lock_release(env, lock);
2924 cl_io_fini(env, io);
2925 cl_env_put(env, &refcheck);
2927 /* -ECANCELED indicates a matching lock with a different extent
2928 * was already present, and -EEXIST indicates a matching lock
2929 * on exactly the same extent was already present.
2930 * We convert them to positive values for userspace to make
2931 * recognizing true errors easier.
2932 * Note we can only return these detailed results on async requests,
2933 * as sync requests look the same as i/o requests for locking. */
2934 if (result == -ECANCELED)
2935 result = LLA_RESULT_DIFFERENT;
2936 else if (result == -EEXIST)
2937 result = LLA_RESULT_SAME;
2942 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2944 static int ll_ladvise_sanity(struct inode *inode,
2945 struct llapi_lu_ladvise *ladvise)
2947 struct ll_sb_info *sbi = ll_i2sbi(inode);
2948 enum lu_ladvise_type advice = ladvise->lla_advice;
2949 /* Note: the per-advice flags field is 32 bits wide, so per-advice flags
2950 * must fit in the first 32 bits of enum ladvise_flags */
2951 __u32 flags = ladvise->lla_peradvice_flags;
2952 /* 3 lines at 80 characters per line, should be plenty */
2955 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2957 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2958 "last supported advice is %s (value '%d'): rc = %d\n",
2959 sbi->ll_fsname, advice,
2960 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2964 /* Per-advice checks */
2966 case LU_LADVISE_LOCKNOEXPAND:
2967 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2969 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2970 "rc = %d\n", sbi->ll_fsname, flags,
2971 ladvise_names[advice], rc);
2975 case LU_LADVISE_LOCKAHEAD:
2976 /* Currently only READ and WRITE modes can be requested */
2977 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2978 ladvise->lla_lockahead_mode == 0) {
2980 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2981 "rc = %d\n", sbi->ll_fsname,
2982 ladvise->lla_lockahead_mode,
2983 ladvise_names[advice], rc);
2986 case LU_LADVISE_WILLREAD:
2987 case LU_LADVISE_DONTNEED:
2989 /* Note fall through above - These checks apply to all advices
2990 * except LOCKNOEXPAND */
2991 if (flags & ~LF_DEFAULT_MASK) {
2993 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2994 "rc = %d\n", sbi->ll_fsname, flags,
2995 ladvise_names[advice], rc);
2998 if (ladvise->lla_start >= ladvise->lla_end) {
3000 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3001 "for %s: rc = %d\n", sbi->ll_fsname,
3002 ladvise->lla_start, ladvise->lla_end,
3003 ladvise_names[advice], rc);
3015 * Give file access advices
3017 * The ladvise interface is similar to the Linux fadvise() system call, except
3018 * it forwards the advice directly from the Lustre client to the server. The
3019 * server-side code applies the appropriate read-ahead and caching techniques
3020 * for the corresponding files.
3022 * A typical workload for ladvise is, e.g., many different clients doing
3023 * small random reads of a file, where prefetching pages into the OSS cache
3024 * with big linear reads before the random IO is a net benefit. Fetching
3025 * all that data into each client cache with fadvise() may not be, due to
3026 * much more data being sent to the client.
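 *
 * Example (sketch; the header layout matches what the LL_IOC_LADVISE
 * handler further down expects, variable names are illustrative):
 *
 *	hdr.lah_magic = LADVISE_MAGIC;
 *	hdr.lah_count = 1;
 *	hdr.lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr.lah_advise[0].lla_start  = 0;
 *	hdr.lah_advise[0].lla_end    = file_size;
 *	ioctl(fd, LL_IOC_LADVISE, &hdr);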
3028 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3029 struct llapi_lu_ladvise *ladvise)
3033 struct cl_ladvise_io *lio;
3038 env = cl_env_get(&refcheck);
3040 RETURN(PTR_ERR(env));
3042 io = vvp_env_thread_io(env);
3043 io->ci_obj = ll_i2info(inode)->lli_clob;
3045 /* initialize parameters for ladvise */
3046 lio = &io->u.ci_ladvise;
3047 lio->li_start = ladvise->lla_start;
3048 lio->li_end = ladvise->lla_end;
3049 lio->li_fid = ll_inode2fid(inode);
3050 lio->li_advice = ladvise->lla_advice;
3051 lio->li_flags = flags;
3053 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3054 rc = cl_io_loop(env, io);
3058 cl_io_fini(env, io);
3059 cl_env_put(env, &refcheck);
3063 static int ll_lock_noexpand(struct file *file, int flags)
3065 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3067 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3072 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3075 struct fsxattr fsxattr;
3077 if (copy_from_user(&fsxattr,
3078 (const struct fsxattr __user *)arg,
3082 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3083 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3084 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3085 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3086 if (copy_to_user((struct fsxattr __user *)arg,
3087 &fsxattr, sizeof(fsxattr)))
3093 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3096 * Project Quota ID state is only allowed to change from within the init
3097 * namespace. Enforce that restriction only if we are trying to change
3098 * the quota ID state. Everything else is allowed in user namespaces.
3100 if (current_user_ns() == &init_user_ns)
3103 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3106 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3107 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3110 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3117 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3121 struct md_op_data *op_data;
3122 struct ptlrpc_request *req = NULL;
3124 struct fsxattr fsxattr;
3125 struct cl_object *obj;
3129 if (copy_from_user(&fsxattr,
3130 (const struct fsxattr __user *)arg,
3134 rc = ll_ioctl_check_project(inode, &fsxattr);
3138 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3139 LUSTRE_OPC_ANY, NULL);
3140 if (IS_ERR(op_data))
3141 RETURN(PTR_ERR(op_data));
3143 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3144 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3145 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3146 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3147 op_data->op_projid = fsxattr.fsx_projid;
3148 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3149 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3151 ptlrpc_req_finished(req);
3153 GOTO(out_fsxattr, rc);
3154 ll_update_inode_flags(inode, op_data->op_attr_flags);
3155 obj = ll_i2info(inode)->lli_clob;
3157 GOTO(out_fsxattr, rc);
3159 OBD_ALLOC_PTR(attr);
3161 GOTO(out_fsxattr, rc = -ENOMEM);
3163 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3164 fsxattr.fsx_xflags);
3167 ll_finish_md_op_data(op_data);
3171 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3174 struct inode *inode = file_inode(file);
3175 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3176 struct ll_inode_info *lli = ll_i2info(inode);
3177 struct obd_client_handle *och = NULL;
3178 struct split_param sp;
3179 struct pcc_param param;
3180 bool lease_broken = false;
3182 enum mds_op_bias bias = 0;
3183 struct file *layout_file = NULL;
3185 size_t data_size = 0;
3186 bool attached = false;
3191 mutex_lock(&lli->lli_och_mutex);
3192 if (fd->fd_lease_och != NULL) {
3193 och = fd->fd_lease_och;
3194 fd->fd_lease_och = NULL;
3196 mutex_unlock(&lli->lli_och_mutex);
3201 fmode = och->och_flags;
3203 switch (ioc->lil_flags) {
3204 case LL_LEASE_RESYNC_DONE:
3205 if (ioc->lil_count > IOC_IDS_MAX)
3206 GOTO(out_lease_close, rc = -EINVAL);
3208 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3209 OBD_ALLOC(data, data_size);
3211 GOTO(out_lease_close, rc = -ENOMEM);
3213 if (copy_from_user(data, (void __user *)arg, data_size))
3214 GOTO(out_lease_close, rc = -EFAULT);
3216 bias = MDS_CLOSE_RESYNC_DONE;
3218 case LL_LEASE_LAYOUT_MERGE: {
3221 if (ioc->lil_count != 1)
3222 GOTO(out_lease_close, rc = -EINVAL);
3224 arg += sizeof(*ioc);
3225 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3226 GOTO(out_lease_close, rc = -EFAULT);
3228 layout_file = fget(fd);
3230 GOTO(out_lease_close, rc = -EBADF);
3232 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3233 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3234 GOTO(out_lease_close, rc = -EPERM);
3236 data = file_inode(layout_file);
3237 bias = MDS_CLOSE_LAYOUT_MERGE;
3240 case LL_LEASE_LAYOUT_SPLIT: {
3244 if (ioc->lil_count != 2)
3245 GOTO(out_lease_close, rc = -EINVAL);
3247 arg += sizeof(*ioc);
3248 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3249 GOTO(out_lease_close, rc = -EFAULT);
3251 arg += sizeof(__u32);
3252 if (copy_from_user(&mirror_id, (void __user *)arg,
3254 GOTO(out_lease_close, rc = -EFAULT);
3256 layout_file = fget(fdv);
3258 GOTO(out_lease_close, rc = -EBADF);
3260 sp.sp_inode = file_inode(layout_file);
3261 sp.sp_mirror_id = (__u16)mirror_id;
3263 bias = MDS_CLOSE_LAYOUT_SPLIT;
3266 case LL_LEASE_PCC_ATTACH:
3267 if (ioc->lil_count != 1)
3270 arg += sizeof(*ioc);
3271 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3273 GOTO(out_lease_close, rc2 = -EFAULT);
3275 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3277 GOTO(out_lease_close, rc2);
3280 /* Grab latest data version */
3281 rc2 = ll_data_version(inode, &param.pa_data_version,
3284 GOTO(out_lease_close, rc2);
3287 bias = MDS_PCC_ATTACH;
3290 /* without close intent */
3295 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3299 rc = ll_lease_och_release(inode, file);
3308 switch (ioc->lil_flags) {
3309 case LL_LEASE_RESYNC_DONE:
3311 OBD_FREE(data, data_size);
3313 case LL_LEASE_LAYOUT_MERGE:
3314 case LL_LEASE_LAYOUT_SPLIT:
3318 case LL_LEASE_PCC_ATTACH:
3321 rc = pcc_readwrite_attach_fini(file, inode,
3322 param.pa_layout_gen,
3329 rc = ll_lease_type_from_fmode(fmode);
3333 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3336 struct inode *inode = file_inode(file);
3337 struct ll_inode_info *lli = ll_i2info(inode);
3338 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3339 struct obd_client_handle *och = NULL;
3340 __u64 open_flags = 0;
3346 switch (ioc->lil_mode) {
3347 case LL_LEASE_WRLCK:
3348 if (!(file->f_mode & FMODE_WRITE))
3350 fmode = FMODE_WRITE;
3352 case LL_LEASE_RDLCK:
3353 if (!(file->f_mode & FMODE_READ))
3357 case LL_LEASE_UNLCK:
3358 RETURN(ll_file_unlock_lease(file, ioc, arg));
3363 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3365 /* apply for lease */
3366 if (ioc->lil_flags & LL_LEASE_RESYNC)
3367 open_flags = MDS_OPEN_RESYNC;
3368 och = ll_lease_open(inode, file, fmode, open_flags);
3370 RETURN(PTR_ERR(och));
3372 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3373 rc = ll_lease_file_resync(och, inode, arg);
3375 ll_lease_close(och, inode, NULL);
3378 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3380 ll_lease_close(och, inode, NULL);
3386 mutex_lock(&lli->lli_och_mutex);
3387 if (fd->fd_lease_och == NULL) {
3388 fd->fd_lease_och = och;
3391 mutex_unlock(&lli->lli_och_mutex);
3393 /* impossible for now, since only exclusive leases are supported */
3394 ll_lease_close(och, inode, &lease_broken);
3400 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3402 struct ll_inode_info *lli = ll_i2info(inode);
3403 struct ll_sb_info *sbi = ll_i2sbi(inode);
3404 __u64 now = ktime_get_real_seconds();
3407 spin_lock(&lli->lli_heat_lock);
3408 heat->lh_flags = lli->lli_heat_flags;
3409 for (i = 0; i < heat->lh_count; i++)
3410 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3411 now, sbi->ll_heat_decay_weight,
3412 sbi->ll_heat_period_second);
3413 spin_unlock(&lli->lli_heat_lock);
3416 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3418 struct ll_inode_info *lli = ll_i2info(inode);
3421 spin_lock(&lli->lli_heat_lock);
3422 if (flags & LU_HEAT_FLAG_CLEAR)
3423 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3425 if (flags & LU_HEAT_FLAG_OFF)
3426 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3428 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3430 spin_unlock(&lli->lli_heat_lock);
3436 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3438 struct inode *inode = file_inode(file);
3439 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3443 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3444 PFID(ll_inode2fid(inode)), inode, cmd);
3445 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3447 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3448 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3452 case LL_IOC_GETFLAGS:
3453 /* Get the current value of the file flags */
3454 return put_user(fd->fd_flags, (int __user *)arg);
3455 case LL_IOC_SETFLAGS:
3456 case LL_IOC_CLRFLAGS:
3457 /* Set or clear specific file flags */
3458 /* XXX This probably needs checks to ensure the flags are
3459 * not abused, and to handle any flag side effects.
3461 if (get_user(flags, (int __user *) arg))
3464 if (cmd == LL_IOC_SETFLAGS) {
3465 if ((flags & LL_FILE_IGNORE_LOCK) &&
3466 !(file->f_flags & O_DIRECT)) {
3467 CERROR("%s: unable to disable locking on "
3468 "non-O_DIRECT file\n", current->comm);
3472 fd->fd_flags |= flags;
3474 fd->fd_flags &= ~flags;
3477 case LL_IOC_LOV_SETSTRIPE:
3478 case LL_IOC_LOV_SETSTRIPE_NEW:
3479 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3480 case LL_IOC_LOV_SETEA:
3481 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3482 case LL_IOC_LOV_SWAP_LAYOUTS: {
3484 struct lustre_swap_layouts lsl;
3486 if (copy_from_user(&lsl, (char __user *)arg,
3487 sizeof(struct lustre_swap_layouts)))
3490 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3493 file2 = fget(lsl.sl_fd);
3497 /* O_WRONLY or O_RDWR */
3498 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3499 GOTO(out, rc = -EPERM);
3501 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3502 struct inode *inode2;
3503 struct ll_inode_info *lli;
3504 struct obd_client_handle *och = NULL;
3506 lli = ll_i2info(inode);
3507 mutex_lock(&lli->lli_och_mutex);
3508 if (fd->fd_lease_och != NULL) {
3509 och = fd->fd_lease_och;
3510 fd->fd_lease_och = NULL;
3512 mutex_unlock(&lli->lli_och_mutex);
3514 GOTO(out, rc = -ENOLCK);
3515 inode2 = file_inode(file2);
3516 rc = ll_swap_layouts_close(och, inode, inode2);
3518 rc = ll_swap_layouts(file, file2, &lsl);
3524 case LL_IOC_LOV_GETSTRIPE:
3525 case LL_IOC_LOV_GETSTRIPE_NEW:
3526 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3527 case FS_IOC_GETFLAGS:
3528 case FS_IOC_SETFLAGS:
3529 RETURN(ll_iocontrol(inode, file, cmd, arg));
3530 case FSFILT_IOC_GETVERSION:
3531 case FS_IOC_GETVERSION:
3532 RETURN(put_user(inode->i_generation, (int __user *)arg));
3533 /* We need to special case any other ioctls we want to handle,
3534 * to send them to the MDS/OST as appropriate and to properly
3535 * network encode the arg field. */
3536 case FS_IOC_SETVERSION:
3539 case LL_IOC_GROUP_LOCK:
3540 RETURN(ll_get_grouplock(inode, file, arg));
3541 case LL_IOC_GROUP_UNLOCK:
3542 RETURN(ll_put_grouplock(inode, file, arg));
3543 case IOC_OBD_STATFS:
3544 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3546 case LL_IOC_FLUSHCTX:
3547 RETURN(ll_flush_ctx(inode));
3548 case LL_IOC_PATH2FID: {
3549 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3550 sizeof(struct lu_fid)))
3555 case LL_IOC_GETPARENT:
3556 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3558 case OBD_IOC_FID2PATH:
3559 RETURN(ll_fid2path(inode, (void __user *)arg));
3560 case LL_IOC_DATA_VERSION: {
3561 struct ioc_data_version idv;
3564 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3567 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3568 rc = ll_ioc_data_version(inode, &idv);
3571 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3577 case LL_IOC_GET_MDTIDX: {
3580 mdtidx = ll_get_mdt_idx(inode);
3584 if (put_user((int)mdtidx, (int __user *)arg))
3589 case OBD_IOC_GETDTNAME:
3590 case OBD_IOC_GETMDNAME:
3591 RETURN(ll_get_obd_name(inode, cmd, arg));
3592 case LL_IOC_HSM_STATE_GET: {
3593 struct md_op_data *op_data;
3594 struct hsm_user_state *hus;
3601 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3602 LUSTRE_OPC_ANY, hus);
3603 if (IS_ERR(op_data)) {
3605 RETURN(PTR_ERR(op_data));
3608 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3611 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3614 ll_finish_md_op_data(op_data);
3618 case LL_IOC_HSM_STATE_SET: {
3619 struct hsm_state_set *hss;
3626 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3631 rc = ll_hsm_state_set(inode, hss);
3636 case LL_IOC_HSM_ACTION: {
3637 struct md_op_data *op_data;
3638 struct hsm_current_action *hca;
3645 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3646 LUSTRE_OPC_ANY, hca);
3647 if (IS_ERR(op_data)) {
3649 RETURN(PTR_ERR(op_data));
3652 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3655 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3658 ll_finish_md_op_data(op_data);
3662 case LL_IOC_SET_LEASE_OLD: {
3663 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3665 RETURN(ll_file_set_lease(file, &ioc, 0));
3667 case LL_IOC_SET_LEASE: {
3668 struct ll_ioc_lease ioc;
3670 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3673 RETURN(ll_file_set_lease(file, &ioc, arg));
3675 case LL_IOC_GET_LEASE: {
3676 struct ll_inode_info *lli = ll_i2info(inode);
3677 struct ldlm_lock *lock = NULL;
3680 mutex_lock(&lli->lli_och_mutex);
3681 if (fd->fd_lease_och != NULL) {
3682 struct obd_client_handle *och = fd->fd_lease_och;
3684 lock = ldlm_handle2lock(&och->och_lease_handle);
3686 lock_res_and_lock(lock);
3687 if (!ldlm_is_cancel(lock))
3688 fmode = och->och_flags;
3690 unlock_res_and_lock(lock);
3691 LDLM_LOCK_PUT(lock);
3694 mutex_unlock(&lli->lli_och_mutex);
3696 RETURN(ll_lease_type_from_fmode(fmode));
3698 case LL_IOC_HSM_IMPORT: {
3699 struct hsm_user_import *hui;
3705 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3710 rc = ll_hsm_import(inode, file, hui);
3715 case LL_IOC_FUTIMES_3: {
3716 struct ll_futimes_3 lfu;
3718 if (copy_from_user(&lfu,
3719 (const struct ll_futimes_3 __user *)arg,
3723 RETURN(ll_file_futimes_3(file, &lfu));
3725 case LL_IOC_LADVISE: {
3726 struct llapi_ladvise_hdr *k_ladvise_hdr;
3727 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3730 int alloc_size = sizeof(*k_ladvise_hdr);
3733 u_ladvise_hdr = (void __user *)arg;
3734 OBD_ALLOC_PTR(k_ladvise_hdr);
3735 if (k_ladvise_hdr == NULL)
3738 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3739 GOTO(out_ladvise, rc = -EFAULT);
3741 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3742 k_ladvise_hdr->lah_count < 1)
3743 GOTO(out_ladvise, rc = -EINVAL);
3745 num_advise = k_ladvise_hdr->lah_count;
3746 if (num_advise >= LAH_COUNT_MAX)
3747 GOTO(out_ladvise, rc = -EFBIG);
3749 OBD_FREE_PTR(k_ladvise_hdr);
3750 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3751 lah_advise[num_advise]);
3752 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3753 if (k_ladvise_hdr == NULL)
3757 * TODO: submit multiple advices to one server in a single RPC
3759 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3760 GOTO(out_ladvise, rc = -EFAULT);
3762 for (i = 0; i < num_advise; i++) {
3763 struct llapi_lu_ladvise *k_ladvise =
3764 &k_ladvise_hdr->lah_advise[i];
3765 struct llapi_lu_ladvise __user *u_ladvise =
3766 &u_ladvise_hdr->lah_advise[i];
3768 rc = ll_ladvise_sanity(inode, k_ladvise);
3770 GOTO(out_ladvise, rc);
3772 switch (k_ladvise->lla_advice) {
3773 case LU_LADVISE_LOCKNOEXPAND:
3774 rc = ll_lock_noexpand(file,
3775 k_ladvise->lla_peradvice_flags);
3776 GOTO(out_ladvise, rc);
3777 case LU_LADVISE_LOCKAHEAD:
3779 rc = ll_file_lock_ahead(file, k_ladvise);
3782 GOTO(out_ladvise, rc);
3785 &u_ladvise->lla_lockahead_result))
3786 GOTO(out_ladvise, rc = -EFAULT);
3789 rc = ll_ladvise(inode, file,
3790 k_ladvise_hdr->lah_flags,
3793 GOTO(out_ladvise, rc);
3800 OBD_FREE(k_ladvise_hdr, alloc_size);
3803 case LL_IOC_FLR_SET_MIRROR: {
3804 /* mirror I/O must be direct to avoid polluting page cache
3806 if (!(file->f_flags & O_DIRECT))
3809 fd->fd_designated_mirror = (__u32)arg;
3812 case LL_IOC_FSGETXATTR:
3813 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3814 case LL_IOC_FSSETXATTR:
3815 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3817 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3818 case LL_IOC_HEAT_GET: {
3819 struct lu_heat uheat;
3820 struct lu_heat *heat;
3823 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3826 if (uheat.lh_count > OBD_HEAT_COUNT)
3827 uheat.lh_count = OBD_HEAT_COUNT;
3829 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3830 OBD_ALLOC(heat, size);
3834 heat->lh_count = uheat.lh_count;
3835 ll_heat_get(inode, heat);
3836 rc = copy_to_user((char __user *)arg, heat, size);
3837 OBD_FREE(heat, size);
3838 RETURN(rc ? -EFAULT : 0);
3840 case LL_IOC_HEAT_SET: {
3843 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3846 rc = ll_heat_set(inode, flags);
3849 case LL_IOC_PCC_DETACH:
3850 if (!S_ISREG(inode->i_mode))
3853 if (!inode_owner_or_capable(inode))
3856 RETURN(pcc_ioctl_detach(inode));
3857 case LL_IOC_PCC_STATE: {
3858 struct lu_pcc_state __user *ustate =
3859 (struct lu_pcc_state __user *)arg;
3860 struct lu_pcc_state *state;
3862 OBD_ALLOC_PTR(state);
3866 if (copy_from_user(state, ustate, sizeof(*state)))
3867 GOTO(out_state, rc = -EFAULT);
3869 rc = pcc_ioctl_state(file, inode, state);
3871 GOTO(out_state, rc);
3873 if (copy_to_user(ustate, state, sizeof(*state)))
3874 GOTO(out_state, rc = -EFAULT);
3877 OBD_FREE_PTR(state);
3881 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3882 (void __user *)arg));
3886 #ifndef HAVE_FILE_LLSEEK_SIZE
3887 static inline loff_t
3888 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3890 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3892 if (offset > maxsize)
3895 if (offset != file->f_pos) {
3896 file->f_pos = offset;
3897 file->f_version = 0;
3903 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3904 loff_t maxsize, loff_t eof)
3906 struct inode *inode = file_inode(file);
3914 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3915 * position-querying operation. Avoid rewriting the "same"
3916 * f_pos value back to the file because a concurrent read(),
3917 * write() or lseek() might have altered it
3922 * f_lock protects against read/modify/write race with other
3923 * SEEK_CURs. Note that parallel writes and reads behave
3927 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3928 inode_unlock(inode);
3932 * In the generic case the entire file is data, so as long as
3933 * offset isn't at the end of the file then the offset is data.
3940 * There is a virtual hole at the end of the file, so as long as
3941 * offset isn't i_size or larger, return i_size.
3949 return llseek_execute(file, offset, maxsize);
3953 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3955 struct inode *inode = file_inode(file);
3956 loff_t retval, eof = 0;
3959 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3960 (origin == SEEK_CUR) ? file->f_pos : 0);
3961 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3962 PFID(ll_inode2fid(inode)), inode, retval, retval,
3964 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3966 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3967 retval = ll_glimpse_size(inode);
3970 eof = i_size_read(inode);
3973 retval = ll_generic_file_llseek_size(file, offset, origin,
3974 ll_file_maxbytes(inode), eof);
3978 static int ll_flush(struct file *file, fl_owner_t id)
3980 struct inode *inode = file_inode(file);
3981 struct ll_inode_info *lli = ll_i2info(inode);
3982 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3985 LASSERT(!S_ISDIR(inode->i_mode));
3987 /* catch async errors that were recorded back when async writeback
3988 * failed for pages in this mapping. */
3989 rc = lli->lli_async_rc;
3990 lli->lli_async_rc = 0;
3991 if (lli->lli_clob != NULL) {
3992 err = lov_read_and_clear_async_rc(lli->lli_clob);
3997 /* The application has already been told about the write failure.
3998 * Do not report the failure again. */
3999 if (fd->fd_write_failed)
4001 return rc ? -EIO : 0;
4005 * Called to make sure a portion of the file has been written out.
4006 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
4008 * Return how many pages have been written.
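 *
 * For example, ll_fsync() below uses this to flush the [start, end] byte
 * range of the file once the metadata sync with the MDT has completed.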
4010 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4011 enum cl_fsync_mode mode, int ignore_layout)
4015 struct cl_fsync_io *fio;
4020 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4021 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4024 env = cl_env_get(&refcheck);
4026 RETURN(PTR_ERR(env));
4028 io = vvp_env_thread_io(env);
4029 io->ci_obj = ll_i2info(inode)->lli_clob;
4030 io->ci_ignore_layout = ignore_layout;
4032 /* initialize parameters for sync */
4033 fio = &io->u.ci_fsync;
4034 fio->fi_start = start;
4036 fio->fi_fid = ll_inode2fid(inode);
4037 fio->fi_mode = mode;
4038 fio->fi_nr_written = 0;
4040 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4041 result = cl_io_loop(env, io);
4043 result = io->ci_result;
4045 result = fio->fi_nr_written;
4046 cl_io_fini(env, io);
4047 cl_env_put(env, &refcheck);
4053 * When dentry is provided (the 'else' case), file_dentry() may be
4054 * null and dentry must be used directly rather than pulled from
4055 * file_dentry() as is done otherwise.
4058 #ifdef HAVE_FILE_FSYNC_4ARGS
4059 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4061 struct dentry *dentry = file_dentry(file);
4062 #elif defined(HAVE_FILE_FSYNC_2ARGS)
4063 int ll_fsync(struct file *file, int datasync)
4065 struct dentry *dentry = file_dentry(file);
4067 loff_t end = LLONG_MAX;
4069 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
4072 loff_t end = LLONG_MAX;
4074 struct inode *inode = dentry->d_inode;
4075 struct ll_inode_info *lli = ll_i2info(inode);
4076 struct ptlrpc_request *req;
4081 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
4082 PFID(ll_inode2fid(inode)), inode);
4083 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4085 #ifdef HAVE_FILE_FSYNC_4ARGS
4086 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4089 /* fsync's caller has already called _fdata{sync,write}, we want
4090 * that IO to finish before calling the osc and mdc sync methods */
4091 rc = filemap_fdatawait(inode->i_mapping);
4094 /* catch async errors that were recorded back when async writeback
4095 * failed for pages in this mapping. */
4096 if (!S_ISDIR(inode->i_mode)) {
4097 err = lli->lli_async_rc;
4098 lli->lli_async_rc = 0;
4101 if (lli->lli_clob != NULL) {
4102 err = lov_read_and_clear_async_rc(lli->lli_clob);
4108 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4112 ptlrpc_req_finished(req);
4114 if (S_ISREG(inode->i_mode)) {
4115 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4118 /* Sync metadata on MDT first, and then sync the cached data
4121 err = pcc_fsync(file, start, end, datasync, &cached);
4123 err = cl_sync_file_range(inode, start, end,
4125 if (rc == 0 && err < 0)
4128 fd->fd_write_failed = true;
4130 fd->fd_write_failed = false;
4133 #ifdef HAVE_FILE_FSYNC_4ARGS
4134 inode_unlock(inode);
4140 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4142 struct inode *inode = file_inode(file);
4143 struct ll_sb_info *sbi = ll_i2sbi(inode);
4144 struct ldlm_enqueue_info einfo = {
4145 .ei_type = LDLM_FLOCK,
4146 .ei_cb_cp = ldlm_flock_completion_ast,
4147 .ei_cbdata = file_lock,
4149 struct md_op_data *op_data;
4150 struct lustre_handle lockh = { 0 };
4151 union ldlm_policy_data flock = { { 0 } };
4152 int fl_type = file_lock->fl_type;
4158 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4159 PFID(ll_inode2fid(inode)), file_lock);
4161 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4163 if (file_lock->fl_flags & FL_FLOCK) {
4164 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4165 /* flocks are whole-file locks */
4166 flock.l_flock.end = OFFSET_MAX;
4167 /* For flocks the owner is determined by the local file descriptor */
4168 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4169 } else if (file_lock->fl_flags & FL_POSIX) {
4170 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4171 flock.l_flock.start = file_lock->fl_start;
4172 flock.l_flock.end = file_lock->fl_end;
4176 flock.l_flock.pid = file_lock->fl_pid;
4178 /* Somewhat ugly workaround for svc lockd.
4179 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4180 * that the fl_owner is the same (which it always is on the local node,
4181 * I guess, between lockd processes) and then compares the pid.
4182 * As such we assign the pid to the owner field to make it all work;
4183 * a conflict with normal locks is unlikely since the pid space and the
4184 * pointer space for current->files do not intersect */
4185 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4186 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4190 einfo.ei_mode = LCK_PR;
4193 /* An unlock request may or may not have any relation to
4194 * existing locks so we may not be able to pass a lock handle
4195 * via a normal ldlm_lock_cancel() request. The request may even
4196 * unlock a byte range in the middle of an existing lock. In
4197 * order to process an unlock request we need all of the same
4198 * information that is given with a normal read or write record
4199 * lock request. To avoid creating another ldlm unlock (cancel)
4200 * message we'll treat a LCK_NL flock request as an unlock. */
4201 einfo.ei_mode = LCK_NL;
4204 einfo.ei_mode = LCK_PW;
4207 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4222 flags = LDLM_FL_BLOCK_NOWAIT;
4228 flags = LDLM_FL_TEST_LOCK;
4231 CERROR("unknown fcntl lock command: %d\n", cmd);
4235 /* Save the old mode so that if the mode in the lock changes we
4236 * can decrement the appropriate reader or writer refcount. */
4237 file_lock->fl_type = einfo.ei_mode;
4239 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4240 LUSTRE_OPC_ANY, NULL);
4241 if (IS_ERR(op_data))
4242 RETURN(PTR_ERR(op_data));
4244 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4245 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4246 flock.l_flock.pid, flags, einfo.ei_mode,
4247 flock.l_flock.start, flock.l_flock.end);
4249 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4252 /* Restore the file lock type if not TEST lock. */
4253 if (!(flags & LDLM_FL_TEST_LOCK))
4254 file_lock->fl_type = fl_type;
4256 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4257 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4258 !(flags & LDLM_FL_TEST_LOCK))
4259 rc2 = locks_lock_file_wait(file, file_lock);
4261 if ((file_lock->fl_flags & FL_FLOCK) &&
4262 (rc == 0 || file_lock->fl_type == F_UNLCK))
4263 rc2 = flock_lock_file_wait(file, file_lock);
4264 if ((file_lock->fl_flags & FL_POSIX) &&
4265 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4266 !(flags & LDLM_FL_TEST_LOCK))
4267 rc2 = posix_lock_file_wait(file, file_lock);
4268 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4270 if (rc2 && file_lock->fl_type != F_UNLCK) {
4271 einfo.ei_mode = LCK_NL;
4272 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4277 ll_finish_md_op_data(op_data);
4282 int ll_get_fid_by_name(struct inode *parent, const char *name,
4283 int namelen, struct lu_fid *fid,
4284 struct inode **inode)
4286 struct md_op_data *op_data = NULL;
4287 struct mdt_body *body;
4288 struct ptlrpc_request *req;
4292 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4293 LUSTRE_OPC_ANY, NULL);
4294 if (IS_ERR(op_data))
4295 RETURN(PTR_ERR(op_data));
4297 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4298 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4299 ll_finish_md_op_data(op_data);
4303 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4305 GOTO(out_req, rc = -EFAULT);
4307 *fid = body->mbo_fid1;
4310 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4312 ptlrpc_req_finished(req);
4316 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4319 struct dentry *dchild = NULL;
4320 struct inode *child_inode = NULL;
4321 struct md_op_data *op_data;
4322 struct ptlrpc_request *request = NULL;
4323 struct obd_client_handle *och = NULL;
4325 struct mdt_body *body;
4326 __u64 data_version = 0;
4327 size_t namelen = strlen(name);
4328 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4332 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4333 PFID(ll_inode2fid(parent)), name,
4334 lum->lum_stripe_offset, lum->lum_stripe_count);
4336 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4337 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4338 lustre_swab_lmv_user_md(lum);
4340 /* Get child FID first */
4341 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4344 dchild = d_lookup(file_dentry(file), &qstr);
4346 if (dchild->d_inode)
4347 child_inode = igrab(dchild->d_inode);
4352 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4361 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4362 OBD_CONNECT2_DIR_MIGRATE)) {
4363 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4364 ll_i2info(child_inode)->lli_lsm_md) {
4365 CERROR("%s: MDT doesn't support stripe directory "
4366 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4367 GOTO(out_iput, rc = -EOPNOTSUPP);
4372 * lfs migrate command needs to be blocked on the client
4373 * by checking the migrate FID against the FID of the
4376 if (child_inode == parent->i_sb->s_root->d_inode)
4377 GOTO(out_iput, rc = -EINVAL);
4379 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4380 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4381 if (IS_ERR(op_data))
4382 GOTO(out_iput, rc = PTR_ERR(op_data));
4384 inode_lock(child_inode);
4385 op_data->op_fid3 = *ll_inode2fid(child_inode);
4386 if (!fid_is_sane(&op_data->op_fid3)) {
4387 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4388 ll_i2sbi(parent)->ll_fsname, name,
4389 PFID(&op_data->op_fid3));
4390 GOTO(out_unlock, rc = -EINVAL);
4393 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4394 op_data->op_data = lum;
4395 op_data->op_data_size = lumlen;
4398 if (S_ISREG(child_inode->i_mode)) {
4399 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4403 GOTO(out_unlock, rc);
4406 rc = ll_data_version(child_inode, &data_version,
4409 GOTO(out_close, rc);
4411 op_data->op_open_handle = och->och_open_handle;
4412 op_data->op_data_version = data_version;
4413 op_data->op_lease_handle = och->och_lease_handle;
4414 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4416 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4417 och->och_mod->mod_open_req->rq_replay = 0;
4418 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4421 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4422 name, namelen, &request);
4424 LASSERT(request != NULL);
4425 ll_update_times(request, parent);
4428 if (rc == 0 || rc == -EAGAIN) {
4429 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4430 LASSERT(body != NULL);
4432 /* If the server does release the layout lock, then we clean up
4433 * the client och here; otherwise release it in out_close: */
4434 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4435 obd_mod_put(och->och_mod);
4436 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4438 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4444 if (request != NULL) {
4445 ptlrpc_req_finished(request);
4449 /* Try again if the lease has been cancelled. */
4450 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4455 ll_lease_close(och, child_inode, NULL);
4457 clear_nlink(child_inode);
4459 inode_unlock(child_inode);
4460 ll_finish_md_op_data(op_data);
4467 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4475 * test if some locks matching bits and l_req_mode are acquired
4476 * - bits can be in different locks
4477 * - if found clear the common lock bits in *bits
4478 * - the bits not found, are kept in *bits
4480 * \param bits [IN] searched lock bits
4481 * \param l_req_mode [IN] searched lock mode
4482 * \retval boolean, true iff all bits are found
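 *
 * Example (sketch): with *bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE
 * and only a matching LOOKUP lock cached locally, the LOOKUP bit is cleared
 * from *bits, the UPDATE bit is left set, and 0 (false) is returned.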
4484 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4486 struct lustre_handle lockh;
4487 union ldlm_policy_data policy;
4488 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4489 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4498 fid = &ll_i2info(inode)->lli_fid;
4499 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4500 ldlm_lockname[mode]);
4502 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4503 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4504 policy.l_inodebits.bits = *bits & (1 << i);
4505 if (policy.l_inodebits.bits == 0)
4508 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4509 &policy, mode, &lockh)) {
4510 struct ldlm_lock *lock;
4512 lock = ldlm_handle2lock(&lockh);
4515 ~(lock->l_policy_data.l_inodebits.bits);
4516 LDLM_LOCK_PUT(lock);
4518 *bits &= ~policy.l_inodebits.bits;
4525 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4526 struct lustre_handle *lockh, __u64 flags,
4527 enum ldlm_mode mode)
4529 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4534 fid = &ll_i2info(inode)->lli_fid;
4535 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4537 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4538 fid, LDLM_IBITS, &policy, mode, lockh);
4543 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4545 /* Already unlinked. Just update nlink and return success */
4546 if (rc == -ENOENT) {
4548 /* If it is a striped directory and there is a bad stripe,
4549 * let's revalidate the dentry again instead of returning
4551 if (S_ISDIR(inode->i_mode) &&
4552 ll_i2info(inode)->lli_lsm_md != NULL)
4555 /* This path cannot be hit for regular files unless in
4556 * case of obscure races, so no need to validate
4558 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4560 } else if (rc != 0) {
4561 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4562 "%s: revalidate FID "DFID" error: rc = %d\n",
4563 ll_i2sbi(inode)->ll_fsname,
4564 PFID(ll_inode2fid(inode)), rc);
4570 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4572 struct inode *inode = dentry->d_inode;
4573 struct obd_export *exp = ll_i2mdexp(inode);
4574 struct lookup_intent oit = {
4577 struct ptlrpc_request *req = NULL;
4578 struct md_op_data *op_data;
4582 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4583 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4585 /* Call getattr by fid, so do not provide name at all. */
4586 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4587 LUSTRE_OPC_ANY, NULL);
4588 if (IS_ERR(op_data))
4589 RETURN(PTR_ERR(op_data));
4591 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4592 ll_finish_md_op_data(op_data);
4594 rc = ll_inode_revalidate_fini(inode, rc);
4598 rc = ll_revalidate_it_finish(req, &oit, dentry);
4600 ll_intent_release(&oit);
4604 /* Unlinked? Unhash dentry, so it is not picked up later by
4605 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4606 * here to preserve get_cwd functionality on 2.6.
4608 if (!dentry->d_inode->i_nlink) {
4609 ll_lock_dcache(inode);
4610 d_lustre_invalidate(dentry, 0);
4611 ll_unlock_dcache(inode);
4614 ll_lookup_finish_locks(&oit, dentry);
4616 ptlrpc_req_finished(req);
4621 static int ll_merge_md_attr(struct inode *inode)
4623 struct ll_inode_info *lli = ll_i2info(inode);
4624 struct cl_attr attr = { 0 };
4627 LASSERT(lli->lli_lsm_md != NULL);
4629 /* foreign dir is not striped dir */
4630 if (lli->lli_lsm_md->lsm_md_magic == LMV_MAGIC_FOREIGN)
4633 down_read(&lli->lli_lsm_sem);
4634 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4635 &attr, ll_md_blocking_ast);
4636 up_read(&lli->lli_lsm_sem);
4640 set_nlink(inode, attr.cat_nlink);
4641 inode->i_blocks = attr.cat_blocks;
4642 i_size_write(inode, attr.cat_size);
4644 ll_i2info(inode)->lli_atime = attr.cat_atime;
4645 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4646 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4651 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4653 struct inode *inode = de->d_inode;
4654 struct ll_sb_info *sbi = ll_i2sbi(inode);
4655 struct ll_inode_info *lli = ll_i2info(inode);
4658 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4660 rc = ll_inode_revalidate(de, IT_GETATTR);
4664 if (S_ISREG(inode->i_mode)) {
4667 rc = pcc_inode_getattr(inode, &cached);
4668 if (cached && rc < 0)
4671 /* In case of restore, the MDT has the right size and has
4672 * already sent it back without granting the layout lock,
4673 * inode is up-to-date so glimpse is useless.
4674 * Also to glimpse we need the layout, in case of a running
4675 * restore the MDT holds the layout lock so the glimpse will
4676 * block up to the end of restore (getattr will block)
4678 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4679 rc = ll_glimpse_size(inode);
4684 /* If the object isn't a regular file then don't validate its size. */
4685 if (S_ISDIR(inode->i_mode) &&
4686 lli->lli_lsm_md != NULL) {
4687 rc = ll_merge_md_attr(inode);
4692 inode->i_atime.tv_sec = lli->lli_atime;
4693 inode->i_mtime.tv_sec = lli->lli_mtime;
4694 inode->i_ctime.tv_sec = lli->lli_ctime;
4697 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4699 if (ll_need_32bit_api(sbi)) {
4700 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4701 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4702 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4704 stat->ino = inode->i_ino;
4705 stat->dev = inode->i_sb->s_dev;
4706 stat->rdev = inode->i_rdev;
4709 stat->mode = inode->i_mode;
4710 stat->uid = inode->i_uid;
4711 stat->gid = inode->i_gid;
4712 stat->atime = inode->i_atime;
4713 stat->mtime = inode->i_mtime;
4714 stat->ctime = inode->i_ctime;
4715 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4717 stat->nlink = inode->i_nlink;
4718 stat->size = i_size_read(inode);
4719 stat->blocks = inode->i_blocks;
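/*
 * ->getattr() wrappers: newer kernels pass a struct path plus a request
 * mask and query flags, older ones pass the vfsmount and dentry. Both
 * variants simply forward to ll_getattr_dentry() above.
 */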
4724 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4725 int ll_getattr(const struct path *path, struct kstat *stat,
4726 u32 request_mask, unsigned int flags)
4728 struct dentry *de = path->dentry;
4730 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4733 return ll_getattr_dentry(de, stat);
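/*
 * FIEMAP handler: copy the VFS fiemap_extent_info into a struct fiemap
 * request, let ll_do_fiemap() fill in the extent mapping, and copy the
 * mapped extents back to the caller's buffer.
 *
 * A minimal userspace sketch of how this path is typically reached
 * (illustrative only; error handling omitted):
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = 32;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 */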
4736 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4737 __u64 start, __u64 len)
4741 struct fiemap *fiemap;
4742 unsigned int extent_count = fieinfo->fi_extents_max;
4744 num_bytes = sizeof(*fiemap) + (extent_count *
4745 sizeof(struct fiemap_extent));
4746 OBD_ALLOC_LARGE(fiemap, num_bytes);
4751 fiemap->fm_flags = fieinfo->fi_flags;
4752 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4753 fiemap->fm_start = start;
4754 fiemap->fm_length = len;
4755 if (extent_count > 0 &&
4756 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4757 sizeof(struct fiemap_extent)) != 0)
4758 GOTO(out, rc = -EFAULT);
4760 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4762 fieinfo->fi_flags = fiemap->fm_flags;
4763 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4764 if (extent_count > 0 &&
4765 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4766 fiemap->fm_mapped_extents *
4767 sizeof(struct fiemap_extent)) != 0)
4768 GOTO(out, rc = -EFAULT);
4770 OBD_FREE_LARGE(fiemap, num_bytes);
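/*
 * ->get_acl(): return a reference to the POSIX ACL cached in
 * ll_inode_info. The ACL is duplicated under lli_lock; the VFS releases
 * the reference after the permission check.
 */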
4774 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4776 struct ll_inode_info *lli = ll_i2info(inode);
4777 struct posix_acl *acl = NULL;
4780 spin_lock(&lli->lli_lock);
4781 /* VFS' acl_permission_check->check_acl will release the refcount */
4782 acl = posix_acl_dup(lli->lli_posix_acl);
4783 spin_unlock(&lli->lli_lock);
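/*
 * ->set_acl(): convert the POSIX ACL to its xattr representation
 * (system.posix_acl_access / system.posix_acl_default), store or remove it
 * on the MDT with a setxattr RPC, then update the local ACL cache.
 */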
4788 #ifdef HAVE_IOP_SET_ACL
4789 #ifdef CONFIG_FS_POSIX_ACL
4790 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4792 struct ll_sb_info *sbi = ll_i2sbi(inode);
4793 struct ptlrpc_request *req = NULL;
4794 const char *name = NULL;
4796 size_t value_size = 0;
4801 case ACL_TYPE_ACCESS:
4802 name = XATTR_NAME_POSIX_ACL_ACCESS;
4804 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4807 case ACL_TYPE_DEFAULT:
4808 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4809 if (!S_ISDIR(inode->i_mode))
4810 rc = acl ? -EACCES : 0;
4821 value_size = posix_acl_xattr_size(acl->a_count);
4822 value = kmalloc(value_size, GFP_NOFS);
4824 GOTO(out, rc = -ENOMEM);
4826 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4828 GOTO(out_value, rc);
4831 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4832 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4833 name, value, value_size, 0, 0, &req);
4835 ptlrpc_req_finished(req);
4840 forget_cached_acl(inode, type);
4842 set_cached_acl(inode, type, acl);
4845 #endif /* CONFIG_FS_POSIX_ACL */
4846 #endif /* HAVE_IOP_SET_ACL */
4848 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4850 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4851 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4853 ll_check_acl(struct inode *inode, int mask)
4856 # ifdef CONFIG_FS_POSIX_ACL
4857 struct posix_acl *acl;
4861 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4862 if (flags & IPERM_FLAG_RCU)
4865 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4870 rc = posix_acl_permission(inode, acl, mask);
4871 posix_acl_release(acl);
4874 # else /* !CONFIG_FS_POSIX_ACL */
4876 # endif /* CONFIG_FS_POSIX_ACL */
4878 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
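/*
 * ->permission(): besides the generic mode/ACL check, this entry point
 * implements root squash: if the caller is root and the nosquash flag is
 * not set, fsuid/fsgid are temporarily overridden with the configured
 * squash uid/gid and the filesystem capabilities are dropped for the
 * duration of the check.
 */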
4880 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4881 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4883 # ifdef HAVE_INODE_PERMISION_2ARGS
4884 int ll_inode_permission(struct inode *inode, int mask)
4886 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4891 struct ll_sb_info *sbi;
4892 struct root_squash_info *squash;
4893 struct cred *cred = NULL;
4894 const struct cred *old_cred = NULL;
4896 bool squash_id = false;
4899 #ifdef MAY_NOT_BLOCK
4900 if (mask & MAY_NOT_BLOCK)
4902 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4903 if (flags & IPERM_FLAG_RCU)
4907 	/* as the root inode is NOT validated in the lookup operation,
4908 	 * we need to do it before the permission check. */
4910 if (inode == inode->i_sb->s_root->d_inode) {
4911 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4916 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4917 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4919 /* squash fsuid/fsgid if needed */
4920 sbi = ll_i2sbi(inode);
4921 squash = &sbi->ll_squash;
4922 if (unlikely(squash->rsi_uid != 0 &&
4923 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4924 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4928 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4929 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4930 squash->rsi_uid, squash->rsi_gid);
4932 		/* update the current process's credentials
4933 		 * and FS capabilities */
4934 cred = prepare_creds();
4938 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4939 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4940 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4941 if ((1 << cap) & CFS_CAP_FS_MASK)
4942 cap_lower(cred->cap_effective, cap);
4944 old_cred = override_creds(cred);
4947 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4948 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4949 	/* restore the current process's credentials and FS capabilities */
4951 revert_creds(old_cred);
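/*
 * Three file_operations tables follow; they differ only in the .flock and
 * .lock methods, selected at mount time by the flock/localflock/noflock
 * options.
 */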
4958 /* -o localflock - only provides locally consistent flock locks */
4959 struct file_operations ll_file_operations = {
4960 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4961 # ifdef HAVE_SYNC_READ_WRITE
4962 .read = new_sync_read,
4963 .write = new_sync_write,
4965 .read_iter = ll_file_read_iter,
4966 .write_iter = ll_file_write_iter,
4967 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4968 .read = ll_file_read,
4969 .aio_read = ll_file_aio_read,
4970 .write = ll_file_write,
4971 .aio_write = ll_file_aio_write,
4972 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4973 .unlocked_ioctl = ll_file_ioctl,
4974 .open = ll_file_open,
4975 .release = ll_file_release,
4976 .mmap = ll_file_mmap,
4977 .llseek = ll_file_seek,
4978 .splice_read = ll_file_splice_read,
4983 struct file_operations ll_file_operations_flock = {
4984 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4985 # ifdef HAVE_SYNC_READ_WRITE
4986 .read = new_sync_read,
4987 .write = new_sync_write,
4988 # endif /* HAVE_SYNC_READ_WRITE */
4989 .read_iter = ll_file_read_iter,
4990 .write_iter = ll_file_write_iter,
4991 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4992 .read = ll_file_read,
4993 .aio_read = ll_file_aio_read,
4994 .write = ll_file_write,
4995 .aio_write = ll_file_aio_write,
4996 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4997 .unlocked_ioctl = ll_file_ioctl,
4998 .open = ll_file_open,
4999 .release = ll_file_release,
5000 .mmap = ll_file_mmap,
5001 .llseek = ll_file_seek,
5002 .splice_read = ll_file_splice_read,
5005 .flock = ll_file_flock,
5006 .lock = ll_file_flock
5009 /* These are for -o noflock - to return ENOSYS on flock calls */
5010 struct file_operations ll_file_operations_noflock = {
5011 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5012 # ifdef HAVE_SYNC_READ_WRITE
5013 .read = new_sync_read,
5014 .write = new_sync_write,
5015 # endif /* HAVE_SYNC_READ_WRITE */
5016 .read_iter = ll_file_read_iter,
5017 .write_iter = ll_file_write_iter,
5018 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5019 .read = ll_file_read,
5020 .aio_read = ll_file_aio_read,
5021 .write = ll_file_write,
5022 .aio_write = ll_file_aio_write,
5023 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5024 .unlocked_ioctl = ll_file_ioctl,
5025 .open = ll_file_open,
5026 .release = ll_file_release,
5027 .mmap = ll_file_mmap,
5028 .llseek = ll_file_seek,
5029 .splice_read = ll_file_splice_read,
5032 .flock = ll_file_noflock,
5033 .lock = ll_file_noflock
5036 struct inode_operations ll_file_inode_operations = {
5037 .setattr = ll_setattr,
5038 .getattr = ll_getattr,
5039 .permission = ll_inode_permission,
5040 #ifdef HAVE_IOP_XATTR
5041 .setxattr = ll_setxattr,
5042 .getxattr = ll_getxattr,
5043 .removexattr = ll_removexattr,
5045 .listxattr = ll_listxattr,
5046 .fiemap = ll_fiemap,
5047 #ifdef HAVE_IOP_GET_ACL
5048 .get_acl = ll_get_acl,
5050 #ifdef HAVE_IOP_SET_ACL
5051 .set_acl = ll_set_acl,
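/*
 * Push a layout configuration down to the cl_object layer. For
 * OBJECT_CONF_SET the layout carried in the lock's LVB is applied and the
 * inode's layout generation is updated; OBJECT_CONF_WAIT is used by
 * ll_layout_lock_set() to wait for users of the old layout to drain.
 */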
5055 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5057 struct ll_inode_info *lli = ll_i2info(inode);
5058 struct cl_object *obj = lli->lli_clob;
5067 env = cl_env_get(&refcheck);
5069 RETURN(PTR_ERR(env));
5071 rc = cl_conf_set(env, lli->lli_clob, conf);
5075 if (conf->coc_opc == OBJECT_CONF_SET) {
5076 struct ldlm_lock *lock = conf->coc_lock;
5077 struct cl_layout cl = {
5081 LASSERT(lock != NULL);
5082 LASSERT(ldlm_has_layout(lock));
5084 		/* the lock can only be allowed to match after the layout is
5085 		 * applied to the inode, otherwise a wrong layout would be
5086 		 * seen. Applying the layout should happen before dropping
5087 		 * the intent lock. */
5088 ldlm_lock_allow_match(lock);
5090 rc = cl_object_layout_get(env, obj, &cl);
5095 DFID": layout version change: %u -> %u\n",
5096 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5098 ll_layout_version_set(lli, cl.cl_layout_gen);
5102 cl_env_put(env, &refcheck);
5107 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5108 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5111 struct ll_sb_info *sbi = ll_i2sbi(inode);
5112 struct ptlrpc_request *req;
5119 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5120 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5121 lock->l_lvb_data, lock->l_lvb_len);
5123 if (lock->l_lvb_data != NULL)
5126 	/* if the layout lock was granted right away, the layout is returned
5127 	 * in the DLM LVB of the DLM reply; otherwise, if the lock was ever
5128 	 * blocked and then granted via completion AST, we have to fetch the
5129 	 * layout here. Note that we can't use the LVB buffer in the
5130 	 * completion AST because it is not large enough */
5131 rc = ll_get_default_mdsize(sbi, &lmmsize);
5135 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5136 XATTR_NAME_LOV, lmmsize, &req);
5139 GOTO(out, rc = 0); /* empty layout */
5146 if (lmmsize == 0) /* empty layout */
5149 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5151 GOTO(out, rc = -EFAULT);
5153 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5154 if (lvbdata == NULL)
5155 GOTO(out, rc = -ENOMEM);
5157 memcpy(lvbdata, lmm, lmmsize);
5158 lock_res_and_lock(lock);
5159 if (unlikely(lock->l_lvb_data == NULL)) {
5160 lock->l_lvb_type = LVB_T_LAYOUT;
5161 lock->l_lvb_data = lvbdata;
5162 lock->l_lvb_len = lmmsize;
5165 unlock_res_and_lock(lock);
5168 OBD_FREE_LARGE(lvbdata, lmmsize);
5173 ptlrpc_req_finished(req);
5178 * Apply the layout to the inode. Layout lock is held and will be released
5181 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5182 struct inode *inode)
5184 struct ll_inode_info *lli = ll_i2info(inode);
5185 struct ll_sb_info *sbi = ll_i2sbi(inode);
5186 struct ldlm_lock *lock;
5187 struct cl_object_conf conf;
5190 bool wait_layout = false;
5193 LASSERT(lustre_handle_is_used(lockh));
5195 lock = ldlm_handle2lock(lockh);
5196 LASSERT(lock != NULL);
5197 LASSERT(ldlm_has_layout(lock));
5199 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5200 PFID(&lli->lli_fid), inode);
5202 	/* in case this is a cached lock, reinstate it with the new inode */
5203 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5205 lock_res_and_lock(lock);
5206 lvb_ready = ldlm_is_lvb_ready(lock);
5207 unlock_res_and_lock(lock);
5209 	/* checking lvb_ready is racy, but this is okay. The worst case is
5210 	 * that multiple processes may configure the file at the same time. */
5214 rc = ll_layout_fetch(inode, lock);
5218 	/* for a layout lock, the lmm is stored in the lock's LVB.
5219 	 * lvb_data is immutable while the lock is held, so it is safe to access it
5222 	 * set the layout on the file. This is unlikely to fail, as the old layout
5223 	 * has surely been eliminated */
5224 memset(&conf, 0, sizeof conf);
5225 conf.coc_opc = OBJECT_CONF_SET;
5226 conf.coc_inode = inode;
5227 conf.coc_lock = lock;
5228 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5229 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5230 rc = ll_layout_conf(inode, &conf);
5232 /* refresh layout failed, need to wait */
5233 wait_layout = rc == -EBUSY;
5236 LDLM_LOCK_PUT(lock);
5237 ldlm_lock_decref(lockh, mode);
5239 	/* wait for in-flight IO to complete if the old layout is still in use. */
5241 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5242 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5244 memset(&conf, 0, sizeof conf);
5245 conf.coc_opc = OBJECT_CONF_WAIT;
5246 conf.coc_inode = inode;
5247 rc = ll_layout_conf(inode, &conf);
5251 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5252 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5258 * Issue layout intent RPC to MDS.
5259 * \param inode [in] file inode
5260 * \param intent [in] layout intent
5262 * \retval 0 on success
5263 * \retval < 0 error code
5265 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5267 struct ll_inode_info *lli = ll_i2info(inode);
5268 struct ll_sb_info *sbi = ll_i2sbi(inode);
5269 struct md_op_data *op_data;
5270 struct lookup_intent it;
5271 struct ptlrpc_request *req;
5275 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5276 0, 0, LUSTRE_OPC_ANY, NULL);
5277 if (IS_ERR(op_data))
5278 RETURN(PTR_ERR(op_data));
5280 op_data->op_data = intent;
5281 op_data->op_data_size = sizeof(*intent);
5283 memset(&it, 0, sizeof(it));
5284 it.it_op = IT_LAYOUT;
5285 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5286 intent->li_opc == LAYOUT_INTENT_TRUNC)
5287 it.it_flags = FMODE_WRITE;
5289 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5290 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5292 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5293 &ll_md_blocking_ast, 0);
5294 if (it.it_request != NULL)
5295 ptlrpc_req_finished(it.it_request);
5296 it.it_request = NULL;
5298 ll_finish_md_op_data(op_data);
5300 /* set lock data in case this is a new lock */
5302 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5304 ll_intent_drop_lock(&it);
5310 * This function checks if there exists a LAYOUT lock on the client side,
5311 * or enqueues it if it doesn't have one in cache.
5313 * This function does not hold the layout lock, so the lock may be revoked any
5314 * time after this function returns. Any operation that depends on the layout should be redone
5317 * This function should be called before lov_io_init() to get an up-to-date
5318 * layout version; the caller should save the version number, and after the IO
5319 * is finished, call this function again to verify that the layout
5320 * was not changed during the IO.
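 *
 * A minimal usage sketch of that pattern (hypothetical caller, names are
 * illustrative only):
 *
 *	__u32 gen_before, gen_after;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	... initialise and run the IO against this layout ...
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (gen_before != gen_after)
 *		... the layout changed while the IO ran, redo the IO ...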
5322 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5324 struct ll_inode_info *lli = ll_i2info(inode);
5325 struct ll_sb_info *sbi = ll_i2sbi(inode);
5326 struct lustre_handle lockh;
5327 struct layout_intent intent = {
5328 .li_opc = LAYOUT_INTENT_ACCESS,
5330 enum ldlm_mode mode;
5334 *gen = ll_layout_version_get(lli);
5335 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5339 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5340 LASSERT(S_ISREG(inode->i_mode));
5342 /* take layout lock mutex to enqueue layout lock exclusively. */
5343 mutex_lock(&lli->lli_layout_mutex);
5346 	/* the layout lock is mostly cached on the local side, so try to
5347 	 * match it before grabbing the layout lock mutex. */
5348 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5349 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5350 if (mode != 0) { /* hit cached lock */
5351 rc = ll_layout_lock_set(&lockh, mode, inode);
5357 rc = ll_layout_intent(inode, &intent);
5363 *gen = ll_layout_version_get(lli);
5364 mutex_unlock(&lli->lli_layout_mutex);
5370 * Issue layout intent RPC indicating where in a file an IO is about to write.
5372 * \param[in] inode file inode.
5373 * \param[in] ext	write range, with the start offset in the file, in bytes, where
5374 *			an IO is about to write, and the exclusive end offset in
5377 * \retval 0 on success
5378 * \retval < 0 error code
5380 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5381 struct lu_extent *ext)
5383 struct layout_intent intent = {
5385 .li_extent.e_start = ext->e_start,
5386 .li_extent.e_end = ext->e_end,
5391 rc = ll_layout_intent(inode, &intent);
5397 * This function sends a restore request to the MDT
5399 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5401 struct hsm_user_request *hur;
5405 len = sizeof(struct hsm_user_request) +
5406 sizeof(struct hsm_user_item);
5407 OBD_ALLOC(hur, len);
5411 hur->hur_request.hr_action = HUA_RESTORE;
5412 hur->hur_request.hr_archive_id = 0;
5413 hur->hur_request.hr_flags = 0;
5414 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5415 sizeof(hur->hur_user_item[0].hui_fid));
5416 hur->hur_user_item[0].hui_extent.offset = offset;
5417 hur->hur_user_item[0].hui_extent.length = length;
5418 hur->hur_request.hr_itemcount = 1;
5419 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,