4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate per-open-file private data (struct ll_file_data) from the
 * ll_file_data_slab cache using GFP_NOFS (safe under fs reclaim) and
 * initialize the fields shown here.
 * NOTE(review): extract is gappy — the NULL check after allocation and
 * the RETURN of fd are elided from this view.
 */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
/* initialize PCC (Persistent Client Cache) per-file state */
82 pcc_file_init(&fd->fd_pcc_file);
/*
 * Free per-open-file private data back to the ll_file_data_slab cache.
 * Counterpart of ll_file_data_get().
 */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Copy mode, timestamps, size, blocks and flags from the VFS inode into
 * @op_data so the MDT sees the client's final view of the file at close.
 * NOTE(review): extract is gappy — intermediate lines (e.g. the ATTR_CTIME
 * part of the ia_valid mask) are elided from this view.
 */
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
/* translate kernel inode flags to Lustre's ext-style on-wire flags */
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the MDS_CLOSE RPC for @och on @inode, packing bias-specific
 * payload (layout merge/split/swap, resync-done, PCC attach, HSM
 * release) into @op_data before calling md_close().
 * NOTE(review): extract is gappy — the `switch (bias)` header, break
 * statements, GOTO labels and RETURN are elided from this view, so the
 * exact fallthrough structure cannot be confirmed here.
 */
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* sanity: a close with no MDC connection cannot be sent anywhere */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
166 case MDS_CLOSE_LAYOUT_SPLIT:
167 case MDS_CLOSE_LAYOUT_SWAP: {
/* for SPLIT, @data carries the victim inode plus a mirror id */
168 struct split_param *sp = data;
170 LASSERT(data != NULL);
171 op_data->op_bias |= bias;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
174 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
175 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
176 op_data->op_mirror_id = sp->sp_mirror_id;
178 op_data->op_fid2 = *ll_inode2fid(data);
183 case MDS_CLOSE_RESYNC_DONE: {
184 struct ll_ioc_lease *ioc = data;
186 LASSERT(data != NULL);
187 op_data->op_attr_blocks +=
188 ioc->lil_count * op_data->op_attr_blocks;
189 op_data->op_attr.ia_valid |= ATTR_SIZE;
190 op_data->op_xvalid |= OP_XVALID_BLOCKS;
191 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
193 op_data->op_lease_handle = och->och_lease_handle;
/* pack the array of resynced mirror ids for the MDT */
194 op_data->op_data = &ioc->lil_ids[0];
195 op_data->op_data_size =
196 ioc->lil_count * sizeof(ioc->lil_ids[0]);
200 case MDS_PCC_ATTACH: {
201 struct pcc_param *param = data;
203 LASSERT(data != NULL);
204 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
205 op_data->op_archive_id = param->pa_archive_id;
206 op_data->op_data_version = param->pa_data_version;
207 op_data->op_lease_handle = och->och_lease_handle;
211 case MDS_HSM_RELEASE:
212 LASSERT(data != NULL);
213 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version recorded before the release */
214 op_data->op_data_version = *(__u64 *)data;
215 op_data->op_lease_handle = och->och_lease_handle;
216 op_data->op_attr.ia_valid |= ATTR_SIZE;
217 op_data->op_xvalid |= OP_XVALID_BLOCKS;
221 LASSERT(data == NULL);
/* size/blocks not pinned by a bias above: let MDT treat them lazily */
225 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
226 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
227 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
228 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
230 rc = md_close(md_exp, op_data, och->och_mod, &req);
231 if (rc != 0 && rc != -EINTR)
232 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
233 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on success verify the server actually executed the close intent */
235 if (rc == 0 && op_data->op_bias & bias) {
236 struct mdt_body *body;
238 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
239 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 if (bias & MDS_PCC_ATTACH) {
243 struct pcc_param *param = data;
/* report the new layout generation back to the PCC caller */
245 param->pa_layout_gen = body->mbo_layout_gen;
249 ll_finish_md_op_data(op_data);
253 md_clear_open_replay_data(md_exp, och);
254 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
257 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the MDS open handle cached on the inode for the given open mode
 * (write/exec/read) if no other opens still reference it.
 * NOTE(review): extract is gappy — the swap of *och_p to a local under
 * lli_och_mutex and the final RETURN are elided from this view.
 */
261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
263 struct ll_inode_info *lli = ll_i2info(inode);
264 struct obd_client_handle **och_p;
265 struct obd_client_handle *och;
/* pick the per-mode cached handle and its use count */
270 if (fmode & FMODE_WRITE) {
271 och_p = &lli->lli_mds_write_och;
272 och_usecount = &lli->lli_open_fd_write_count;
273 } else if (fmode & FMODE_EXEC) {
274 och_p = &lli->lli_mds_exec_och;
275 och_usecount = &lli->lli_open_fd_exec_count;
277 LASSERT(fmode & FMODE_READ);
278 och_p = &lli->lli_mds_read_och;
279 och_usecount = &lli->lli_open_fd_read_count;
282 mutex_lock(&lli->lli_och_mutex);
283 if (*och_usecount > 0) {
284 /* There are still users of this handle, so skip
286 mutex_unlock(&lli->lli_och_mutex);
292 mutex_unlock(&lli->lli_och_mutex);
295 /* There might be a race and this handle may already
297 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Tear down the per-file MDS state at release time: drop any group
 * lock, clean up a leaked lease, close fd-private open handle, decrement
 * the per-mode open counts and, unless a cached OPEN lock lets us skip
 * it, do the real close RPC via ll_md_real_close().
 * NOTE(review): extract is gappy — lease_broken declaration, lockmode
 * assignments and RETURN are elided from this view.
 */
303 static int ll_md_close(struct inode *inode, struct file *file)
305 union ldlm_policy_data policy = {
306 .l_inodebits = { MDS_INODELOCK_OPEN },
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310 struct ll_inode_info *lli = ll_i2info(inode);
311 struct lustre_handle lockh;
312 enum ldlm_mode lockmode;
316 /* clear group lock, if present */
317 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
318 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
320 if (fd->fd_lease_och != NULL) {
323 /* Usually the lease is not released when the
324 * application crashed, we need to release here. */
325 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
326 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
327 PFID(&lli->lli_fid), rc, lease_broken);
329 fd->fd_lease_och = NULL;
/* close the handle privately owned by this fd (lease ownership) */
332 if (fd->fd_och != NULL) {
333 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
338 /* Let's see if we have good enough OPEN lock on the file and if
339 we can skip talking to MDS */
340 mutex_lock(&lli->lli_och_mutex);
341 if (fd->fd_omode & FMODE_WRITE) {
343 LASSERT(lli->lli_open_fd_write_count);
344 lli->lli_open_fd_write_count--;
345 } else if (fd->fd_omode & FMODE_EXEC) {
347 LASSERT(lli->lli_open_fd_exec_count);
348 lli->lli_open_fd_exec_count--;
351 LASSERT(lli->lli_open_fd_read_count);
352 lli->lli_open_fd_read_count--;
354 mutex_unlock(&lli->lli_och_mutex);
/* no matching cached OPEN ibits lock -> must do the real close RPC */
356 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
357 LDLM_IBITS, &policy, lockmode, &lockh))
358 rc = ll_md_real_close(inode, fd->fd_omode);
361 LUSTRE_FPRIVATE(file) = NULL;
362 ll_file_data_put(fd);
367 /* While this returns an error code, fput() the caller does not, so we need
368 * to make every effort to clean up all of our state here. Also, applications
369 * rarely check close errors and even if an error is returned they will not
370 * re-try the close call.
/*
 * VFS ->release() entry point: account the release, revoke statahead
 * authorization, handle the root-dentry fast path, release PCC state,
 * propagate async write errors, then do the MD close.
 * NOTE(review): extract is gappy — NULL check on fd, RETURN and some
 * intermediate lines are elided from this view.
 */
372 int ll_file_release(struct inode *inode, struct file *file)
374 struct ll_file_data *fd;
375 struct ll_sb_info *sbi = ll_i2sbi(inode);
376 struct ll_inode_info *lli = ll_i2info(inode);
380 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
381 PFID(ll_inode2fid(inode)), inode);
/* don't count releases of the root dentry in the stats */
383 if (inode->i_sb->s_root != file_dentry(file))
384 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
385 fd = LUSTRE_FPRIVATE(file);
388 /* The last ref on @file, maybe not the owner pid of statahead,
389 * because parent and child process can share the same file handle. */
390 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
391 ll_deauthorize_statahead(inode, fd);
/* root dentry: nothing was opened on the MDS, just free fd */
393 if (inode->i_sb->s_root == file_dentry(file)) {
394 LUSTRE_FPRIVATE(file) = NULL;
395 ll_file_data_put(fd);
399 pcc_file_release(inode, file);
/* surface any async write error recorded by the LOV layer */
401 if (!S_ISDIR(inode->i_mode)) {
402 if (lli->lli_clob != NULL)
403 lov_read_and_clear_async_rc(lli->lli_clob);
404 lli->lli_async_rc = 0;
407 rc = ll_md_close(inode, file);
/* fault-injection hook: optionally dump the debug log on close */
409 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
410 libcfs_debug_dumplog();
/*
 * read_cache_page() filler: copy inline file data (a niobuf_local
 * passed via @data) into @page, zero the tail beyond lnb_len, and mark
 * the page up to date.
 * NOTE(review): extract is gappy — the declaration of kaddr and the
 * final return are elided from this view.
 */
415 static inline int ll_dom_readpage(void *data, struct page *page)
417 struct niobuf_local *lnb = data;
420 kaddr = ll_kmap_atomic(page, KM_USER0);
421 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* zero-fill the remainder so stale page contents never leak */
422 if (lnb->lnb_len < PAGE_SIZE)
423 memset(kaddr + lnb->lnb_len, 0,
424 PAGE_SIZE - lnb->lnb_len);
425 flush_dcache_page(page);
426 SetPageUptodate(page);
427 ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * Consume Data-on-MDT file data returned inline in the open reply:
 * validate the inline niobuf (alignment, coverage of i_size), then
 * populate the page cache page by page via read_cache_page() with
 * ll_dom_readpage() as the filler.
 * NOTE(review): extract is gappy — RCL_SERVER argument, early RETURNs,
 * the do{ }while loop header, index increment and page release are
 * elided from this view.
 */
433 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
434 struct lookup_intent *it)
436 struct ll_inode_info *lli = ll_i2info(inode);
437 struct cl_object *obj = lli->lli_clob;
438 struct address_space *mapping = inode->i_mapping;
440 struct niobuf_remote *rnb;
442 unsigned long index, start;
443 struct niobuf_local lnb;
450 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
454 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
455 if (rnb == NULL || rnb->rnb_len == 0)
458 /* LU-11595: Server may return whole file and that is OK always or
459 * it may return just file tail and its offset must be aligned with
460 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
461 * smaller then offset may be not aligned and that data is just ignored.
463 if (rnb->rnb_offset % PAGE_SIZE)
466 /* Server returns whole file or just file tail if it fills in
467 * reply buffer, in both cases total size should be inode size.
469 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
470 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
471 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
472 rnb->rnb_len, i_size_read(inode));
476 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
477 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
/* inline payload immediately follows the niobuf_remote descriptor */
479 data = (char *)rnb + sizeof(*rnb);
481 lnb.lnb_file_offset = rnb->rnb_offset;
482 start = lnb.lnb_file_offset / PAGE_SIZE;
484 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
485 lnb.lnb_page_offset = 0;
/* per-iteration: point lnb at the next PAGE_SIZE chunk of the payload */
487 lnb.lnb_data = data + (index << PAGE_SHIFT);
488 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
489 if (lnb.lnb_len > PAGE_SIZE)
490 lnb.lnb_len = PAGE_SIZE;
492 vmpage = read_cache_page(mapping, index + start,
493 ll_dom_readpage, &lnb);
494 if (IS_ERR(vmpage)) {
495 CWARN("%s: cannot fill page %lu for "DFID
496 " with data: rc = %li\n",
497 ll_i2sbi(inode)->ll_fsname, index + start,
498 PFID(lu_object_fid(&obj->co_lu)),
504 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Enqueue the IT_OPEN intent lock with the MDS for dentry @de,
 * optionally packing the name (when open-by-FID is unsupported or
 * fault-injected off) and the striping metadata @lmm. On success,
 * update the inode from the reply, revalidate the dentry if a LOOKUP
 * lock was returned, and consume any inline DOM data.
 * NOTE(review): extract is gappy — declarations of name/len/rc, GOTO
 * labels, kfree of name, ENTRY/RETURN and several closing braces are
 * elided from this view.
 */
508 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
509 struct lookup_intent *itp)
511 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
512 struct dentry *parent = de->d_parent;
515 struct md_op_data *op_data;
516 struct ptlrpc_request *req = NULL;
520 LASSERT(parent != NULL);
521 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
523 /* if server supports open-by-fid, or file name is invalid, don't pack
524 * name in open request */
525 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
526 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
528 len = de->d_name.len;
529 name = kmalloc(len + 1, GFP_NOFS);
/* re-check the name length under d_lock: it may have changed (rename) */
534 spin_lock(&de->d_lock);
535 if (len != de->d_name.len) {
536 spin_unlock(&de->d_lock);
540 memcpy(name, de->d_name.name, len);
542 spin_unlock(&de->d_lock);
544 if (!lu_name_is_valid_2(name, len)) {
550 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
551 name, len, 0, LUSTRE_OPC_ANY, NULL);
552 if (IS_ERR(op_data)) {
554 RETURN(PTR_ERR(op_data));
556 op_data->op_data = lmm;
557 op_data->op_data_size = lmmsize;
559 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
560 &ll_md_blocking_ast, 0);
562 ll_finish_md_op_data(op_data);
564 /* reason for keep own exit path - don't flood log
565 * with messages with -ESTALE errors.
/* server opened the file but the intent itself failed: drop the handle */
567 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
568 it_open_error(DISP_OPEN_OPEN, itp))
570 ll_release_openhandle(de, itp);
574 if (it_disposition(itp, DISP_LOOKUP_NEG))
575 GOTO(out, rc = -ENOENT);
577 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
578 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
579 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
583 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
585 if (!rc && itp->it_lock_mode) {
586 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
587 struct ldlm_lock *lock;
588 bool has_dom_bit = false;
590 /* If we got a lock back and it has a LOOKUP bit set,
591 * make sure the dentry is marked as valid so we can find it.
592 * We don't need to care about actual hashing since other bits
593 * of kernel will deal with that later.
595 lock = ldlm_handle2lock(&handle);
597 has_dom_bit = ldlm_has_dom(lock);
598 if (lock->l_policy_data.l_inodebits.bits &
599 MDS_INODELOCK_LOOKUP)
600 d_lustre_revalidate(de);
604 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* consume inline Data-on-MDT payload returned with the open reply */
606 ll_dom_finish_open(de->d_inode, req, itp);
610 ptlrpc_req_finished(req);
611 ll_intent_drop_lock(itp);
613 /* We did open by fid, but by the time we got to the server,
614 * the object disappeared. If this is a create, we cannot really
615 * tell the userspace that the file it was trying to create
616 * does not exist. Instead let's return -ESTALE, and the VFS will
617 * retry the create with LOOKUP_REVAL that we are going to catch
618 * in ll_revalidate_dentry() and use lookup then.
620 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate @och from the MDT open reply carried by intent @it (open
 * handle, FID, lease cookie, flags) and register it for open replay so
 * the handle survives MDS recovery.
 */
626 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
627 struct obd_client_handle *och)
629 struct mdt_body *body;
631 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
632 och->och_open_handle = body->mbo_open_handle;
633 och->och_fid = body->mbo_fid1;
/* the lock handle doubles as the lease identifier for this open */
634 och->och_lease_handle.cookie = it->it_lock_handle;
635 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
636 och->och_flags = it->it_flags;
638 return md_set_open_replay_data(md_exp, och, it);
/*
 * Attach per-open state to @file: optionally fill @och from the intent
 * reply, then install @fd as file private data and initialize its
 * readahead state, open mode and cl_context bookkeeping.
 * NOTE(review): extract is gappy — the `if (och)` guard around
 * ll_och_fill(), error handling and RETURN are elided from this view.
 */
641 static int ll_local_open(struct file *file, struct lookup_intent *it,
642 struct ll_file_data *fd, struct obd_client_handle *och)
644 struct inode *inode = file_inode(file);
647 LASSERT(!LUSTRE_FPRIVATE(file));
654 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
659 LUSTRE_FPRIVATE(file) = fd;
660 ll_readahead_init(inode, &fd->fd_ras);
661 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
663 /* ll_cl_context initialize */
664 rwlock_init(&fd->fd_lock);
665 INIT_LIST_HEAD(&fd->fd_lccs);
670 /* Open a file, and (for the very first open) create objects on the OSTs at
671 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
672 * creation or open until ll_lov_setstripe() ioctl is called.
674 * If we already have the stripe MD locally then we don't request it in
675 * md_open(), by passing a lmm_size = 0.
677 * It is up to the application to ensure no other processes open this file
678 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
679 * used. We might be able to avoid races of that sort by getting lli_open_sem
680 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
681 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point. Either reuses an intent prepared at lookup
 * time (file->private_data) or builds a fresh IT_OPEN intent from
 * f_flags, reuses a cached per-mode MDS open handle when possible, and
 * finishes with ll_local_open() + PCC open.
 * NOTE(review): extract is gappy — the `restart:` label, rc/`retried`
 * declarations, several GOTO labels and closing braces, and the RETURN
 * are elided from this view.
 */
683 int ll_file_open(struct inode *inode, struct file *file)
685 struct ll_inode_info *lli = ll_i2info(inode);
686 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
687 .it_flags = file->f_flags };
688 struct obd_client_handle **och_p = NULL;
689 __u64 *och_usecount = NULL;
690 struct ll_file_data *fd;
694 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
695 PFID(ll_inode2fid(inode)), inode, file->f_flags);
697 it = file->private_data; /* XXX: compat macro */
698 file->private_data = NULL; /* prevent ll_local_open assertion */
700 fd = ll_file_data_get();
702 GOTO(out_nofiledata, rc = -ENOMEM);
705 if (S_ISDIR(inode->i_mode))
706 ll_authorize_statahead(inode, fd);
/* root dentry fast path: no MDS open needed */
708 if (inode->i_sb->s_root == file_dentry(file)) {
709 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: build one from the open flags */
713 if (!it || !it->it_disposition) {
714 /* Convert f_flags into access mode. We cannot use file->f_mode,
715 * because everything but O_ACCMODE mask was stripped from
717 if ((oit.it_flags + 1) & O_ACCMODE)
719 if (file->f_flags & O_TRUNC)
720 oit.it_flags |= FMODE_WRITE;
722 /* kernel only call f_op->open in dentry_open. filp_open calls
723 * dentry_open after call to open_namei that checks permissions.
724 * Only nfsd_open call dentry_open directly without checking
725 * permissions and because of that this code below is safe.
727 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
728 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
730 /* We do not want O_EXCL here, presumably we opened the file
731 * already? XXX - NFS implications? */
732 oit.it_flags &= ~O_EXCL;
734 /* bug20584, if "it_flags" contains O_CREAT, the file will be
735 * created if necessary, then "IT_CREAT" should be set to keep
736 * consistent with it */
737 if (oit.it_flags & O_CREAT)
738 oit.it_op |= IT_CREAT;
744 /* Let's see if we have file open on MDS already. */
745 if (it->it_flags & FMODE_WRITE) {
746 och_p = &lli->lli_mds_write_och;
747 och_usecount = &lli->lli_open_fd_write_count;
748 } else if (it->it_flags & FMODE_EXEC) {
749 och_p = &lli->lli_mds_exec_och;
750 och_usecount = &lli->lli_open_fd_exec_count;
752 och_p = &lli->lli_mds_read_och;
753 och_usecount = &lli->lli_open_fd_read_count;
756 mutex_lock(&lli->lli_och_mutex);
757 if (*och_p) { /* Open handle is present */
758 if (it_disposition(it, DISP_OPEN_OPEN)) {
759 /* Well, there's extra open request that we do not need,
760 let's close it somehow. This will decref request. */
761 rc = it_open_error(DISP_OPEN_OPEN, it);
763 mutex_unlock(&lli->lli_och_mutex);
764 GOTO(out_openerr, rc);
767 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle; och == NULL means "use *och_p" */
771 rc = ll_local_open(file, it, fd, NULL);
774 mutex_unlock(&lli->lli_och_mutex);
775 GOTO(out_openerr, rc);
778 LASSERT(*och_usecount == 0);
779 if (!it->it_disposition) {
780 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
781 /* We cannot just request lock handle now, new ELC code
782 means that one of other OPEN locks for this file
783 could be cancelled, and since blocking ast handler
784 would attempt to grab och_mutex as well, that would
785 result in a deadlock */
786 mutex_unlock(&lli->lli_och_mutex);
788 * Normally called under two situations:
790 * 2. A race/condition on MDS resulting in no open
791 * handle to be returned from LOOKUP|OPEN request,
792 * for example if the target entry was a symlink.
794 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
795 * marked by a bit set in ll_iget_for_nfs. Clear the
796 * bit so that it's not confusing later callers.
798 * NB; when ldd is NULL, it must have come via normal
799 * lookup path only, since ll_iget_for_nfs always calls
802 if (ldd && ldd->lld_nfs_dentry) {
803 ldd->lld_nfs_dentry = 0;
804 it->it_flags |= MDS_OPEN_LOCK;
808 * Always specify MDS_OPEN_BY_FID because we don't want
809 * to get file with different fid.
811 it->it_flags |= MDS_OPEN_BY_FID;
812 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
815 GOTO(out_openerr, rc);
819 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
821 GOTO(out_och_free, rc = -ENOMEM);
825 /* md_intent_lock() didn't get a request ref if there was an
826 * open error, so don't do cleanup on the request here
828 /* XXX (green): Should not we bail out on any error here, not
829 * just open error? */
830 rc = it_open_error(DISP_OPEN_OPEN, it);
832 GOTO(out_och_free, rc);
834 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
835 "inode %p: disposition %x, status %d\n", inode,
836 it_disposition(it, ~0), it->it_status);
838 rc = ll_local_open(file, it, fd, *och_p);
840 GOTO(out_och_free, rc);
842 rc = pcc_file_open(inode, file);
844 GOTO(out_och_free, rc);
846 mutex_unlock(&lli->lli_och_mutex);
849 /* Must do this outside lli_och_mutex lock to prevent deadlock where
850 different kind of OPEN lock for this same inode gets cancelled
851 by ldlm_cancel_lru */
852 if (!S_ISREG(inode->i_mode))
853 GOTO(out_och_free, rc);
855 cl_lov_delay_create_clear(&file->f_flags);
856 GOTO(out_och_free, rc);
/* error/exit paths: free the handle, undo statahead and fd state */
860 if (och_p && *och_p) {
861 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
862 *och_p = NULL; /* OBD_FREE writes some magic there */
865 mutex_unlock(&lli->lli_och_mutex);
868 if (lli->lli_opendir_key == fd)
869 ll_deauthorize_statahead(inode, fd);
872 ll_file_data_put(fd);
874 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the extra request reference held by the enqueue, if any */
878 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
879 ptlrpc_req_finished(it->it_request);
880 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * LDLM blocking AST for lease locks: on a blocking callback simply
 * cancel the lock asynchronously (the lease is thereby broken); the
 * canceling branch is mostly elided from this view.
 * NOTE(review): extract is gappy — the `switch (flag)` header, rc
 * declaration, break statements and RETURN are elided.
 */
886 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
887 struct ldlm_lock_desc *desc, void *data, int flag)
890 struct lustre_handle lockh;
894 case LDLM_CB_BLOCKING:
895 ldlm_lock2handle(lock, &lockh);
896 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
898 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
902 case LDLM_CB_CANCELING:
910 * When setting a lease on a file, we take ownership of the lli_mds_*_och
911 * and save it as fd->fd_och so as to force client to reopen the file even
912 * if it has an open lock in cache already.
/*
 * Transfer the per-mode cached MDS open handle into fd->fd_och (lease
 * ownership) and return its open handle cookie via @old_open_handle.
 * Fails with -EBUSY if a lease already exists on this fd or other
 * opens still share the handle.
 * NOTE(review): extract is gappy — rc/och_usecount declarations, the
 * actual *och_p -> fd->fd_och transfer and RETURN are elided.
 */
914 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
915 struct lustre_handle *old_open_handle)
917 struct ll_inode_info *lli = ll_i2info(inode);
918 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
919 struct obd_client_handle **och_p;
924 /* Get the openhandle of the file */
925 mutex_lock(&lli->lli_och_mutex);
926 if (fd->fd_lease_och != NULL)
927 GOTO(out_unlock, rc = -EBUSY);
929 if (fd->fd_och == NULL) {
930 if (file->f_mode & FMODE_WRITE) {
931 LASSERT(lli->lli_mds_write_och != NULL);
932 och_p = &lli->lli_mds_write_och;
933 och_usecount = &lli->lli_open_fd_write_count;
935 LASSERT(lli->lli_mds_read_och != NULL);
936 och_p = &lli->lli_mds_read_och;
937 och_usecount = &lli->lli_open_fd_read_count;
/* more than one opener shares the handle: cannot take sole ownership */
940 if (*och_usecount > 1)
941 GOTO(out_unlock, rc = -EBUSY);
948 *old_open_handle = fd->fd_och->och_open_handle;
952 mutex_unlock(&lli->lli_och_mutex);
957 * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * Return fd->fd_och to the per-mode inode cache slot, or close it if
 * another process has repopulated the slot in the meantime (broken
 * lease), in which case only the use count is bumped.
 * NOTE(review): extract is gappy — rc/och_usecount declarations, the
 * else-branch installing fd->fd_och into *och_p, and RETURN are elided.
 */
959 static int ll_lease_och_release(struct inode *inode, struct file *file)
961 struct ll_inode_info *lli = ll_i2info(inode);
962 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
963 struct obd_client_handle **och_p;
964 struct obd_client_handle *old_och = NULL;
969 mutex_lock(&lli->lli_och_mutex);
970 if (file->f_mode & FMODE_WRITE) {
971 och_p = &lli->lli_mds_write_och;
972 och_usecount = &lli->lli_open_fd_write_count;
974 och_p = &lli->lli_mds_read_och;
975 och_usecount = &lli->lli_open_fd_read_count;
978 /* The file may have been open by another process (broken lease) so
979 * *och_p is not NULL. In this case we should simply increase usecount
982 if (*och_p != NULL) {
983 old_och = fd->fd_och;
990 mutex_unlock(&lli->lli_och_mutex);
/* the superseded handle (if any) must be closed outside the mutex */
993 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
999 * Acquire a lease and open the file.
/*
 * Open @file with an MDS lease of mode @fmode (FMODE_READ or
 * FMODE_WRITE only): take over the cached open handle, send an IT_OPEN
 * intent with MDS_OPEN_LEASE, fill a freshly allocated och and verify
 * that an exclusive MDS_INODELOCK_OPEN lock came back as the lease.
 * Returns the och on success or ERR_PTR on failure.
 * NOTE(review): extract is gappy — the open_flags parameter on the
 * signature line, och allocation, rc checks between calls, RETURN(och)
 * and out_close/out_release_it labels are elided from this view.
 */
1001 static struct obd_client_handle *
1002 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1005 struct lookup_intent it = { .it_op = IT_OPEN };
1006 struct ll_sb_info *sbi = ll_i2sbi(inode);
1007 struct md_op_data *op_data;
1008 struct ptlrpc_request *req = NULL;
1009 struct lustre_handle old_open_handle = { 0 };
1010 struct obd_client_handle *och = NULL;
1015 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1016 RETURN(ERR_PTR(-EINVAL));
/* the lease mode must be a subset of the file's open mode; no exec */
1019 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1020 RETURN(ERR_PTR(-EPERM));
1022 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1024 RETURN(ERR_PTR(rc));
1029 RETURN(ERR_PTR(-ENOMEM));
1031 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1032 LUSTRE_OPC_ANY, NULL);
1033 if (IS_ERR(op_data))
1034 GOTO(out, rc = PTR_ERR(op_data));
1036 /* To tell the MDT this openhandle is from the same owner */
1037 op_data->op_open_handle = old_open_handle;
1039 it.it_flags = fmode | open_flags;
1040 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1041 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1042 &ll_md_blocking_lease_ast,
1043 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1044 * it can be cancelled which may mislead applications that the lease is
1046 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1047 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1048 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1049 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1050 ll_finish_md_op_data(op_data);
1051 ptlrpc_req_finished(req);
1053 GOTO(out_release_it, rc);
1055 if (it_disposition(&it, DISP_LOOKUP_NEG))
1056 GOTO(out_release_it, rc = -ENOENT);
1058 rc = it_open_error(DISP_OPEN_OPEN, &it);
1060 GOTO(out_release_it, rc);
1062 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1063 ll_och_fill(sbi->ll_md_exp, &it, och);
1065 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1066 GOTO(out_close, rc = -EOPNOTSUPP);
1068 /* already get lease, handle lease lock */
1069 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1070 if (it.it_lock_mode == 0 ||
1071 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1072 /* open lock must return for lease */
1073 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1074 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1076 GOTO(out_close, rc = -EPROTO);
1079 ll_intent_release(&it);
1083 /* Cancel open lock */
1084 if (it.it_lock_mode != 0) {
1085 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1087 it.it_lock_mode = 0;
1088 och->och_lease_handle.cookie = 0ULL;
1090 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1092 CERROR("%s: error closing file "DFID": %d\n",
1093 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1094 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1096 ll_intent_release(&it);
1100 RETURN(ERR_PTR(rc));
1104 * Check whether a layout swap can be done between two inodes.
1106 * \param[in] inode1 First inode to check
1107 * \param[in] inode2 Second inode to check
1109 * \retval 0 on success, layout swap can be performed between both inodes
1110 * \retval negative error code if requirements are not met
/*
 * Requirements visible here: both must be regular files, writable by
 * the caller, and on the same superblock (same Lustre filesystem).
 * NOTE(review): extract is gappy — the specific error returns for each
 * failed check and the final RETURN(0) are elided from this view.
 */
1112 static int ll_check_swap_layouts_validity(struct inode *inode1,
1113 struct inode *inode2)
1115 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1118 if (inode_permission(inode1, MAY_WRITE) ||
1119 inode_permission(inode2, MAY_WRITE))
1122 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och on @inode with MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps
 * layouts between @inode and @inode2 atomically with the close.
 * Rejects non-swappable pairs and identical FIDs.
 * NOTE(review): extract is gappy — rc declaration, the data argument of
 * the biased close, out_free_och label and RETURN are elided.
 */
1128 static int ll_swap_layouts_close(struct obd_client_handle *och,
1129 struct inode *inode, struct inode *inode2)
1131 const struct lu_fid *fid1 = ll_inode2fid(inode);
1132 const struct lu_fid *fid2;
1136 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1137 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1139 rc = ll_check_swap_layouts_validity(inode, inode2);
1141 GOTO(out_free_och, rc);
1143 /* We now know that inode2 is a lustre inode */
1144 fid2 = ll_inode2fid(inode2);
/* swapping a layout with itself makes no sense */
1146 rc = lu_fid_cmp(fid1, fid2);
1148 GOTO(out_free_och, rc = -EINVAL);
1150 /* Close the file and {swap,merge} layouts between inode & inode2.
1151 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1152 * because we still need it to pack l_remote_handle to MDT. */
1153 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1156 och = NULL; /* freed in ll_close_inode_openhandle() */
1166 * Release lease and close the file.
1167 * It will check if the lease has ever broken.
/*
 * Check whether the lease lock behind @och was already cancelled
 * (lease broken); report that via @lease_broken, cancel the lease lock
 * ourselves when still intact and no intent @bias is requested, then
 * close the open handle with @bias/@data.
 * NOTE(review): extract is gappy — the data parameter on the signature,
 * rc declaration, the lock != NULL guard and RETURN are elided.
 */
1169 static int ll_lease_close_intent(struct obd_client_handle *och,
1170 struct inode *inode,
1171 bool *lease_broken, enum mds_op_bias bias,
1174 struct ldlm_lock *lock;
1175 bool cancelled = true;
1179 lock = ldlm_handle2lock(&och->och_lease_handle);
1181 lock_res_and_lock(lock);
1182 cancelled = ldlm_is_cancel(lock);
1183 unlock_res_and_lock(lock);
1184 LDLM_LOCK_PUT(lock);
1187 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1188 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1190 if (lease_broken != NULL)
1191 *lease_broken = cancelled;
1193 if (!cancelled && !bias)
1194 ldlm_cli_cancel(&och->och_lease_handle, 0);
1196 if (cancelled) { /* no need to execute intent */
1201 rc = ll_close_inode_openhandle(inode, och, bias, data);
/*
 * Plain lease release: ll_lease_close_intent() with no bias and no
 * intent data.
 * NOTE(review): the lease_broken parameter line of the signature is
 * elided from this extract.
 */
1205 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1208 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1212 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/*
 * Start a mirror resync on @inode under the lease @och: read the
 * ll_ioc_lease_id from userspace @arg, flush dirty pages via
 * ll_data_version(LL_DV_WR_FLUSH), then send the resync request.
 * NOTE(review): extract is gappy — rc declaration, -EFAULT handling for
 * copy_from_user, intermediate rc checks and RETURN are elided.
 */
1214 static int ll_lease_file_resync(struct obd_client_handle *och,
1215 struct inode *inode, unsigned long arg)
1217 struct ll_sb_info *sbi = ll_i2sbi(inode);
1218 struct md_op_data *op_data;
1219 struct ll_ioc_lease_id ioc;
1220 __u64 data_version_unused;
1224 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1225 LUSTRE_OPC_ANY, NULL);
1226 if (IS_ERR(op_data))
1227 RETURN(PTR_ERR(op_data));
1229 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1233 /* before starting file resync, it's necessary to clean up page cache
1234 * in client memory, otherwise once the layout version is increased,
1235 * writing back cached data will be denied the OSTs. */
1236 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1240 op_data->op_lease_handle = och->och_lease_handle;
1241 op_data->op_mirror_id = ioc.lil_mirror_id;
1242 rc = md_file_resync(sbi->ll_md_exp, op_data);
1248 ll_finish_md_op_data(op_data);
/*
 * Merge attributes cached from the MDS (lli_*time) with attributes
 * aggregated from the OST objects (cl_attr), keeping the newest
 * timestamps and refreshing i_size/i_blocks, all under the inode size
 * lock.  -ENODATA from cl_object_attr_get() (no OST objects, e.g. a
 * released file) is treated as success.
 */
1252 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1254 struct ll_inode_info *lli = ll_i2info(inode);
1255 struct cl_object *obj = lli->lli_clob;
1256 struct cl_attr *attr = vvp_env_thread_attr(env);
1264 ll_inode_size_lock(inode);
1266 /* Merge timestamps the most recently obtained from MDS with
1267 * timestamps obtained from OSTs.
1269 * Do not overwrite atime of inode because it may be refreshed
1270 * by file_accessed() function. If the read was served by cache
1271 * data, there is no RPC to be sent so that atime may not be
1272 * transferred to OSTs at all. MDT only updates atime at close time
1273 * if it's at least 'mdd.*.atime_diff' older.
1274 * All in all, the atime in Lustre does not strictly comply with
1275 * POSIX. Solving this problem needs to send an RPC to MDT for each
1276 * read, this will hurt performance.
1278 if (inode->i_atime.tv_sec < lli->lli_atime ||
1279 lli->lli_update_atime) {
1280 inode->i_atime.tv_sec = lli->lli_atime;
1281 lli->lli_update_atime = 0;
1283 inode->i_mtime.tv_sec = lli->lli_mtime;
1284 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Snapshot MDS-side timestamps before consulting OST attributes. */
1286 mtime = inode->i_mtime.tv_sec;
1287 atime = inode->i_atime.tv_sec;
1288 ctime = inode->i_ctime.tv_sec;
1290 cl_object_attr_lock(obj);
1291 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1294 rc = cl_object_attr_get(env, obj, attr);
1295 cl_object_attr_unlock(obj);
1298 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep whichever side has the newer timestamp. */
1300 if (atime < attr->cat_atime)
1301 atime = attr->cat_atime;
1303 if (ctime < attr->cat_ctime)
1304 ctime = attr->cat_ctime;
1306 if (mtime < attr->cat_mtime)
1307 mtime = attr->cat_mtime;
1309 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1310 PFID(&lli->lli_fid), attr->cat_size);
1312 i_size_write(inode, attr->cat_size);
1313 inode->i_blocks = attr->cat_blocks;
1315 inode->i_mtime.tv_sec = mtime;
1316 inode->i_atime.tv_sec = atime;
1317 inode->i_ctime.tv_sec = ctime;
1320 ll_inode_size_unlock(inode);
/*
 * Propagate the per-fd designated FLR mirror (set via lease ioctls) into
 * the cl_io about to be issued, along with the matching layout version.
 */
1326 * Set designated mirror for I/O.
1328 * So far only read, write, and truncated can support to issue I/O to
1329 * designated mirror.
1331 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1333 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1335 /* clear layout version for generic(non-resync) I/O in case it carries
1336 * stale layout version due to I/O restart */
1337 io->ci_layout_version = 0;
1339 /* FLR: disable non-delay for designated mirror I/O because obviously
1340 * only one mirror is available */
1341 if (fd->fd_designated_mirror > 0) {
1343 io->ci_designated_mirror = fd->fd_designated_mirror;
1344 io->ci_layout_version = fd->fd_layout_version;
1347 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1348 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether I/O through this file should skip atime updates, checking
 * the same flag sources as the kernel's file_accessed()/touch_atime().
 * NOTE(review): the "return true/false" lines between the checks are
 * elided in this extraction; visible code left byte-identical.
 */
1351 static bool file_is_noatime(const struct file *file)
1353 const struct vfsmount *mnt = file->f_path.mnt;
1354 const struct inode *inode = file_inode((struct file *)file);
1356 /* Adapted from file_accessed() and touch_atime().*/
1357 if (file->f_flags & O_NOATIME)
1360 if (inode->i_flags & S_NOATIME)
1363 if (IS_NOATIME(inode))
1366 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1369 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1372 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write on 'file': nonblock/append/sync
 * flags, lock policy (never for nolock files, mandatory for O_APPEND),
 * noatime, FLR non-delay (reads only), and the designated mirror.
 */
1378 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1380 struct inode *inode = file_inode(file);
1381 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1383 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1384 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1386 if (iot == CIT_WRITE) {
1387 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1388 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1389 file->f_flags & O_DIRECT ||
1392 io->ci_obj = ll_i2info(inode)->lli_clob;
1393 io->ci_lockreq = CILR_MAYBE;
1394 if (ll_file_nolock(file)) {
1395 io->ci_lockreq = CILR_NEVER;
1396 io->ci_no_srvlock = 1;
1397 } else if (file->f_flags & O_APPEND) {
/* Appends must take a mandatory lock to keep EOF consistent. */
1398 io->ci_lockreq = CILR_MANDATORY;
1400 io->ci_noatime = file_is_noatime(file);
1402 /* FLR: only use non-delay I/O for read as there is only one
1403 * avaliable mirror for write. */
1404 io->ci_ndelay = !(iot == CIT_WRITE);
1406 ll_io_set_mirror(io, file);
/*
 * Record one I/O sample plus the transferred byte count into the inode's
 * file-heat instances (read or write flavor), unless file heat is disabled
 * globally (sbi) or per-inode (LU_HEAT_FLAG_OFF).
 */
1409 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1412 struct ll_inode_info *lli = ll_i2info(inode);
1413 struct ll_sb_info *sbi = ll_i2sbi(inode);
1414 enum obd_heat_type sample_type;
1415 enum obd_heat_type iobyte_type;
1416 __u64 now = ktime_get_real_seconds();
1418 if (!ll_sbi_has_file_heat(sbi) ||
1419 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1422 if (iot == CIT_READ) {
1423 sample_type = OBD_HEAT_READSAMPLE;
1424 iobyte_type = OBD_HEAT_READBYTE;
1425 } else if (iot == CIT_WRITE) {
1426 sample_type = OBD_HEAT_WRITESAMPLE;
1427 iobyte_type = OBD_HEAT_WRITEBYTE;
/* One sample event, plus 'count' bytes, decayed per sbi tunables. */
1432 spin_lock(&lli->lli_heat_lock);
1433 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1434 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1435 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1436 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1437 spin_unlock(&lli->lli_heat_lock);
/*
 * Common engine for buffered/direct read, write, and splice I/O.
 *
 * Builds a cl_io, takes the per-file range lock where required (writes,
 * and direct-IO reads, unless group-locked — see LU-6227), runs the
 * cl_io loop, and restarts the whole I/O when the layout changed
 * mid-flight (ci_need_restart), preserving the FLR non-delay retry count.
 * Returns bytes transferred if any, else the error code.
 *
 * NOTE(review): this extraction elides a number of lines (labels, braces,
 * the restart 'goto', stats for failed writes); code left byte-identical.
 */
1441 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1442 struct file *file, enum cl_io_type iot,
1443 loff_t *ppos, size_t count)
1445 struct vvp_io *vio = vvp_env_io(env);
1446 struct inode *inode = file_inode(file);
1447 struct ll_inode_info *lli = ll_i2info(inode);
1448 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1449 struct range_lock range;
1453 unsigned retried = 0;
1454 bool restarted = false;
1458 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1459 file_dentry(file)->d_name.name,
1460 iot == CIT_READ ? "read" : "write", *ppos, count);
1463 io = vvp_env_thread_io(env);
1464 ll_io_init(io, file, iot);
1465 io->ci_ndelay_tried = retried;
1467 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1468 bool range_locked = false;
/* O_APPEND writes may extend the file: lock to EOF. */
1470 if (file->f_flags & O_APPEND)
1471 range_lock_init(&range, 0, LUSTRE_EOF);
1473 range_lock_init(&range, *ppos, *ppos + count - 1);
1475 vio->vui_fd = LUSTRE_FPRIVATE(file);
1476 vio->vui_io_subtype = args->via_io_subtype;
1478 switch (vio->vui_io_subtype) {
1480 vio->vui_iter = args->u.normal.via_iter;
1481 vio->vui_iocb = args->u.normal.via_iocb;
1482 /* Direct IO reads must also take range lock,
1483 * or multiple reads will try to work on the same pages
1484 * See LU-6227 for details. */
1485 if (((iot == CIT_WRITE) ||
1486 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1487 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1488 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1490 rc = range_lock(&lli->lli_write_tree, &range);
1494 range_locked = true;
1498 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1499 vio->u.splice.vui_flags = args->u.splice.via_flags;
1502 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1506 ll_cl_add(file, env, io, LCC_RW);
1507 rc = cl_io_loop(env, io);
1508 ll_cl_remove(file, env);
1511 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1513 range_unlock(&lli->lli_write_tree, &range);
1516 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress across restarts. */
1520 if (io->ci_nob > 0) {
1521 result += io->ci_nob;
1522 count -= io->ci_nob;
1523 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1525 /* prepare IO restart */
1526 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1527 args->u.normal.via_iter = vio->vui_iter;
1530 cl_io_fini(env, io);
1533 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1534 file->f_path.dentry->d_name.name,
1535 iot, rc, result, io->ci_need_restart);
1537 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1539 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1540 file_dentry(file)->d_name.name,
1541 iot == CIT_READ ? "read" : "write",
1542 *ppos, count, result, rc);
1543 /* preserve the tried count for FLR */
1544 retried = io->ci_ndelay_tried;
/* Tally stats and track write failure state per fd. */
1549 if (iot == CIT_READ) {
1551 ll_stats_ops_tally(ll_i2sbi(inode),
1552 LPROC_LL_READ_BYTES, result);
1553 } else if (iot == CIT_WRITE) {
1555 ll_stats_ops_tally(ll_i2sbi(inode),
1556 LPROC_LL_WRITE_BYTES, result);
1557 fd->fd_write_failed = false;
1558 } else if (result == 0 && rc == 0) {
1561 fd->fd_write_failed = true;
1563 fd->fd_write_failed = false;
1564 } else if (rc != -ERESTARTSYS) {
1565 fd->fd_write_failed = true;
1569 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1571 ll_heat_add(inode, iot, result);
1573 RETURN(result > 0 ? result : rc);
1577 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1578 * especially for small I/O.
1580 * To serve a read request, CLIO has to create and initialize a cl_io and
1581 * then request DLM lock. This has turned out to have siginificant overhead
1582 * and affects the performance of small I/O dramatically.
1584 * It's not necessary to create a cl_io for each I/O. Under the help of read
1585 * ahead, most of the pages being read are already in memory cache and we can
1586 * read those pages directly because if the pages exist, the corresponding DLM
1587 * lock must exist so that page content must be valid.
1589 * In fast read implementation, the llite speculatively finds and reads pages
1590 * in memory cache. There are three scenarios for fast read:
1591 * - If the page exists and is uptodate, kernel VM will provide the data and
1592 * CLIO won't be intervened;
1593 * - If the page was brought into memory by read ahead, it will be exported
1594 * and read ahead parameters will be updated;
1595 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1596 * it will go back and invoke normal read, i.e., a cl_io will be created
1597 * and DLM lock will be requested.
1599 * POSIX compliance: posix standard states that read is intended to be atomic.
1600 * Lustre read implementation is in line with Linux kernel read implementation
1601 * and neither of them complies with POSIX standard in this matter. Fast read
1602 * doesn't make the situation worse on single node but it may interleave write
1603 * results from multiple nodes due to short read handling in ll_file_aio_read().
1605 * \param env - lu_env
1606 * \param iocb - kiocb from kernel
1607 * \param iter - user space buffers where the data will be copied
1609 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Attempt a "fast read" straight from the page cache (see the big comment
 * above): skipped for direct I/O or when the sbi disables fast read;
 * -ENODATA from ll_readpage() means the page wasn't cached, so the caller
 * falls back to the normal cl_io read path.
 */
1612 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1616 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1619 /* NB: we can't do direct IO for fast read because it will need a lock
1620 * to make IO engine happy. */
1621 if (iocb->ki_filp->f_flags & O_DIRECT)
1624 result = generic_file_read_iter(iocb, iter);
1626 /* If the first page is not in cache, generic_file_aio_read() will be
1627 * returned with -ENODATA.
1628 * See corresponding code in ll_readpage(). */
1629 if (result == -ENODATA)
/* Successful fast read still counts toward heat and read stats. */
1633 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1634 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1635 LPROC_LL_READ_BYTES, result);
1642 * Read from a file (through the page cache).
/*
 * read_iter file operation: try PCC first, then fast read, then fall back
 * to the generic cl_io read path for whatever the fast read left over.
 */
1644 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1647 struct vvp_io_args *args;
1654 * Currently when PCC read failed, we do not fall back to the
1655 * normal read path, just return the error.
1656 * The reason is that: for RW-PCC, the file data may be modified
1657 * in the PCC and inconsistent with the data on OSTs (or file
1658 * data has been removed from the Lustre file system), at this
1659 * time, fallback to the normal read path may read the wrong
1661 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1662 * path: read data from data copy on OSTs.
1664 result = pcc_file_read_iter(iocb, to, &cached);
1668 ll_ras_enter(iocb->ki_filp);
/* Fast read may satisfy all or part of the request from page cache. */
1670 result = ll_do_fast_read(iocb, to);
1671 if (result < 0 || iov_iter_count(to) == 0)
1674 env = cl_env_get(&refcheck);
1676 return PTR_ERR(env);
1678 args = ll_env_args(env, IO_NORMAL);
1679 args->u.normal.via_iter = to;
1680 args->u.normal.via_iocb = iocb;
1682 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1683 &iocb->ki_pos, iov_iter_count(to));
1686 else if (result == 0)
1689 cl_env_put(env, &refcheck);
1695 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1696 * If a page is already in the page cache and dirty (and some other things -
1697 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1698 * write to it without doing a full I/O, because Lustre already knows about it
1699 * and will write it out. This saves a lot of processing time.
1701 * All writes here are within one page, so exclusion is handled by the page
1702 * lock on the vm page. We do not do tiny writes for writes which touch
1703 * multiple pages because it's very unlikely multiple sequential pages are
1704 * are already dirty.
1706 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1707 * and are unlikely to be to already dirty pages.
1709 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * Attempt a "tiny write" (see comment above): sub-page writes to an
 * already-dirty cached page go through __generic_file_write_iter(),
 * avoiding a full cl_io.  -ENODATA from ll_tiny_write_begin means the
 * page was not dirty and the caller falls back to the normal write path.
 */
1711 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1713 ssize_t count = iov_iter_count(iter);
1714 struct file *file = iocb->ki_filp;
1715 struct inode *inode = file_inode(file);
/* i_mutex is only needed when security checks may apply (not NOSEC). */
1716 bool lock_inode = !IS_NOSEC(inode);
1721 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1722 * of function for why.
1724 if (count >= PAGE_SIZE ||
1725 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1728 if (unlikely(lock_inode))
1730 result = __generic_file_write_iter(iocb, iter);
1732 if (unlikely(lock_inode))
1733 inode_unlock(inode);
1735 /* If the page is not already dirty, ll_tiny_write_begin returns
1736 * -ENODATA. We continue on to normal write.
1738 if (result == -ENODATA)
1742 ll_heat_add(inode, CIT_WRITE, result);
1743 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1745 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1748 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1754 * Write to a file (through the page cache).
/*
 * write_iter file operation: try PCC, then the tiny-write fast path, then
 * the generic cl_io write path; bytes from tiny and normal writes are
 * combined on success.
 */
1756 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1758 struct vvp_io_args *args;
1760 ssize_t rc_tiny = 0, rc_normal;
1768 * When PCC write failed, we usually do not fall back to the normal
1769 * write path, just return the error. But there is a special case when
1770 * returned error code is -ENOSPC due to running out of space on PCC HSM
1771 * backend. At this time, it will fall back to normal I/O path and
1772 * retry the I/O. As the file is in HSM released state, it will restore
1773 * the file data to OSTs first and redo the write again. And the
1774 * restore process will revoke the layout lock and detach the file
1775 * from PCC cache automatically.
1777 result = pcc_file_write_iter(iocb, from, &cached);
1778 if (cached && result != -ENOSPC)
1781 /* NB: we can't do direct IO for tiny writes because they use the page
1782 * cache, we can't do sync writes because tiny writes can't flush
1783 * pages, and we can't do append writes because we can't guarantee the
1784 * required DLM locks are held to protect file size.
1786 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1787 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1788 rc_tiny = ll_do_tiny_write(iocb, from);
1790 /* In case of error, go on and try normal write - Only stop if tiny
1791 * write completed I/O.
1793 if (iov_iter_count(from) == 0)
1794 GOTO(out, rc_normal = rc_tiny);
1796 env = cl_env_get(&refcheck);
1798 return PTR_ERR(env);
1800 args = ll_env_args(env, IO_NORMAL);
1801 args->u.normal.via_iter = from;
1802 args->u.normal.via_iocb = iocb;
1804 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1805 &iocb->ki_pos, iov_iter_count(from));
1807 /* On success, combine bytes written. */
1808 if (rc_tiny >= 0 && rc_normal > 0)
1809 rc_normal += rc_tiny;
1810 /* On error, only return error from normal write if tiny write did not
1811 * write any bytes. Otherwise return bytes written by tiny write.
1813 else if (rc_tiny > 0)
1814 rc_normal = rc_tiny;
1816 cl_env_put(env, &refcheck);
1821 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1823 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, mirroring
 * the kernel's __generic_file_aio_write_nolock() (legacy pre-iov_iter
 * kernels only).  Rejects negative/wrapping cumulative lengths.
 * NOTE(review): several lines (the continue/shortening on a bad segment,
 * final count assignment) are elided in this extraction.
 */
2325 static int ll_file_get_iov_count(const struct iovec *iov,
2326 unsigned long *nr_segs, size_t *count)
2331 for (seg = 0; seg < *nr_segs; seg++) {
2332 const struct iovec *iv = &iov[seg];
2335 * If any segment has a negative length, or the cumulative
2336 * length ever wraps negative then return -EINVAL.
2339 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* Legacy access_ok() took a VERIFY_READ/WRITE type argument. */
2341 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2846 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry point (kernels without read_write_iter fops):
 * validate the iovec, build an iov_iter, and delegate to
 * ll_file_read_iter().
 */
1853 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1854 unsigned long nr_segs, loff_t pos)
1861 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1865 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1866 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1867 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1868 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1869 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1871 result = ll_file_read_iter(iocb, &to);
1876 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1879 struct iovec iov = { .iov_base = buf, .iov_len = count };
1884 init_sync_kiocb(&kiocb, file);
1885 kiocb.ki_pos = *ppos;
1886 #ifdef HAVE_KIOCB_KI_LEFT
1887 kiocb.ki_left = count;
1888 #elif defined(HAVE_KI_NBYTES)
1889 kiocb.i_nbytes = count;
1892 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1893 *ppos = kiocb.ki_pos;
/*
 * Legacy aio_write entry point (kernels without read_write_iter fops):
 * validate the iovec, build a WRITE iov_iter, and delegate to
 * ll_file_write_iter().
 */
1899 * Write to a file (through the page cache).
1902 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1903 unsigned long nr_segs, loff_t pos)
1905 struct iov_iter from;
1910 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1914 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1915 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1916 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1917 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1918 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1920 result = ll_file_write_iter(iocb, &from);
/*
 * Sync write(2) entry point on legacy kernels: wrap the user buffer in a
 * one-entry iovec, set up a synchronous kiocb, and delegate to
 * ll_file_aio_write(); *ppos is advanced on return.
 */
1925 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1926 size_t count, loff_t *ppos)
1928 struct iovec iov = { .iov_base = (void __user *)buf,
1935 init_sync_kiocb(&kiocb, file);
1936 kiocb.ki_pos = *ppos;
1937 #ifdef HAVE_KIOCB_KI_LEFT
1938 kiocb.ki_left = count;
1939 #elif defined(HAVE_KI_NBYTES)
1940 kiocb.ki_nbytes = count;
1943 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1944 *ppos = kiocb.ki_pos;
1948 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/*
 * splice_read file operation: try PCC first, then run the generic
 * cl_io read path with IO_SPLICE args targeting the pipe.
 */
1951 * Send file content (through pagecache) somewhere with helper
1953 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1954 struct pipe_inode_info *pipe, size_t count,
1958 struct vvp_io_args *args;
1965 result = pcc_file_splice_read(in_file, ppos, pipe,
1966 count, flags, &cached);
1970 ll_ras_enter(in_file);
1972 env = cl_env_get(&refcheck);
1974 RETURN(PTR_ERR(env));
1976 args = ll_env_args(env, IO_SPLICE);
1977 args->u.splice.via_pipe = pipe;
1978 args->u.splice.via_flags = flags;
1980 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1981 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file by re-opening it by FID with the given
 * lov_user_md, then releasing the open handle; performed under the inode
 * size lock.
 */
1985 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1986 __u64 flags, struct lov_user_md *lum, int lum_size)
1988 struct lookup_intent oit = {
1990 .it_flags = flags | MDS_OPEN_BY_FID,
1995 ll_inode_size_lock(inode);
1996 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1998 GOTO(out_unlock, rc);
2000 ll_release_openhandle(dentry, &oit);
2003 ll_inode_size_unlock(inode);
2004 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping) of 'filename' (child of 'inode') from the
 * MDS via md_getattr_name(), validate the magic, and byte-swap the
 * layout to host endianness on big-endian clients.  On success *lmmp
 * points into the reply buffer; the caller keeps *request alive until it
 * is done with the layout.
 */
2009 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2010 struct lov_mds_md **lmmp, int *lmm_size,
2011 struct ptlrpc_request **request)
2013 struct ll_sb_info *sbi = ll_i2sbi(inode);
2014 struct mdt_body *body;
2015 struct lov_mds_md *lmm = NULL;
2016 struct ptlrpc_request *req = NULL;
2017 struct md_op_data *op_data;
2020 rc = ll_get_default_mdsize(sbi, &lmmsize);
2024 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2025 strlen(filename), lmmsize,
2026 LUSTRE_OPC_ANY, NULL);
2027 if (IS_ERR(op_data))
2028 RETURN(PTR_ERR(op_data));
2030 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2031 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2032 ll_finish_md_op_data(op_data);
2034 CDEBUG(D_INFO, "md_getattr_name failed "
2035 "on %s: rc %d\n", filename, rc);
2039 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2040 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2042 lmmsize = body->mbo_eadatasize;
2044 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2046 GOTO(out, rc = -ENODATA);
2049 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2050 LASSERT(lmm != NULL);
/* Only V1/V3/composite/foreign layouts are understood here. */
2052 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2053 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2054 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2055 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2056 GOTO(out, rc = -EPROTO);
2059 * This is coming from the MDS, so is probably in
2060 * little endian. We convert it to host endian before
2061 * passing it to userspace.
/* Swab only when the host is big-endian (LOV_MAGIC differs from LE). */
2063 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2066 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2067 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2068 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2069 if (le32_to_cpu(lmm->lmm_pattern) &
2070 LOV_PATTERN_F_RELEASED)
2074 /* if function called for directory - we should
2075 * avoid swab not existent lsm objects */
2076 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2077 lustre_swab_lov_user_md_v1(
2078 (struct lov_user_md_v1 *)lmm);
2079 if (S_ISREG(body->mbo_mode))
2080 lustre_swab_lov_user_md_objects(
2081 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2083 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2084 lustre_swab_lov_user_md_v3(
2085 (struct lov_user_md_v3 *)lmm);
2086 if (S_ISREG(body->mbo_mode))
2087 lustre_swab_lov_user_md_objects(
2088 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2090 } else if (lmm->lmm_magic ==
2091 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2092 lustre_swab_lov_comp_md_v1(
2093 (struct lov_comp_md_v1 *)lmm);
2094 } else if (lmm->lmm_magic ==
2095 cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2096 struct lov_foreign_md *lfm;
2098 lfm = (struct lov_foreign_md *)lmm;
2099 __swab32s(&lfm->lfm_magic);
2100 __swab32s(&lfm->lfm_length);
2101 __swab32s(&lfm->lfm_type);
2102 __swab32s(&lfm->lfm_flags);
2108 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: admin-only (CAP_SYS_ADMIN) path that copies a
 * lov_user_md with one OST object entry from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
2113 static int ll_lov_setea(struct inode *inode, struct file *file,
2116 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2117 struct lov_user_md *lump;
2118 int lum_size = sizeof(struct lov_user_md) +
2119 sizeof(struct lov_user_ost_data);
2123 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2126 OBD_ALLOC_LARGE(lump, lum_size);
2130 if (copy_from_user(lump, arg, lum_size))
2131 GOTO(out_lump, rc = -EFAULT);
2133 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2135 cl_lov_delay_create_clear(&file->f_flags);
2138 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer 'lum'
 * (size-limited by 'size') via cl_object_getstripe().
 */
2142 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2149 env = cl_env_get(&refcheck);
2151 RETURN(PTR_ERR(env));
2153 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2154 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user layout, apply it, refresh the
 * layout generation, and echo the resulting striping back to userspace.
 */
2158 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2161 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2162 struct lov_user_md *klum;
2164 __u64 flags = FMODE_WRITE;
2167 rc = ll_copy_user_md(lum, &klum);
2172 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero stripe_count first so old tools see a defined value on error. */
2177 rc = put_user(0, &lum->lmm_stripe_count);
2181 rc = ll_layout_refresh(inode, &gen);
2185 rc = ll_file_getstripe(inode, arg, lum_size);
2187 cl_lov_delay_create_clear(&file->f_flags);
2190 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group lock with group id 'arg' on behalf of
 * this fd.  For PFL layouts all OST objects are instantiated first (via a
 * write layout intent) so the group lock covers them all.  Races between
 * two threads on the same fd are resolved under lli_lock.
 */
2195 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2197 struct ll_inode_info *lli = ll_i2info(inode);
2198 struct cl_object *obj = lli->lli_clob;
2199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2200 struct ll_grouplock grouplock;
2205 CWARN("group id for group lock must not be 0\n");
2209 if (ll_file_nolock(file))
2210 RETURN(-EOPNOTSUPP);
2212 spin_lock(&lli->lli_lock);
2213 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2214 CWARN("group lock already existed with gid %lu\n",
2215 fd->fd_grouplock.lg_gid);
2216 spin_unlock(&lli->lli_lock);
2219 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2220 spin_unlock(&lli->lli_lock);
2223 * XXX: group lock needs to protect all OST objects while PFL
2224 * can add new OST objects during the IO, so we'd instantiate
2225 * all OST objects before getting its group lock.
2230 struct cl_layout cl = {
2231 .cl_is_composite = false,
2233 struct lu_extent ext = {
2235 .e_end = OBD_OBJECT_EOF,
2238 env = cl_env_get(&refcheck);
2240 RETURN(PTR_ERR(env));
2242 rc = cl_object_layout_get(env, obj, &cl);
2243 if (!rc && cl.cl_is_composite)
2244 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2247 cl_env_put(env, &refcheck);
2252 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2253 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under lli_lock: another thread may have won the race. */
2257 spin_lock(&lli->lli_lock);
2258 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2259 spin_unlock(&lli->lli_lock);
2260 CERROR("another thread just won the race\n");
2261 cl_put_grouplock(&grouplock);
2265 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2266 fd->fd_grouplock = grouplock;
2267 spin_unlock(&lli->lli_lock);
2269 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held by this fd, verifying
 * that one is held and that the caller-supplied gid matches; fd state is
 * cleared under lli_lock before the DLM lock is dropped.
 */
2273 static int ll_put_grouplock(struct inode *inode, struct file *file,
2276 struct ll_inode_info *lli = ll_i2info(inode);
2277 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2278 struct ll_grouplock grouplock;
2281 spin_lock(&lli->lli_lock);
2282 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2283 spin_unlock(&lli->lli_lock);
2284 CWARN("no group lock held\n");
2288 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2290 if (fd->fd_grouplock.lg_gid != arg) {
2291 CWARN("group lock %lu doesn't match current id %lu\n",
2292 arg, fd->fd_grouplock.lg_gid);
2293 spin_unlock(&lli->lli_lock);
/* Detach the grouplock from the fd before releasing it. */
2297 grouplock = fd->fd_grouplock;
2298 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2299 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2300 spin_unlock(&lli->lli_lock);
2302 cl_put_grouplock(&grouplock);
2303 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent: allocate a
 * temporary obd_client_handle from the intent and close it, then drop
 * the open-reference on the intent request if one was taken.
 */
2308 * Close inode open handle
2310 * \param dentry [in] dentry which contains the inode
2311 * \param it [in,out] intent which contains open info and result
2314 * \retval <0 failure
2316 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2318 struct inode *inode = dentry->d_inode;
2319 struct obd_client_handle *och;
2325 /* Root ? Do nothing. */
2326 if (dentry->d_inode->i_sb->s_root == dentry)
2329 /* No open handle to close? Move away */
2330 if (!it_disposition(it, DISP_OPEN_OPEN))
2333 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2335 OBD_ALLOC(och, sizeof(*och));
2337 GOTO(out, rc = -ENOMEM);
2339 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2341 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2343 /* this one is in place of ll_file_open */
2344 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2345 ptlrpc_req_finished(it->it_request);
2346 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Service a FIEMAP request: validate flags, optionally flush dirty data
 * (FIEMAP_FLAG_SYNC), glimpse size if unknown, and query extent mapping
 * from the OSTs through cl_object_fiemap().
 */
2352 * Get size for inode for which FIEMAP mapping is requested.
2353 * Make the FIEMAP get_info call and returns the result.
2354 * \param fiemap kernel buffer to hold extents
2355 * \param num_bytes kernel buffer size
2357 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2363 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2366 /* Checks for fiemap flags */
2367 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report the unsupported flags back to the caller. */
2368 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2372 /* Check for FIEMAP_FLAG_SYNC */
2373 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2374 rc = filemap_fdatawrite(inode->i_mapping);
2379 env = cl_env_get(&refcheck);
2381 RETURN(PTR_ERR(env));
2383 if (i_size_read(inode) == 0) {
2384 rc = ll_glimpse_size(inode);
2389 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2390 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2391 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2393 /* If filesize is 0, then there would be no objects for mapping */
2394 if (fmkey.lfik_oa.o_size == 0) {
2395 fiemap->fm_mapped_extents = 0;
2399 fmkey.lfik_fiemap = *fiemap;
2401 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2402 &fmkey, fiemap, &num_bytes);
2404 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname via the MDC.  The output
 * buffer is sized from the user-supplied gf_pathlen (capped at PATH_MAX),
 * and the filesystem root FID is appended so the MDT can resolve paths
 * inside filesets.
 */
2408 int ll_fid2path(struct inode *inode, void __user *arg)
2410 struct obd_export *exp = ll_i2mdexp(inode);
2411 const struct getinfo_fid2path __user *gfin = arg;
2413 struct getinfo_fid2path *gfout;
/* Restricted to admins unless the user_fid2path mount flag is set. */
2419 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2420 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2423 /* Only need to get the buflen */
2424 if (get_user(pathlen, &gfin->gf_pathlen))
2427 if (pathlen > PATH_MAX)
2430 outsize = sizeof(*gfout) + pathlen;
2431 OBD_ALLOC(gfout, outsize);
2435 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2436 GOTO(gf_free, rc = -EFAULT);
2437 /* append root FID after gfout to let MDT know the root FID so that it
2438 * can lookup the correct path, this is mainly for fileset.
2439 * old server without fileset mount support will ignore this. */
2440 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2442 /* Call mdc_iocontrol */
2443 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2447 if (copy_to_user(arg, gfout, outsize))
2451 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version and
 * layout version from the OSTs, honoring ioc->idv_flags (flush policy).
 * Files without an initialized cl_object report version 0.  The io is
 * retried when a layout change forces a restart.
 */
2456 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2458 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2466 ioc->idv_version = 0;
2467 ioc->idv_layout_version = UINT_MAX;
2469 /* If no file object initialized, we consider its version is 0. */
2473 env = cl_env_get(&refcheck);
2475 RETURN(PTR_ERR(env));
2477 io = vvp_env_thread_io(env);
2479 io->u.ci_data_version.dv_data_version = 0;
2480 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2481 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2484 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2485 result = cl_io_loop(env, io);
2487 result = io->ci_result;
2489 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2490 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2492 cl_io_fini(env, io);
/* Layout changed under us: redo the whole data-version io. */
2494 if (unlikely(io->ci_need_restart))
2497 cl_env_put(env, &refcheck);
/*
 * Convenience wrapper around ll_ioc_data_version(): return just the data
 * version for 'inode' with the given flush flags.
 */
2503 * Read the data_version for inode.
2505 * This value is computed using stripe object version on OST.
2506 * Version is computed using server side locking.
2508 * @param flags if do sync on the OST side;
2510 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2511 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2513 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2515 struct ioc_data_version ioc = { .idv_flags = flags };
2518 rc = ll_ioc_data_version(inode, &ioc);
2520 *data_version = ioc.idv_version;
/*
 * HSM release: take a write lease, flush and grab the latest data
 * version and merged attributes, then close with MDS_HSM_RELEASE so the
 * MDT can drop the OST objects.  On any failure path the lease is closed
 * without the release intent.
 */
2526 * Trigger a HSM release request for the provided inode.
2528 int ll_hsm_release(struct inode *inode)
2531 struct obd_client_handle *och = NULL;
2532 __u64 data_version = 0;
2537 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2538 ll_i2sbi(inode)->ll_fsname,
2539 PFID(&ll_i2info(inode)->lli_fid));
2541 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2543 GOTO(out, rc = PTR_ERR(och));
2545 /* Grab latest data_version and [am]time values */
2546 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2550 env = cl_env_get(&refcheck);
2552 GOTO(out, rc = PTR_ERR(env));
2554 rc = ll_merge_attr(env, inode);
2555 cl_env_put(env, &refcheck);
2557 /* If error happen, we have the wrong size for a file.
2563 /* Release the file.
2564 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2565 * we still need it to pack l_remote_handle to MDT. */
2566 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2572 if (och != NULL && !IS_ERR(och)) /* close the file */
2573 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(); holds the two inodes (and, per the
 * original file, their data versions / check flags — those field lines are
 * missing from this extraction). */
2578 struct ll_swap_stack {
2581 struct inode *inode1;
2582 struct inode *inode2;
/* Swap the layouts of two files. Validates the pair, optionally takes group
 * locks to flush dirty cache, optionally verifies data versions have not
 * changed, then sends LL_IOC_LOV_SWAP_LAYOUTS to the MDT. */
2587 static int ll_swap_layouts(struct file *file1, struct file *file2,
2588 struct lustre_swap_layouts *lsl)
2590 struct mdc_swap_layouts msl;
2591 struct md_op_data *op_data;
2594 struct ll_swap_stack *llss = NULL;
2597 OBD_ALLOC_PTR(llss);
2601 llss->inode1 = file_inode(file1);
2602 llss->inode2 = file_inode(file2);
2604 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2608 /* we use 2 bool because it is easier to swap than 2 bits */
2609 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2610 llss->check_dv1 = true;
2612 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2613 llss->check_dv2 = true;
2615 /* we cannot use lsl->sl_dvX directly because we may swap them */
2616 llss->dv1 = lsl->sl_dv1;
2617 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID to get a global locking order and avoid deadlock
 * when two swaps run concurrently on the same files. */
2619 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2620 if (rc == 0) /* same file, done! */
2623 if (rc < 0) { /* sequentialize it */
2624 swap(llss->inode1, llss->inode2);
2626 swap(llss->dv1, llss->dv2);
2627 swap(llss->check_dv1, llss->check_dv2);
2631 if (gid != 0) { /* application asks to flush dirty cache */
2632 rc = ll_get_grouplock(llss->inode1, file1, gid);
2636 rc = ll_get_grouplock(llss->inode2, file2, gid);
2638 ll_put_grouplock(llss->inode1, file1, gid);
2643 /* ultimate check, before swapping the layouts we check if
2644 * dataversion has changed (if requested) */
2645 if (llss->check_dv1) {
2646 rc = ll_data_version(llss->inode1, &dv, 0);
2649 if (dv != llss->dv1)
2650 GOTO(putgl, rc = -EAGAIN);
2653 if (llss->check_dv2) {
2654 rc = ll_data_version(llss->inode2, &dv, 0);
2657 if (dv != llss->dv2)
2658 GOTO(putgl, rc = -EAGAIN);
2661 /* struct md_op_data is used to send the swap args to the mdt
2662 * only flags is missing, so we use struct mdc_swap_layouts
2663 * through the md_op_data->op_data */
2664 /* flags from user space have to be converted before they are sent to
2665 * the server, no flag is sent today, they are only used on the client */
2668 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2669 0, LUSTRE_OPC_ANY, &msl);
2670 if (IS_ERR(op_data))
2671 GOTO(free, rc = PTR_ERR(op_data));
2673 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2674 sizeof(*op_data), op_data, NULL);
2675 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2682 ll_put_grouplock(llss->inode2, file2, gid);
2683 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode after validating the masks and the
 * archive id, then forward the request to the MDT via obd_iocontrol(). */
2693 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2695 struct obd_export *exp = ll_i2mdexp(inode);
2696 struct md_op_data *op_data;
2700 /* Detect out-of-range masks */
2701 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2704 /* Non-root users are forbidden to set or clear flags which are
2705 * NOT defined in HSM_USER_MASK. */
2706 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2707 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* Only enforce the 32-bit archive-id limit against servers that do not
 * support the archive-id array connect flag. */
2710 if (!exp_connect_archive_id_array(exp)) {
2711 /* Detect out-of-range archive id */
2712 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2713 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2717 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2718 LUSTRE_OPC_ANY, hss);
2719 if (IS_ERR(op_data))
2720 RETURN(PTR_ERR(op_data));
2722 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2725 ll_finish_md_op_data(op_data);
/* Import an already-archived file into HSM: mark it ARCHIVED|EXISTS|RELEASED
 * via ll_hsm_state_set(), then restore the user-supplied attributes (mode,
 * uid/gid, size, [am]time) with ll_setattr_raw(). Regular files only. */
2730 static int ll_hsm_import(struct inode *inode, struct file *file,
2731 struct hsm_user_import *hui)
2733 struct hsm_state_set *hss = NULL;
2734 struct iattr *attr = NULL;
2738 if (!S_ISREG(inode->i_mode))
2744 GOTO(out, rc = -ENOMEM);
2746 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2747 hss->hss_archive_id = hui->hui_archive_id;
2748 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2749 rc = ll_hsm_state_set(inode, hss);
2753 OBD_ALLOC_PTR(attr);
2755 GOTO(out, rc = -ENOMEM);
/* Only permission bits from userspace; force the regular-file type bit. */
2757 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2758 attr->ia_mode |= S_IFREG;
2759 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2760 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2761 attr->ia_size = hui->hui_size;
2762 attr->ia_mtime.tv_sec = hui->hui_mtime;
2763 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2764 attr->ia_atime.tv_sec = hui->hui_atime;
2765 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2767 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2768 ATTR_UID | ATTR_GID |
2769 ATTR_MTIME | ATTR_MTIME_SET |
2770 ATTR_ATIME | ATTR_ATIME_SET;
2774 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2778 inode_unlock(inode);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bitmask reported to users. */
2790 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2792 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2793 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* Set atime/mtime/ctime on a regular file from an LL_IOC_FUTIMES_3 request.
 * Requires CAP_SYS_ADMIN because ctime is normally kernel-controlled. */
2796 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2798 struct inode *inode = file_inode(file);
2800 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2801 ATTR_MTIME | ATTR_MTIME_SET |
2804 .tv_sec = lfu->lfu_atime_sec,
2805 .tv_nsec = lfu->lfu_atime_nsec,
2808 .tv_sec = lfu->lfu_mtime_sec,
2809 .tv_nsec = lfu->lfu_mtime_nsec,
2812 .tv_sec = lfu->lfu_ctime_sec,
2813 .tv_nsec = lfu->lfu_ctime_nsec,
2819 if (!capable(CAP_SYS_ADMIN))
2822 if (!S_ISREG(inode->i_mode))
/* OP_XVALID_CTIME_SET lets the explicit ctime pass through setattr. */
2826 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2828 inode_unlock(inode);
/* Translate a userspace lockahead mode (MODE_*_USER) to the kernel
 * cl_lock_mode (presumably CLM_READ/CLM_WRITE — result lines are missing
 * from this extraction; confirm against the full file). */
2833 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2836 case MODE_READ_USER:
2838 case MODE_WRITE_USER:
/* Printable names for the userspace lock modes, used in debug messages. */
2845 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2847 /* Used to allow the upper layers of the client to request an LDLM lock
2848 * without doing an actual read or write.
2850 * Used for ladvise lockahead to manually request specific locks.
2852 * \param[in] file file this ladvise lock request is on
2853 * \param[in] ladvise ladvise struct describing this lock request
2855 * \retval 0 success, no detailed result available (sync requests
2856 * and requests sent to the server [not handled locally]
2857 * cannot return detailed results)
2858 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2859 * see definitions for details.
2860 * \retval negative negative errno on error
2862 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2864 struct lu_env *env = NULL;
2865 struct cl_io *io = NULL;
2866 struct cl_lock *lock = NULL;
2867 struct cl_lock_descr *descr = NULL;
2868 struct dentry *dentry = file->f_path.dentry;
2869 struct inode *inode = dentry->d_inode;
2870 enum cl_lock_mode cl_mode;
2871 off_t start = ladvise->lla_start;
2872 off_t end = ladvise->lla_end;
2878 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2879 "start=%llu, end=%llu\n", dentry->d_name.len,
2880 dentry->d_name.name, dentry->d_inode,
2881 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2884 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2886 GOTO(out, result = cl_mode);
2888 /* Get IO environment */
2889 result = cl_io_get(inode, &env, &io, &refcheck);
2893 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2896 * nothing to do for this io. This currently happens when
2897 * stripe sub-object's are not yet created.
2899 result = io->ci_result;
2900 } else if (result == 0) {
2901 lock = vvp_env_lock(env);
2902 descr = &lock->cll_descr;
2904 descr->cld_obj = io->ci_obj;
2905 /* Convert byte offsets to pages */
2906 descr->cld_start = cl_index(io->ci_obj, start);
2907 descr->cld_end = cl_index(io->ci_obj, end);
2908 descr->cld_mode = cl_mode;
2909 /* CEF_MUST is used because we do not want to convert a
2910 * lockahead request to a lockless lock */
2911 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests a speculative (non-blocking, glimpse-less) enqueue. */
2914 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2915 descr->cld_enq_flags |= CEF_SPECULATIVE;
2917 result = cl_lock_request(env, io, lock);
2919 /* On success, we need to release the lock */
2921 cl_lock_release(env, lock);
2923 cl_io_fini(env, io);
2924 cl_env_put(env, &refcheck);
2926 /* -ECANCELED indicates a matching lock with a different extent
2927 * was already present, and -EEXIST indicates a matching lock
2928 * on exactly the same extent was already present.
2929 * We convert them to positive values for userspace to make
2930 * recognizing true errors easier.
2931 * Note we can only return these detailed results on async requests,
2932 * as sync requests look the same as i/o requests for locking. */
2933 if (result == -ECANCELED)
2934 result = LLA_RESULT_DIFFERENT;
2935 else if (result == -EEXIST)
2936 result = LLA_RESULT_SAME;
/* Printable names for lu_ladvise_type values, used in sanity-check logs. */
2941 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate a single ladvise entry: known advice type, per-advice flag mask,
 * and (for range-based advices) a sane [start, end) extent. Logs and returns
 * a negative errno on any violation. */
2943 static int ll_ladvise_sanity(struct inode *inode,
2944 struct llapi_lu_ladvise *ladvise)
2946 struct ll_sb_info *sbi = ll_i2sbi(inode);
2947 enum lu_ladvise_type advice = ladvise->lla_advice;
2948 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2949 * be in the first 32 bits of enum ladvise_flags */
2950 __u32 flags = ladvise->lla_peradvice_flags;
2951 /* 3 lines at 80 characters per line, should be plenty */
2954 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2956 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2957 "last supported advice is %s (value '%d'): rc = %d\n",
2958 sbi->ll_fsname, advice,
2959 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2963 /* Per-advice checks */
2965 case LU_LADVISE_LOCKNOEXPAND:
2966 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2968 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2969 "rc = %d\n", sbi->ll_fsname, flags,
2970 ladvise_names[advice], rc);
2974 case LU_LADVISE_LOCKAHEAD:
2975 /* Currently only READ and WRITE modes can be requested */
2976 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2977 ladvise->lla_lockahead_mode == 0) {
2979 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2980 "rc = %d\n", sbi->ll_fsname,
2981 ladvise->lla_lockahead_mode,
2982 ladvise_names[advice], rc);
2985 case LU_LADVISE_WILLREAD:
2986 case LU_LADVISE_DONTNEED:
2988 /* Note fall through above - These checks apply to all advices
2989 * except LOCKNOEXPAND */
2990 if (flags & ~LF_DEFAULT_MASK) {
2992 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2993 "rc = %d\n", sbi->ll_fsname, flags,
2994 ladvise_names[advice], rc);
2997 if (ladvise->lla_start >= ladvise->lla_end) {
2999 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3000 "for %s: rc = %d\n", sbi->ll_fsname,
3001 ladvise->lla_start, ladvise->lla_end,
3002 ladvise_names[advice], rc);
3014 * Give file access advices
3016 * The ladvise interface is similar to Linux fadvise() system call, except it
3017 * forwards the advices directly from Lustre client to server. The server side
3018 * codes will apply appropriate read-ahead and caching techniques for the
3019 * corresponding files.
3021 * A typical workload for ladvise is e.g. a bunch of different clients are
3022 * doing small random reads of a file, so prefetching pages into OSS cache
3023 * with big linear reads before the random IO is a net benefit. Fetching
3024 * all that data into each client cache with fadvise() may not be, due to
3025 * much more data being sent to the client.
3027 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3028 struct llapi_lu_ladvise *ladvise)
3032 struct cl_ladvise_io *lio;
3037 env = cl_env_get(&refcheck);
3039 RETURN(PTR_ERR(env));
3041 io = vvp_env_thread_io(env);
3042 io->ci_obj = ll_i2info(inode)->lli_clob;
3044 /* initialize parameters for ladvise */
3045 lio = &io->u.ci_ladvise;
3046 lio->li_start = ladvise->lla_start;
3047 lio->li_end = ladvise->lla_end;
3048 lio->li_fid = ll_inode2fid(inode);
3049 lio->li_advice = ladvise->lla_advice;
3050 lio->li_flags = flags;
/* Run the CIT_LADVISE io through the cl_io state machine. */
3052 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3053 rc = cl_io_loop(env, io);
3057 cl_io_fini(env, io);
3058 cl_env_put(env, &refcheck);
/* Enable/disable DLM lock expansion for this file descriptor; LF_UNSET
 * clears the no-expand flag, any other value sets it. */
3062 static int ll_lock_noexpand(struct file *file, int flags)
3064 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3066 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: report inode flags (as FS_XFLAG_*) and
 * project id to userspace through struct fsxattr. */
3071 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3074 struct fsxattr fsxattr;
3076 if (copy_from_user(&fsxattr,
3077 (const struct fsxattr __user *)arg,
3081 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3082 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3083 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3084 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3085 if (copy_to_user((struct fsxattr __user *)arg,
3086 &fsxattr, sizeof(fsxattr)))
/* Permission check for project-quota changes: outside the init user
 * namespace, the project id and PROJINHERIT flag must not change. */
3092 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3095 * Project Quota ID state is only allowed to change from within the init
3096 * namespace. Enforce that restriction only if we are trying to change
3097 * the quota ID state. Everything else is allowed in user namespaces.
3099 if (current_user_ns() == &init_user_ns)
3102 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3105 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3106 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3109 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/* FS_IOC_FSSETXATTR-style handler: validate the project change, apply the
 * new flags/projid on the MDT via md_setattr(), then propagate the flags to
 * the OST objects with cl_setattr_ost(). */
3116 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3120 struct md_op_data *op_data;
3121 struct ptlrpc_request *req = NULL;
3123 struct fsxattr fsxattr;
3124 struct cl_object *obj;
3128 if (copy_from_user(&fsxattr,
3129 (const struct fsxattr __user *)arg,
3133 rc = ll_ioctl_check_project(inode, &fsxattr);
3137 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3138 LUSTRE_OPC_ANY, NULL);
3139 if (IS_ERR(op_data))
3140 RETURN(PTR_ERR(op_data));
3142 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3143 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3144 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3145 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3146 op_data->op_projid = fsxattr.fsx_projid;
3147 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3148 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3150 ptlrpc_req_finished(req);
3152 GOTO(out_fsxattr, rc);
3153 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* No cl_object means no OST objects to update; done after MDT setattr. */
3154 obj = ll_i2info(inode)->lli_clob;
3156 GOTO(out_fsxattr, rc);
3158 OBD_ALLOC_PTR(attr);
3160 GOTO(out_fsxattr, rc = -ENOMEM);
3162 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3163 fsxattr.fsx_xflags);
3166 ll_finish_md_op_data(op_data);
/* Release the file's lease, optionally with a close intent carried in
 * @ioc->lil_flags: RESYNC_DONE (mirror resync completion), LAYOUT_MERGE,
 * LAYOUT_SPLIT, or PCC_ATTACH. Returns the lease type that was held
 * (from ll_lease_type_from_fmode()) on success. */
3170 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3173 struct inode *inode = file_inode(file);
3174 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3175 struct ll_inode_info *lli = ll_i2info(inode);
3176 struct obd_client_handle *och = NULL;
3177 struct split_param sp;
3178 struct pcc_param param;
3179 bool lease_broken = false;
3181 enum mds_op_bias bias = 0;
3182 struct file *layout_file = NULL;
3184 size_t data_size = 0;
3185 bool attached = false;
/* Atomically take ownership of the fd's lease handle under lli_och_mutex. */
3190 mutex_lock(&lli->lli_och_mutex);
3191 if (fd->fd_lease_och != NULL) {
3192 och = fd->fd_lease_och;
3193 fd->fd_lease_och = NULL;
3195 mutex_unlock(&lli->lli_och_mutex);
3200 fmode = och->och_flags;
3202 switch (ioc->lil_flags) {
3203 case LL_LEASE_RESYNC_DONE:
3204 if (ioc->lil_count > IOC_IDS_MAX)
3205 GOTO(out_lease_close, rc = -EINVAL);
3207 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3208 OBD_ALLOC(data, data_size);
3210 GOTO(out_lease_close, rc = -ENOMEM);
3212 if (copy_from_user(data, (void __user *)arg, data_size))
3213 GOTO(out_lease_close, rc = -EFAULT);
3215 bias = MDS_CLOSE_RESYNC_DONE;
3217 case LL_LEASE_LAYOUT_MERGE: {
3220 if (ioc->lil_count != 1)
3221 GOTO(out_lease_close, rc = -EINVAL);
/* The victim fd follows the ioc header in the userspace buffer. */
3223 arg += sizeof(*ioc);
3224 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3225 GOTO(out_lease_close, rc = -EFAULT);
3227 layout_file = fget(fd);
3229 GOTO(out_lease_close, rc = -EBADF);
3231 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3232 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3233 GOTO(out_lease_close, rc = -EPERM);
3235 data = file_inode(layout_file);
3236 bias = MDS_CLOSE_LAYOUT_MERGE;
3239 case LL_LEASE_LAYOUT_SPLIT: {
3243 if (ioc->lil_count != 2)
3244 GOTO(out_lease_close, rc = -EINVAL);
3246 arg += sizeof(*ioc);
3247 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3248 GOTO(out_lease_close, rc = -EFAULT);
3250 arg += sizeof(__u32);
3251 if (copy_from_user(&mirror_id, (void __user *)arg,
3253 GOTO(out_lease_close, rc = -EFAULT);
3255 layout_file = fget(fdv);
3257 GOTO(out_lease_close, rc = -EBADF);
3259 sp.sp_inode = file_inode(layout_file);
3260 sp.sp_mirror_id = (__u16)mirror_id;
3262 bias = MDS_CLOSE_LAYOUT_SPLIT;
3265 case LL_LEASE_PCC_ATTACH:
3266 if (ioc->lil_count != 1)
3269 arg += sizeof(*ioc);
3270 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3272 GOTO(out_lease_close, rc2 = -EFAULT);
3274 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3276 GOTO(out_lease_close, rc2);
3279 /* Grab latest data version */
3280 rc2 = ll_data_version(inode, &param.pa_data_version,
3283 GOTO(out_lease_close, rc2);
3286 bias = MDS_PCC_ATTACH;
3289 /* without close intent */
3294 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3298 rc = ll_lease_och_release(inode, file);
/* Post-close cleanup per intent type. */
3307 switch (ioc->lil_flags) {
3308 case LL_LEASE_RESYNC_DONE:
3310 OBD_FREE(data, data_size);
3312 case LL_LEASE_LAYOUT_MERGE:
3313 case LL_LEASE_LAYOUT_SPLIT:
3317 case LL_LEASE_PCC_ATTACH:
3320 rc = pcc_readwrite_attach_fini(file, inode,
3321 param.pa_layout_gen,
3328 rc = ll_lease_type_from_fmode(fmode);
/* Acquire (or, for LL_LEASE_UNLCK, release) a file lease. For RESYNC
 * requests the lease is opened with MDS_OPEN_RESYNC and the layout is
 * refreshed before the lease handle is stored on the fd. */
3332 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3335 struct inode *inode = file_inode(file);
3336 struct ll_inode_info *lli = ll_i2info(inode);
3337 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3338 struct obd_client_handle *och = NULL;
3339 __u64 open_flags = 0;
3345 switch (ioc->lil_mode) {
3346 case LL_LEASE_WRLCK:
3347 if (!(file->f_mode & FMODE_WRITE))
3349 fmode = FMODE_WRITE;
3351 case LL_LEASE_RDLCK:
3352 if (!(file->f_mode & FMODE_READ))
3356 case LL_LEASE_UNLCK:
3357 RETURN(ll_file_unlock_lease(file, ioc, arg));
3362 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3364 /* apply for lease */
3365 if (ioc->lil_flags & LL_LEASE_RESYNC)
3366 open_flags = MDS_OPEN_RESYNC;
3367 och = ll_lease_open(inode, file, fmode, open_flags);
3369 RETURN(PTR_ERR(och));
3371 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3372 rc = ll_lease_file_resync(och, inode, arg);
3374 ll_lease_close(och, inode, NULL);
3377 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3379 ll_lease_close(och, inode, NULL);
/* Store the lease handle on the fd unless one is already present. */
3385 mutex_lock(&lli->lli_och_mutex);
3386 if (fd->fd_lease_och == NULL) {
3387 fd->fd_lease_och = och;
3390 mutex_unlock(&lli->lli_och_mutex);
3392 /* impossible now that only excl is supported for now */
3393 ll_lease_close(och, inode, &lease_broken);
/* Read the per-inode file-heat counters (decayed to the current time) into
 * the caller-provided struct lu_heat, under lli_heat_lock. */
3399 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3401 struct ll_inode_info *lli = ll_i2info(inode);
3402 struct ll_sb_info *sbi = ll_i2sbi(inode);
3403 __u64 now = ktime_get_real_seconds();
3406 spin_lock(&lli->lli_heat_lock);
3407 heat->lh_flags = lli->lli_heat_flags;
3408 for (i = 0; i < heat->lh_count; i++)
3409 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3410 now, sbi->ll_heat_decay_weight,
3411 sbi->ll_heat_period_second);
3412 spin_unlock(&lli->lli_heat_lock);
/* Update file-heat control flags: LU_HEAT_FLAG_CLEAR zeroes all counters,
 * LU_HEAT_FLAG_OFF toggles heat accounting off/on for this inode. */
3415 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3417 struct ll_inode_info *lli = ll_i2info(inode);
3420 spin_lock(&lli->lli_heat_lock);
3421 if (flags & LU_HEAT_FLAG_CLEAR)
3422 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3424 if (flags & LU_HEAT_FLAG_OFF)
3425 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3427 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3429 spin_unlock(&lli->lli_heat_lock);
/* Main ioctl dispatcher for regular files: per-fd flags, striping/layout,
 * swap-layouts, HSM, leases, ladvise, project xattrs, heat and PCC commands.
 * Unknown commands fall through to obd_iocontrol() on the data export. */
3435 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3437 struct inode *inode = file_inode(file);
3438 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3442 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3443 PFID(ll_inode2fid(inode)), inode, cmd);
3444 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3446 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3447 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3451 case LL_IOC_GETFLAGS:
3452 /* Get the current value of the file flags */
3453 return put_user(fd->fd_flags, (int __user *)arg);
3454 case LL_IOC_SETFLAGS:
3455 case LL_IOC_CLRFLAGS:
3456 /* Set or clear specific file flags */
3457 /* XXX This probably needs checks to ensure the flags are
3458 * not abused, and to handle any flag side effects.
3460 if (get_user(flags, (int __user *) arg))
3463 if (cmd == LL_IOC_SETFLAGS) {
3464 if ((flags & LL_FILE_IGNORE_LOCK) &&
3465 !(file->f_flags & O_DIRECT)) {
3466 CERROR("%s: unable to disable locking on "
3467 "non-O_DIRECT file\n", current->comm);
3471 fd->fd_flags |= flags;
3473 fd->fd_flags &= ~flags;
3476 case LL_IOC_LOV_SETSTRIPE:
3477 case LL_IOC_LOV_SETSTRIPE_NEW:
3478 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3479 case LL_IOC_LOV_SETEA:
3480 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3481 case LL_IOC_LOV_SWAP_LAYOUTS: {
3483 struct lustre_swap_layouts lsl;
3485 if (copy_from_user(&lsl, (char __user *)arg,
3486 sizeof(struct lustre_swap_layouts)))
3489 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3492 file2 = fget(lsl.sl_fd);
3496 /* O_WRONLY or O_RDWR */
3497 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3498 GOTO(out, rc = -EPERM);
3500 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3501 struct inode *inode2;
3502 struct ll_inode_info *lli;
3503 struct obd_client_handle *och = NULL;
3505 lli = ll_i2info(inode);
3506 mutex_lock(&lli->lli_och_mutex);
3507 if (fd->fd_lease_och != NULL) {
3508 och = fd->fd_lease_och;
3509 fd->fd_lease_och = NULL;
3511 mutex_unlock(&lli->lli_och_mutex);
3513 GOTO(out, rc = -ENOLCK);
3514 inode2 = file_inode(file2);
3515 rc = ll_swap_layouts_close(och, inode, inode2);
3517 rc = ll_swap_layouts(file, file2, &lsl);
3523 case LL_IOC_LOV_GETSTRIPE:
3524 case LL_IOC_LOV_GETSTRIPE_NEW:
3525 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3526 case FS_IOC_GETFLAGS:
3527 case FS_IOC_SETFLAGS:
3528 RETURN(ll_iocontrol(inode, file, cmd, arg));
3529 case FSFILT_IOC_GETVERSION:
3530 case FS_IOC_GETVERSION:
3531 RETURN(put_user(inode->i_generation, (int __user *)arg));
3532 /* We need to special case any other ioctls we want to handle,
3533 * to send them to the MDS/OST as appropriate and to properly
3534 * network encode the arg field. */
3535 case FS_IOC_SETVERSION:
3538 case LL_IOC_GROUP_LOCK:
3539 RETURN(ll_get_grouplock(inode, file, arg));
3540 case LL_IOC_GROUP_UNLOCK:
3541 RETURN(ll_put_grouplock(inode, file, arg));
3542 case IOC_OBD_STATFS:
3543 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3545 case LL_IOC_FLUSHCTX:
3546 RETURN(ll_flush_ctx(inode));
3547 case LL_IOC_PATH2FID: {
3548 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3549 sizeof(struct lu_fid)))
3554 case LL_IOC_GETPARENT:
3555 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3557 case OBD_IOC_FID2PATH:
3558 RETURN(ll_fid2path(inode, (void __user *)arg));
3559 case LL_IOC_DATA_VERSION: {
3560 struct ioc_data_version idv;
3563 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Sanitize flags from userspace before passing them down. */
3566 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3567 rc = ll_ioc_data_version(inode, &idv);
3570 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3576 case LL_IOC_GET_MDTIDX: {
3579 mdtidx = ll_get_mdt_idx(inode);
3583 if (put_user((int)mdtidx, (int __user *)arg))
3588 case OBD_IOC_GETDTNAME:
3589 case OBD_IOC_GETMDNAME:
3590 RETURN(ll_get_obd_name(inode, cmd, arg));
3591 case LL_IOC_HSM_STATE_GET: {
3592 struct md_op_data *op_data;
3593 struct hsm_user_state *hus;
3600 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3601 LUSTRE_OPC_ANY, hus);
3602 if (IS_ERR(op_data)) {
3604 RETURN(PTR_ERR(op_data));
3607 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3610 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3613 ll_finish_md_op_data(op_data);
3617 case LL_IOC_HSM_STATE_SET: {
3618 struct hsm_state_set *hss;
3625 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3630 rc = ll_hsm_state_set(inode, hss);
3635 case LL_IOC_HSM_ACTION: {
3636 struct md_op_data *op_data;
3637 struct hsm_current_action *hca;
3644 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3645 LUSTRE_OPC_ANY, hca);
3646 if (IS_ERR(op_data)) {
3648 RETURN(PTR_ERR(op_data));
3651 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3654 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3657 ll_finish_md_op_data(op_data);
3661 case LL_IOC_SET_LEASE_OLD: {
3662 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3664 RETURN(ll_file_set_lease(file, &ioc, 0));
3666 case LL_IOC_SET_LEASE: {
3667 struct ll_ioc_lease ioc;
3669 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3672 RETURN(ll_file_set_lease(file, &ioc, arg));
3674 case LL_IOC_GET_LEASE: {
3675 struct ll_inode_info *lli = ll_i2info(inode);
3676 struct ldlm_lock *lock = NULL;
3679 mutex_lock(&lli->lli_och_mutex);
3680 if (fd->fd_lease_och != NULL) {
3681 struct obd_client_handle *och = fd->fd_lease_och;
/* Only report the lease if the underlying DLM lock is not cancelled. */
3683 lock = ldlm_handle2lock(&och->och_lease_handle);
3685 lock_res_and_lock(lock);
3686 if (!ldlm_is_cancel(lock))
3687 fmode = och->och_flags;
3689 unlock_res_and_lock(lock);
3690 LDLM_LOCK_PUT(lock);
3693 mutex_unlock(&lli->lli_och_mutex);
3695 RETURN(ll_lease_type_from_fmode(fmode));
3697 case LL_IOC_HSM_IMPORT: {
3698 struct hsm_user_import *hui;
3704 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3709 rc = ll_hsm_import(inode, file, hui);
3714 case LL_IOC_FUTIMES_3: {
3715 struct ll_futimes_3 lfu;
3717 if (copy_from_user(&lfu,
3718 (const struct ll_futimes_3 __user *)arg,
3722 RETURN(ll_file_futimes_3(file, &lfu));
3724 case LL_IOC_LADVISE: {
3725 struct llapi_ladvise_hdr *k_ladvise_hdr;
3726 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3729 int alloc_size = sizeof(*k_ladvise_hdr);
3732 u_ladvise_hdr = (void __user *)arg;
3733 OBD_ALLOC_PTR(k_ladvise_hdr);
3734 if (k_ladvise_hdr == NULL)
/* First copy: header only, to learn lah_count. */
3737 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3738 GOTO(out_ladvise, rc = -EFAULT);
3740 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3741 k_ladvise_hdr->lah_count < 1)
3742 GOTO(out_ladvise, rc = -EINVAL);
3744 num_advise = k_ladvise_hdr->lah_count;
3745 if (num_advise >= LAH_COUNT_MAX)
3746 GOTO(out_ladvise, rc = -EFBIG);
/* Second copy: reallocate to hold all advise entries and re-read. */
3748 OBD_FREE_PTR(k_ladvise_hdr);
3749 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3750 lah_advise[num_advise]);
3751 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3752 if (k_ladvise_hdr == NULL)
3756 * TODO: submit multiple advices to one server in a single RPC
3758 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3759 GOTO(out_ladvise, rc = -EFAULT);
3761 for (i = 0; i < num_advise; i++) {
3762 struct llapi_lu_ladvise *k_ladvise =
3763 &k_ladvise_hdr->lah_advise[i];
3764 struct llapi_lu_ladvise __user *u_ladvise =
3765 &u_ladvise_hdr->lah_advise[i];
3767 rc = ll_ladvise_sanity(inode, k_ladvise);
3769 GOTO(out_ladvise, rc);
3771 switch (k_ladvise->lla_advice) {
3772 case LU_LADVISE_LOCKNOEXPAND:
3773 rc = ll_lock_noexpand(file,
3774 k_ladvise->lla_peradvice_flags);
3775 GOTO(out_ladvise, rc);
3776 case LU_LADVISE_LOCKAHEAD:
3778 rc = ll_file_lock_ahead(file, k_ladvise);
3781 GOTO(out_ladvise, rc);
3784 &u_ladvise->lla_lockahead_result))
3785 GOTO(out_ladvise, rc = -EFAULT);
3788 rc = ll_ladvise(inode, file,
3789 k_ladvise_hdr->lah_flags,
3792 GOTO(out_ladvise, rc);
3799 OBD_FREE(k_ladvise_hdr, alloc_size);
3802 case LL_IOC_FLR_SET_MIRROR: {
3803 /* mirror I/O must be direct to avoid polluting page cache
3805 if (!(file->f_flags & O_DIRECT))
3808 fd->fd_designated_mirror = (__u32)arg;
3811 case LL_IOC_FSGETXATTR:
3812 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3813 case LL_IOC_FSSETXATTR:
3814 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3816 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3817 case LL_IOC_HEAT_GET: {
3818 struct lu_heat uheat;
3819 struct lu_heat *heat;
3822 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3825 if (uheat.lh_count > OBD_HEAT_COUNT)
3826 uheat.lh_count = OBD_HEAT_COUNT;
3828 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3829 OBD_ALLOC(heat, size);
3833 heat->lh_count = uheat.lh_count;
3834 ll_heat_get(inode, heat);
3835 rc = copy_to_user((char __user *)arg, heat, size);
3836 OBD_FREE(heat, size);
3837 RETURN(rc ? -EFAULT : 0);
3839 case LL_IOC_HEAT_SET: {
3842 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3845 rc = ll_heat_set(inode, flags);
3848 case LL_IOC_PCC_DETACH:
3849 if (!S_ISREG(inode->i_mode))
3852 if (!inode_owner_or_capable(inode))
3855 RETURN(pcc_ioctl_detach(inode));
3856 case LL_IOC_PCC_STATE: {
3857 struct lu_pcc_state __user *ustate =
3858 (struct lu_pcc_state __user *)arg;
3859 struct lu_pcc_state *state;
3861 OBD_ALLOC_PTR(state);
3865 if (copy_from_user(state, ustate, sizeof(*state)))
3866 GOTO(out_state, rc = -EFAULT);
3868 rc = pcc_ioctl_state(file, inode, state);
3870 GOTO(out_state, rc);
3872 if (copy_to_user(ustate, state, sizeof(*state)))
3873 GOTO(out_state, rc = -EFAULT);
3876 OBD_FREE_PTR(state);
/* Default: forward unrecognized commands to the data (OST) export. */
3880 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3881 (void __user *)arg));
/* Compatibility fallback for kernels without generic_file_llseek_size(). */
3885 #ifndef HAVE_FILE_LLSEEK_SIZE
3886 static inline loff_t
/* Commit @offset as the new file position after bounds-checking against
 * @maxsize (and negative offsets unless FMODE_UNSIGNED_OFFSET is set). */
3887 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3889 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3891 if (offset > maxsize)
3894 if (offset != file->f_pos) {
3895 file->f_pos = offset;
3896 file->f_version = 0;
3902 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3903 loff_t maxsize, loff_t eof)
3905 struct inode *inode = file_inode(file);
3913 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3914 * position-querying operation. Avoid rewriting the "same"
3915 * f_pos value back to the file because a concurrent read(),
3916 * write() or lseek() might have altered it
3921 * f_lock protects against read/modify/write race with other
3922 * SEEK_CURs. Note that parallel writes and reads behave
3926 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3927 inode_unlock(inode);
3931 * In the generic case the entire file is data, so as long as
3932 * offset isn't at the end of the file then the offset is data.
3939 * There is a virtual hole at the end of the file, so as long as
3940 * offset isn't i_size or larger, return i_size.
3948 return llseek_execute(file, offset, maxsize);
/* llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse is needed to
 * obtain an up-to-date size before delegating to generic_file_llseek_size. */
3952 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3954 struct inode *inode = file_inode(file);
3955 loff_t retval, eof = 0;
3958 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3959 (origin == SEEK_CUR) ? file->f_pos : 0);
3960 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3961 PFID(ll_inode2fid(inode)), inode, retval, retval,
3963 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3965 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3966 retval = ll_glimpse_size(inode);
3969 eof = i_size_read(inode);
3972 retval = ll_generic_file_llseek_size(file, offset, origin,
3973 ll_file_maxbytes(inode), eof);
/* flush() handler: surface any async writeback errors recorded on the inode
 * or its lov sub-objects. Reports -EIO at most once per descriptor (the
 * fd_write_failed flag suppresses repeat reports). */
3977 static int ll_flush(struct file *file, fl_owner_t id)
3979 struct inode *inode = file_inode(file);
3980 struct ll_inode_info *lli = ll_i2info(inode);
3981 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3984 LASSERT(!S_ISDIR(inode->i_mode));
3986 /* catch async errors that were recorded back when async writeback
3987 * failed for pages in this mapping. */
3988 rc = lli->lli_async_rc;
3989 lli->lli_async_rc = 0;
3990 if (lli->lli_clob != NULL) {
3991 err = lov_read_and_clear_async_rc(lli->lli_clob);
3996 /* The application has been told write failure already.
3997 * Do not report failure again. */
3998 if (fd->fd_write_failed)
4000 return rc ? -EIO : 0;
/*
 * Flush a byte range of a file via a CIT_FSYNC cl_io.
 *
 * \param mode           one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}; any other
 *                       value is rejected (rejection line elided in this
 *                       extract). If @mode is not CL_FSYNC_LOCAL, OST_SYNC
 *                       RPCs are sent to the OSTs.
 * \param ignore_layout  set io->ci_ignore_layout so the sync proceeds even
 *                       during a layout change.
 * \retval               number of pages written (fi_nr_written) on success,
 *                       negative errno on failure.
 */
4004 * Called to make sure a portion of file has been written out.
4005 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4007 * Return how many pages have been written.
4009 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4010 enum cl_fsync_mode mode, int ignore_layout)
4014 struct cl_fsync_io *fio;
4019 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4020 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4023 env = cl_env_get(&refcheck);
4025 RETURN(PTR_ERR(env));
4027 io = vvp_env_thread_io(env);
4028 io->ci_obj = ll_i2info(inode)->lli_clob;
4029 io->ci_ignore_layout = ignore_layout;
4031 /* initialize parameters for sync */
4032 fio = &io->u.ci_fsync;
4033 fio->fi_start = start;
4035 fio->fi_fid = ll_inode2fid(inode);
4036 fio->fi_mode = mode;
4037 fio->fi_nr_written = 0;
4039 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4040 result = cl_io_loop(env, io);
4042 result = io->ci_result;
/* on success report the page count accumulated by the fsync io */
4044 result = fio->fi_nr_written;
4045 cl_io_fini(env, io);
4046 cl_env_put(env, &refcheck);
/*
 * .fsync handler. Three prototype variants are selected by kernel-version
 * configure checks (4-arg with range, 2-arg, and the old dentry variant).
 *
 * Order of operations: wait for in-flight page I/O, consume recorded async
 * writeback errors, sync metadata on the MDT (md_fsync), then sync data —
 * through PCC if the file is cached there, otherwise via
 * cl_sync_file_range(CL_FSYNC_ALL — mode argument elided in this extract;
 * TODO confirm). fd_write_failed is updated so ll_flush() won't re-report.
 */
4052 * When dentry is provided (the 'else' case), file_dentry() may be
4053 * null and dentry must be used directly rather than pulled from
4054 * file_dentry() as is done otherwise.
4057 #ifdef HAVE_FILE_FSYNC_4ARGS
4058 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4060 struct dentry *dentry = file_dentry(file);
4061 #elif defined(HAVE_FILE_FSYNC_2ARGS)
4062 int ll_fsync(struct file *file, int datasync)
4064 struct dentry *dentry = file_dentry(file);
4066 loff_t end = LLONG_MAX;
4068 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
4071 loff_t end = LLONG_MAX;
4073 struct inode *inode = dentry->d_inode;
4074 struct ll_inode_info *lli = ll_i2info(inode);
4075 struct ptlrpc_request *req;
4080 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
4081 PFID(ll_inode2fid(inode)), inode);
4082 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4084 #ifdef HAVE_FILE_FSYNC_4ARGS
4085 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4088 /* fsync's caller has already called _fdata{sync,write}, we want
4089 * that IO to finish before calling the osc and mdc sync methods */
4090 rc = filemap_fdatawait(inode->i_mapping);
4093 /* catch async errors that were recorded back when async writeback
4094 * failed for pages in this mapping. */
4095 if (!S_ISDIR(inode->i_mode)) {
4096 err = lli->lli_async_rc;
4097 lli->lli_async_rc = 0;
4100 if (lli->lli_clob != NULL) {
4101 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDT before syncing file data */
4107 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4111 ptlrpc_req_finished(req);
4113 if (S_ISREG(inode->i_mode)) {
4114 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4117 /* Sync metadata on MDT first, and then sync the cached data
/* PCC-cached files are fsynced through the PCC backend */
4120 err = pcc_fsync(file, start, end, datasync, &cached);
4122 err = cl_sync_file_range(inode, start, end,
4124 if (rc == 0 && err < 0)
4127 fd->fd_write_failed = true;
4129 fd->fd_write_failed = false;
4132 #ifdef HAVE_FILE_FSYNC_4ARGS
4133 inode_unlock(inode);
/*
 * .lock/.flock handler: translate a VFS file_lock (POSIX fcntl record lock
 * or BSD flock) into an LDLM_FLOCK enqueue on the MDT.
 *
 * fl_type maps to LDLM mode: F_RDLCK->LCK_PR, F_UNLCK->LCK_NL (unlock is
 * modelled as an NL enqueue), F_WRLCK->LCK_PW. F_GETLK* become TEST locks.
 * On success the lock is also registered with the local VFS lock lists
 * (locks_lock_file_wait or the older flock/posix variants); if that local
 * step fails, the just-acquired server lock is released by re-enqueueing
 * with LCK_NL. NOTE(review): several switch labels and RETURN lines are
 * elided in this extract.
 */
4139 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4141 struct inode *inode = file_inode(file);
4142 struct ll_sb_info *sbi = ll_i2sbi(inode);
4143 struct ldlm_enqueue_info einfo = {
4144 .ei_type = LDLM_FLOCK,
4145 .ei_cb_cp = ldlm_flock_completion_ast,
4146 .ei_cbdata = file_lock,
4148 struct md_op_data *op_data;
4149 struct lustre_handle lockh = { 0 };
4150 union ldlm_policy_data flock = { { 0 } };
4151 int fl_type = file_lock->fl_type;
4157 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4158 PFID(ll_inode2fid(inode)), file_lock);
4160 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4162 if (file_lock->fl_flags & FL_FLOCK) {
4163 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4164 /* flocks are whole-file locks */
4165 flock.l_flock.end = OFFSET_MAX;
4166 /* For flocks owner is determined by the local file desctiptor*/
4167 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4168 } else if (file_lock->fl_flags & FL_POSIX) {
4169 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4170 flock.l_flock.start = file_lock->fl_start;
4171 flock.l_flock.end = file_lock->fl_end;
4175 flock.l_flock.pid = file_lock->fl_pid;
4177 /* Somewhat ugly workaround for svc lockd.
4178 * lockd installs custom fl_lmops->lm_compare_owner that checks
4179 * for the fl_owner to be the same (which it always is on local node
4180 * I guess between lockd processes) and then compares pid.
4181 * As such we assign pid to the owner field to make it all work,
4182 * conflict with normal locks is unlikely since pid space and
4183 * pointer space for current->files are not intersecting */
4184 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4185 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4189 einfo.ei_mode = LCK_PR;
4192 /* An unlock request may or may not have any relation to
4193 * existing locks so we may not be able to pass a lock handle
4194 * via a normal ldlm_lock_cancel() request. The request may even
4195 * unlock a byte range in the middle of an existing lock. In
4196 * order to process an unlock request we need all of the same
4197 * information that is given with a normal read or write record
4198 * lock request. To avoid creating another ldlm unlock (cancel)
4199 * message we'll treat a LCK_NL flock request as an unlock. */
4200 einfo.ei_mode = LCK_NL;
4203 einfo.ei_mode = LCK_PW;
4206 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set request */
4221 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, do not acquire */
4227 flags = LDLM_FL_TEST_LOCK;
4230 CERROR("unknown fcntl lock command: %d\n", cmd);
4234 /* Save the old mode so that if the mode in the lock changes we
4235 * can decrement the appropriate reader or writer refcount. */
4236 file_lock->fl_type = einfo.ei_mode;
4238 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4239 LUSTRE_OPC_ANY, NULL);
4240 if (IS_ERR(op_data))
4241 RETURN(PTR_ERR(op_data));
4243 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4244 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4245 flock.l_flock.pid, flags, einfo.ei_mode,
4246 flock.l_flock.start, flock.l_flock.end);
4248 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4251 /* Restore the file lock type if not TEST lock. */
4252 if (!(flags & LDLM_FL_TEST_LOCK))
4253 file_lock->fl_type = fl_type;
4255 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4256 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4257 !(flags & LDLM_FL_TEST_LOCK))
4258 rc2 = locks_lock_file_wait(file, file_lock);
4260 if ((file_lock->fl_flags & FL_FLOCK) &&
4261 (rc == 0 || file_lock->fl_type == F_UNLCK))
4262 rc2 = flock_lock_file_wait(file, file_lock);
4263 if ((file_lock->fl_flags & FL_POSIX) &&
4264 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4265 !(flags & LDLM_FL_TEST_LOCK))
4266 rc2 = posix_lock_file_wait(file, file_lock);
4267 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: drop the server-side lock we just got */
4269 if (rc2 && file_lock->fl_type != F_UNLCK) {
4270 einfo.ei_mode = LCK_NL;
4271 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4276 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under directory @parent via md_getattr_name.
 *
 * \param fid    [out] receives the child's FID (mbo_fid1)
 * \param inode  [out, optional] when non-NULL, a new inode is instantiated
 *               from the reply via ll_prep_inode (presumably; the NULL
 *               check line is elided in this extract — TODO confirm)
 * \retval 0 on success, negative errno on failure.
 */
4281 int ll_get_fid_by_name(struct inode *parent, const char *name,
4282 int namelen, struct lu_fid *fid,
4283 struct inode **inode)
4285 struct md_op_data *op_data = NULL;
4286 struct mdt_body *body;
4287 struct ptlrpc_request *req;
4291 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4292 LUSTRE_OPC_ANY, NULL);
4293 if (IS_ERR(op_data))
4294 RETURN(PTR_ERR(op_data));
/* only need FID and type back from the MDT */
4296 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4297 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4298 ll_finish_md_op_data(op_data);
4302 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4304 GOTO(out_req, rc = -EFAULT);
4306 *fid = body->mbo_fid1;
4309 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4311 ptlrpc_req_finished(req);
/*
 * Migrate child @name of directory @parent to another MDT, as described by
 * the lmv_user_md @lum (target MDT index / stripe count).
 *
 * Implemented as a special rename-to-self (md_rename with CLI_MIGRATE).
 * For regular files a write lease is taken first (ll_lease_open) and the
 * data version is recorded so the MDT can detect concurrent modification;
 * on -EAGAIN (lease cancelled) the operation is retried. The root inode
 * cannot be migrated. NOTE(review): several error-path and loop-control
 * lines are elided in this extract.
 */
4315 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4318 struct dentry *dchild = NULL;
4319 struct inode *child_inode = NULL;
4320 struct md_op_data *op_data;
4321 struct ptlrpc_request *request = NULL;
4322 struct obd_client_handle *och = NULL;
4324 struct mdt_body *body;
4325 __u64 data_version = 0;
4326 size_t namelen = strlen(name);
4327 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4331 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4332 PFID(ll_inode2fid(parent)), name,
4333 lum->lum_stripe_offset, lum->lum_stripe_count);
/* lum arrives little-endian from userspace; swab if needed */
4335 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4336 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4337 lustre_swab_lmv_user_md(lum);
4339 /* Get child FID first */
4340 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4343 dchild = d_lookup(file_dentry(file), &qstr);
4345 if (dchild->d_inode)
4346 child_inode = igrab(dchild->d_inode);
/* not in dcache: resolve the FID through the MDT instead */
4351 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4360 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4361 OBD_CONNECT2_DIR_MIGRATE)) {
4362 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4363 ll_i2info(child_inode)->lli_lsm_md) {
4364 CERROR("%s: MDT doesn't support stripe directory "
4365 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4366 GOTO(out_iput, rc = -EOPNOTSUPP);
4371 * lfs migrate command needs to be blocked on the client
4372 * by checking the migrate FID against the FID of the
4375 if (child_inode == parent->i_sb->s_root->d_inode)
4376 GOTO(out_iput, rc = -EINVAL);
4378 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4379 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4380 if (IS_ERR(op_data))
4381 GOTO(out_iput, rc = PTR_ERR(op_data));
4383 inode_lock(child_inode);
4384 op_data->op_fid3 = *ll_inode2fid(child_inode);
4385 if (!fid_is_sane(&op_data->op_fid3)) {
4386 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4387 ll_i2sbi(parent)->ll_fsname, name,
4388 PFID(&op_data->op_fid3));
4389 GOTO(out_unlock, rc = -EINVAL);
4392 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4393 op_data->op_data = lum;
4394 op_data->op_data_size = lumlen;
/* regular file: take a write lease and snapshot the data version */
4397 if (S_ISREG(child_inode->i_mode)) {
4398 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4402 GOTO(out_unlock, rc);
4405 rc = ll_data_version(child_inode, &data_version,
4408 GOTO(out_close, rc);
4410 op_data->op_open_handle = och->och_open_handle;
4411 op_data->op_data_version = data_version;
4412 op_data->op_lease_handle = och->och_lease_handle;
4413 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* the open request must not be replayed across the migration */
4415 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4416 och->och_mod->mod_open_req->rq_replay = 0;
4417 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4420 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4421 name, namelen, &request);
4423 LASSERT(request != NULL);
4424 ll_update_times(request, parent);
4427 if (rc == 0 || rc == -EAGAIN) {
4428 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4429 LASSERT(body != NULL);
4431 /* If the server does release layout lock, then we cleanup
4432 * the client och here, otherwise release it in out_close: */
4433 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4434 obd_mod_put(och->och_mod);
4435 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4437 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4443 if (request != NULL) {
4444 ptlrpc_req_finished(request);
4448 /* Try again if the lease has cancelled. */
4449 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4454 ll_lease_close(och, child_inode, NULL);
4456 clear_nlink(child_inode);
4458 inode_unlock(child_inode);
4459 ll_finish_md_op_data(op_data);
/*
 * .flock/.lock handler installed by the "-o noflock" mount option: file
 * locking is disabled on this mount. Body is elided in this extract;
 * presumably it logs and returns an error — TODO confirm.
 */
4466 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test whether the client already holds MDT inodebits locks covering
 * *bits in mode l_req_mode (LCK_MINMODE means "any of CR/CW/PR/PW").
 * Matching is done bit by bit with LDLM_FL_TEST_LOCK, so no reference
 * is taken; bits that are found are cleared from *bits.
 */
4474 * test if some locks matching bits and l_req_mode are acquired
4475 * - bits can be in different locks
4476 * - if found clear the common lock bits in *bits
4477 * - the bits not found, are kept in *bits
4479 * \param bits [IN] searched lock bits [IN]
4480 * \param l_req_mode [IN] searched lock mode
4481 * \retval boolean, true iff all bits are found
4483 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4485 struct lustre_handle lockh;
4486 union ldlm_policy_data policy;
4487 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4488 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4497 fid = &ll_i2info(inode)->lli_fid;
4498 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4499 ldlm_lockname[mode]);
4501 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually */
4502 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4503 policy.l_inodebits.bits = *bits & (1 << i);
4504 if (policy.l_inodebits.bits == 0)
4507 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4508 &policy, mode, &lockh)) {
4509 struct ldlm_lock *lock;
4511 lock = ldlm_handle2lock(&lockh);
4514 ~(lock->l_policy_data.l_inodebits.bits);
4515 LDLM_LOCK_PUT(lock);
4517 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MDT inodebits lock
 * covering @bits in one of the modes in @mode.
 *
 * \param lockh [out] handle of the matched lock, valid when non-zero
 *                    mode is returned
 * \retval matched ldlm mode, or 0 if no lock matched.
 */
4524 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4525 struct lustre_handle *lockh, __u64 flags,
4526 enum ldlm_mode mode)
4528 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4533 fid = &ll_i2info(inode)->lli_fid;
4534 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4536 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4537 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.
 *
 * -ENOENT on an already-unlinked inode is generally not an error (nlink is
 * fixed up and success returned), except for striped directories with bad
 * stripes, which must be revalidated again. Other errors are logged
 * (EACCES/EIDRM quietly, the rest loudly) and propagated.
 * NOTE(review): the nlink fixup and return lines are elided in this
 * extract.
 */
4542 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4544 /* Already unlinked. Just update nlink and return success */
4545 if (rc == -ENOENT) {
4547 /* If it is striped directory, and there is bad stripe
4548 * Let's revalidate the dentry again, instead of returning
4550 if (S_ISDIR(inode->i_mode) &&
4551 ll_i2info(inode)->lli_lsm_md != NULL)
4554 /* This path cannot be hit for regular files unless in
4555 * case of obscure races, so no need to to validate
4557 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4559 } else if (rc != 0) {
4560 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4561 "%s: revalidate FID "DFID" error: rc = %d\n",
4562 ll_i2sbi(inode)->ll_fsname,
4563 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes against the MDT using an intent
 * lock (md_intent_lock, getattr-by-FID — no name is supplied). On success
 * the intent result is applied to the dentry; if the file turned out to be
 * unlinked (i_nlink == 0) the dentry is invalidated so later lookups
 * don't pick it up.
 */
4569 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4571 struct inode *inode = dentry->d_inode;
4572 struct obd_export *exp = ll_i2mdexp(inode);
4573 struct lookup_intent oit = {
4576 struct ptlrpc_request *req = NULL;
4577 struct md_op_data *op_data;
4581 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4582 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4584 /* Call getattr by fid, so do not provide name at all. */
4585 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4586 LUSTRE_OPC_ANY, NULL);
4587 if (IS_ERR(op_data))
4588 RETURN(PTR_ERR(op_data));
4590 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4591 ll_finish_md_op_data(op_data);
4593 rc = ll_inode_revalidate_fini(inode, rc);
4597 rc = ll_revalidate_it_finish(req, &oit, dentry);
4599 ll_intent_release(&oit);
4603 /* Unlinked? Unhash dentry, so it is not picked up later by
4604 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4605 * here to preserve get_cwd functionality on 2.6.
4607 if (!dentry->d_inode->i_nlink) {
4608 ll_lock_dcache(inode);
4609 d_lustre_invalidate(dentry, 0);
4610 ll_unlock_dcache(inode);
4613 ll_lookup_finish_locks(&oit, dentry);
4615 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes
 * (md_merge_attr under lli_lsm_sem) into the master inode: nlink, blocks,
 * size and the cached a/m/ctime. Foreign directories (LMV_MAGIC_FOREIGN)
 * are not striped and are skipped.
 */
4620 static int ll_merge_md_attr(struct inode *inode)
4622 struct ll_inode_info *lli = ll_i2info(inode);
4623 struct cl_attr attr = { 0 };
4626 LASSERT(lli->lli_lsm_md != NULL);
4628 /* foreign dir is not striped dir */
4629 if (lli->lli_lsm_md->lsm_md_magic == LMV_MAGIC_FOREIGN)
/* read-lock protects lli_lsm_md from concurrent layout change */
4632 down_read(&lli->lli_lsm_sem);
4633 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4634 &attr, ll_md_blocking_ast);
4635 up_read(&lli->lli_lsm_sem);
4639 set_nlink(inode, attr.cat_nlink);
4640 inode->i_blocks = attr.cat_blocks;
4641 i_size_write(inode, attr.cat_size);
4643 ll_i2info(inode)->lli_atime = attr.cat_atime;
4644 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4645 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr implementation: revalidate metadata from the MDT, refresh
 * size — via PCC if cached there, otherwise by glimpsing the OSTs (skipped
 * while a restore is running, since the MDT already provided the right
 * size and a glimpse would block on the layout lock) — merge striped-dir
 * attributes, then fill *stat. 32-bit API mounts get squashed inode/dev
 * numbers.
 */
4650 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4652 struct inode *inode = de->d_inode;
4653 struct ll_sb_info *sbi = ll_i2sbi(inode);
4654 struct ll_inode_info *lli = ll_i2info(inode);
4657 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4659 rc = ll_inode_revalidate(de, IT_GETATTR);
4663 if (S_ISREG(inode->i_mode)) {
4666 rc = pcc_inode_getattr(inode, &cached);
4667 if (cached && rc < 0)
4670 /* In case of restore, the MDT has the right size and has
4671 * already send it back without granting the layout lock,
4672 * inode is up-to-date so glimpse is useless.
4673 * Also to glimpse we need the layout, in case of a running
4674 * restore the MDT holds the layout lock so the glimpse will
4675 * block up to the end of restore (getattr will block)
4677 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4678 rc = ll_glimpse_size(inode);
4683 /* If object isn't regular a file then don't validate size. */
4684 if (S_ISDIR(inode->i_mode) &&
4685 lli->lli_lsm_md != NULL) {
4686 rc = ll_merge_md_attr(inode);
/* propagate the timestamps cached in lli into the VFS inode */
4691 inode->i_atime.tv_sec = lli->lli_atime;
4692 inode->i_mtime.tv_sec = lli->lli_mtime;
4693 inode->i_ctime.tv_sec = lli->lli_ctime;
4696 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4698 if (ll_need_32bit_api(sbi)) {
4699 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4700 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4701 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4703 stat->ino = inode->i_ino;
4704 stat->dev = inode->i_sb->s_dev;
4705 stat->rdev = inode->i_rdev;
4708 stat->mode = inode->i_mode;
4709 stat->uid = inode->i_uid;
4710 stat->gid = inode->i_gid;
4711 stat->atime = inode->i_atime;
4712 stat->mtime = inode->i_mtime;
4713 stat->ctime = inode->i_ctime;
/* per-fs tunable stat blocksize overrides the inode blockbits */
4714 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4716 stat->nlink = inode->i_nlink;
4717 stat->size = i_size_read(inode);
4718 stat->blocks = inode->i_blocks;
/*
 * .getattr entry point; the prototype depends on the kernel version
 * (path-based with request_mask/flags vs. the older vfsmount variant).
 * Both forward to ll_getattr_dentry(); request_mask/flags are ignored.
 */
4723 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4724 int ll_getattr(const struct path *path, struct kstat *stat,
4725 u32 request_mask, unsigned int flags)
4727 struct dentry *de = path->dentry;
4729 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4732 return ll_getattr_dentry(de, stat);
/*
 * .fiemap handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap buffer, run the mapping via ll_do_fiemap(), and copy the
 * resulting extents back to the user buffer. Only the first extent is
 * copied in (it may carry FIEMAP_EXTENT_LAST continuation state from a
 * previous call — presumably; TODO confirm), while all mapped extents are
 * copied out.
 */
4735 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4736 __u64 start, __u64 len)
4740 struct fiemap *fiemap;
4741 unsigned int extent_count = fieinfo->fi_extents_max;
4743 num_bytes = sizeof(*fiemap) + (extent_count *
4744 sizeof(struct fiemap_extent));
4745 OBD_ALLOC_LARGE(fiemap, num_bytes);
4750 fiemap->fm_flags = fieinfo->fi_flags;
4751 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4752 fiemap->fm_start = start;
4753 fiemap->fm_length = len;
4754 if (extent_count > 0 &&
4755 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4756 sizeof(struct fiemap_extent)) != 0)
4757 GOTO(out, rc = -EFAULT);
4759 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4761 fieinfo->fi_flags = fiemap->fm_flags;
4762 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4763 if (extent_count > 0 &&
4764 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4765 fiemap->fm_mapped_extents *
4766 sizeof(struct fiemap_extent)) != 0)
4767 GOTO(out, rc = -EFAULT);
4769 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * .get_acl handler: return a referenced copy of the POSIX ACL cached in
 * lli_posix_acl, under lli_lock. The caller (VFS permission check)
 * releases the reference.
 */
4773 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4775 struct ll_inode_info *lli = ll_i2info(inode);
4776 struct posix_acl *acl = NULL;
4779 spin_lock(&lli->lli_lock);
4780 /* VFS' acl_permission_check->check_acl will release the refcount */
4781 acl = posix_acl_dup(lli->lli_posix_acl);
4782 spin_unlock(&lli->lli_lock);
/*
 * .set_acl handler: serialize @acl to its xattr representation and store
 * it on the MDT via md_setxattr (OBD_MD_FLXATTR to set,
 * OBD_MD_FLXATTRRM when @acl is NULL, i.e. removal). ACL_TYPE_ACCESS also
 * updates the file mode (posix_acl_update_mode); ACL_TYPE_DEFAULT is only
 * valid on directories. The local ACL cache is updated/forgotten to match.
 * NOTE(review): the value-free and return lines are elided in this
 * extract.
 */
4787 #ifdef HAVE_IOP_SET_ACL
4788 #ifdef CONFIG_FS_POSIX_ACL
4789 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4791 struct ll_sb_info *sbi = ll_i2sbi(inode);
4792 struct ptlrpc_request *req = NULL;
4793 const char *name = NULL;
4795 size_t value_size = 0;
4800 case ACL_TYPE_ACCESS:
4801 name = XATTR_NAME_POSIX_ACL_ACCESS;
4803 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4806 case ACL_TYPE_DEFAULT:
4807 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4808 if (!S_ISDIR(inode->i_mode))
4809 rc = acl ? -EACCES : 0;
4820 value_size = posix_acl_xattr_size(acl->a_count);
4821 value = kmalloc(value_size, GFP_NOFS);
4823 GOTO(out, rc = -ENOMEM);
4825 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4827 GOTO(out_value, rc);
4830 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4831 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4832 name, value, value_size, 0, 0, &req);
4834 ptlrpc_req_finished(req);
/* keep the local ACL cache coherent with what the MDT now holds */
4839 forget_cached_acl(inode, type);
4841 set_cached_acl(inode, type, acl);
4844 #endif /* CONFIG_FS_POSIX_ACL */
4845 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback for older kernels lacking the 2-arg generic_permission():
 * fetch the access ACL and evaluate it with posix_acl_permission().
 * Under RCU-walk (IPERM_FLAG_RCU) it cannot block, so it bails out
 * (return line elided in this extract). Without CONFIG_FS_POSIX_ACL it
 * degenerates to a stub.
 */
4847 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4849 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4850 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4852 ll_check_acl(struct inode *inode, int mask)
4855 # ifdef CONFIG_FS_POSIX_ACL
4856 struct posix_acl *acl;
4860 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4861 if (flags & IPERM_FLAG_RCU)
4864 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4869 rc = posix_acl_permission(inode, acl, mask);
4870 posix_acl_release(acl);
4873 # else /* !CONFIG_FS_POSIX_ACL */
4875 # endif /* CONFIG_FS_POSIX_ACL */
4877 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * .permission handler (prototype varies by kernel version).
 *
 * Revalidates the root inode first (it is not validated during lookup),
 * then applies root squash when configured: if the caller is root
 * (fsuid 0) and the mount is not NOROOTSQUASH, override credentials to
 * the squash uid/gid and drop filesystem capabilities before running the
 * generic permission check; the original creds are restored afterwards.
 * Under RCU-walk / MAY_NOT_BLOCK the function must not sleep (bail-out
 * return elided in this extract).
 */
4879 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4880 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4882 # ifdef HAVE_INODE_PERMISION_2ARGS
4883 int ll_inode_permission(struct inode *inode, int mask)
4885 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4890 struct ll_sb_info *sbi;
4891 struct root_squash_info *squash;
4892 struct cred *cred = NULL;
4893 const struct cred *old_cred = NULL;
4895 bool squash_id = false;
4898 #ifdef MAY_NOT_BLOCK
4899 if (mask & MAY_NOT_BLOCK)
4901 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4902 if (flags & IPERM_FLAG_RCU)
4906 /* as root inode are NOT getting validated in lookup operation,
4907 * need to do it before permission check. */
4909 if (inode == inode->i_sb->s_root->d_inode) {
4910 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4915 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4916 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4918 /* squash fsuid/fsgid if needed */
4919 sbi = ll_i2sbi(inode);
4920 squash = &sbi->ll_squash;
4921 if (unlikely(squash->rsi_uid != 0 &&
4922 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4923 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4927 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4928 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4929 squash->rsi_uid, squash->rsi_gid);
4931 /* update current process's credentials
4932 * and FS capability */
4933 cred = prepare_creds();
4937 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4938 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities for the squashed creds */
4939 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4940 if ((1 << cap) & CFS_CAP_FS_MASK)
4941 cap_lower(cred->cap_effective, cap);
4943 old_cred = override_creds(cred);
4946 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4947 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4948 /* restore current process's credentials and FS capability */
4950 revert_creds(old_cred);
/*
 * Default file_operations (also used for "-o localflock"): no .flock/.lock
 * entries, so the kernel's local flock/posix lock implementation is used —
 * locks are consistent only within this client node.
 */
4957 /* -o localflock - only provides locally consistent flock locks */
4958 struct file_operations ll_file_operations = {
4959 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4960 # ifdef HAVE_SYNC_READ_WRITE
4961 .read = new_sync_read,
4962 .write = new_sync_write,
4964 .read_iter = ll_file_read_iter,
4965 .write_iter = ll_file_write_iter,
4966 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4967 .read = ll_file_read,
4968 .aio_read = ll_file_aio_read,
4969 .write = ll_file_write,
4970 .aio_write = ll_file_aio_write,
4971 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4972 .unlocked_ioctl = ll_file_ioctl,
4973 .open = ll_file_open,
4974 .release = ll_file_release,
4975 .mmap = ll_file_mmap,
4976 .llseek = ll_file_seek,
4977 .splice_read = ll_file_splice_read,
/*
 * file_operations for the "-o flock" mount option: identical to the
 * default table plus .flock/.lock wired to ll_file_flock(), giving
 * cluster-wide coherent locking through the MDT.
 */
4982 struct file_operations ll_file_operations_flock = {
4983 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4984 # ifdef HAVE_SYNC_READ_WRITE
4985 .read = new_sync_read,
4986 .write = new_sync_write,
4987 # endif /* HAVE_SYNC_READ_WRITE */
4988 .read_iter = ll_file_read_iter,
4989 .write_iter = ll_file_write_iter,
4990 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4991 .read = ll_file_read,
4992 .aio_read = ll_file_aio_read,
4993 .write = ll_file_write,
4994 .aio_write = ll_file_aio_write,
4995 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4996 .unlocked_ioctl = ll_file_ioctl,
4997 .open = ll_file_open,
4998 .release = ll_file_release,
4999 .mmap = ll_file_mmap,
5000 .llseek = ll_file_seek,
5001 .splice_read = ll_file_splice_read,
5004 .flock = ll_file_flock,
5005 .lock = ll_file_flock
/*
 * file_operations for the "-o noflock" mount option: .flock/.lock point at
 * ll_file_noflock() so lock requests are rejected outright.
 */
5008 /* These are for -o noflock - to return ENOSYS on flock calls */
5009 struct file_operations ll_file_operations_noflock = {
5010 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5011 # ifdef HAVE_SYNC_READ_WRITE
5012 .read = new_sync_read,
5013 .write = new_sync_write,
5014 # endif /* HAVE_SYNC_READ_WRITE */
5015 .read_iter = ll_file_read_iter,
5016 .write_iter = ll_file_write_iter,
5017 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5018 .read = ll_file_read,
5019 .aio_read = ll_file_aio_read,
5020 .write = ll_file_write,
5021 .aio_write = ll_file_aio_write,
5022 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5023 .unlocked_ioctl = ll_file_ioctl,
5024 .open = ll_file_open,
5025 .release = ll_file_release,
5026 .mmap = ll_file_mmap,
5027 .llseek = ll_file_seek,
5028 .splice_read = ll_file_splice_read,
5031 .flock = ll_file_noflock,
5032 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files; xattr and ACL entries are
 * conditional on the kernel providing the corresponding iops.
 */
5035 struct inode_operations ll_file_inode_operations = {
5036 .setattr = ll_setattr,
5037 .getattr = ll_getattr,
5038 .permission = ll_inode_permission,
5039 #ifdef HAVE_IOP_XATTR
5040 .setxattr = ll_setxattr,
5041 .getxattr = ll_getxattr,
5042 .removexattr = ll_removexattr,
5044 .listxattr = ll_listxattr,
5045 .fiemap = ll_fiemap,
5046 #ifdef HAVE_IOP_GET_ACL
5047 .get_acl = ll_get_acl,
5049 #ifdef HAVE_IOP_SET_ACL
5050 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack (cl_conf_set).
 * For OBJECT_CONF_SET the associated layout DLM lock is allowed to match
 * only after the layout has been applied — otherwise other threads could
 * see a stale layout — and the cached layout generation is refreshed from
 * the object.
 */
5054 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5056 struct ll_inode_info *lli = ll_i2info(inode);
5057 struct cl_object *obj = lli->lli_clob;
5066 env = cl_env_get(&refcheck);
5068 RETURN(PTR_ERR(env));
5070 rc = cl_conf_set(env, lli->lli_clob, conf);
5074 if (conf->coc_opc == OBJECT_CONF_SET) {
5075 struct ldlm_lock *lock = conf->coc_lock;
5076 struct cl_layout cl = {
5080 LASSERT(lock != NULL);
5081 LASSERT(ldlm_has_layout(lock));
5083 /* it can only be allowed to match after layout is
5084 * applied to inode otherwise false layout would be
5085 * seen. Applying layout shoud happen before dropping
5086 * the intent lock. */
5087 ldlm_lock_allow_match(lock);
5089 rc = cl_object_layout_get(env, obj, &cl);
5094 DFID": layout version change: %u -> %u\n",
5095 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5097 ll_layout_version_set(lli, cl.cl_layout_gen);
5101 cl_env_put(env, &refcheck);
/*
 * Ensure @lock carries layout LVB data. If the lock was granted directly,
 * the layout came back in the DLM LVB; if it was granted via completion
 * AST the LVB buffer was too small, so the layout is re-fetched here with
 * a trusted.lov getxattr and installed into the lock under the resource
 * lock (only if no other thread beat us to it — otherwise our copy is
 * freed).
 */
5106 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5107 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5110 struct ll_sb_info *sbi = ll_i2sbi(inode);
5111 struct ptlrpc_request *req;
5118 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5119 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5120 lock->l_lvb_data, lock->l_lvb_len);
/* already populated — nothing to fetch */
5122 if (lock->l_lvb_data != NULL)
5125 /* if layout lock was granted right away, the layout is returned
5126 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5127 * blocked and then granted via completion ast, we have to fetch
5128 * layout here. Please note that we can't use the LVB buffer in
5129 * completion AST because it doesn't have a large enough buffer */
5130 rc = ll_get_default_mdsize(sbi, &lmmsize);
5134 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5135 XATTR_NAME_LOV, lmmsize, &req);
5138 GOTO(out, rc = 0); /* empty layout */
5145 if (lmmsize == 0) /* empty layout */
5148 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5150 GOTO(out, rc = -EFAULT);
5152 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5153 if (lvbdata == NULL)
5154 GOTO(out, rc = -ENOMEM);
5156 memcpy(lvbdata, lmm, lmmsize);
5157 lock_res_and_lock(lock);
5158 if (unlikely(lock->l_lvb_data == NULL)) {
5159 lock->l_lvb_type = LVB_T_LAYOUT;
5160 lock->l_lvb_data = lvbdata;
5161 lock->l_lvb_len = lmmsize;
5164 unlock_res_and_lock(lock);
/* another thread installed an LVB first: free our copy */
5167 OBD_FREE_LARGE(lvbdata, lmmsize);
5172 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by the lock at @lockh to @inode, then release
 * the lock reference. If the lock's LVB was not yet ready, the layout is
 * fetched (ll_layout_fetch) and configured via OBJECT_CONF_SET. When the
 * configuration returns -EBUSY (old layout still in use by running IO),
 * an OBJECT_CONF_WAIT is issued after dropping the lock so in-flight IO
 * can drain. lvb_ready is checked racily — worst case multiple threads
 * configure the same file concurrently, which is harmless.
 */
5177 * Apply the layout to the inode. Layout lock is held and will be released
5180 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5181 struct inode *inode)
5183 struct ll_inode_info *lli = ll_i2info(inode);
5184 struct ll_sb_info *sbi = ll_i2sbi(inode);
5185 struct ldlm_lock *lock;
5186 struct cl_object_conf conf;
5189 bool wait_layout = false;
5192 LASSERT(lustre_handle_is_used(lockh));
5194 lock = ldlm_handle2lock(lockh);
5195 LASSERT(lock != NULL);
5196 LASSERT(ldlm_has_layout(lock));
5198 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5199 PFID(&lli->lli_fid), inode);
5201 /* in case this is a caching lock and reinstate with new inode */
5202 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5204 lock_res_and_lock(lock);
5205 lvb_ready = ldlm_is_lvb_ready(lock);
5206 unlock_res_and_lock(lock);
5208 /* checking lvb_ready is racy but this is okay. The worst case is
5209 * that multi processes may configure the file on the same time. */
5213 rc = ll_layout_fetch(inode, lock);
5217 /* for layout lock, lmm is stored in lock's lvb.
5218 * lvb_data is immutable if the lock is held so it's safe to access it
5221 * set layout to file. Unlikely this will fail as old layout was
5222 * surely eliminated */
5223 memset(&conf, 0, sizeof conf);
5224 conf.coc_opc = OBJECT_CONF_SET;
5225 conf.coc_inode = inode;
5226 conf.coc_lock = lock;
5227 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5228 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5229 rc = ll_layout_conf(inode, &conf);
5231 /* refresh layout failed, need to wait */
5232 wait_layout = rc == -EBUSY;
5235 LDLM_LOCK_PUT(lock);
5236 ldlm_lock_decref(lockh, mode);
5238 /* wait for IO to complete if it's still being used. */
5240 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5241 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5243 memset(&conf, 0, sizeof conf);
5244 conf.coc_opc = OBJECT_CONF_WAIT;
5245 conf.coc_inode = inode;
5246 rc = ll_layout_conf(inode, &conf);
5250 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5251 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
/*
 * Send an IT_LAYOUT intent RPC to the MDS to instantiate/enqueue a layout
 * lock for @inode. Write/truncate intents request FMODE_WRITE so the MDS
 * instantiates the layout components needed for writing. On success the
 * returned lock is attached to the inode (ll_set_lock_data) and the
 * intent's lock reference is dropped.
 */
5257 * Issue layout intent RPC to MDS.
5258 * \param inode [in] file inode
5259 * \param intent [in] layout intent
5261 * \retval 0 on success
5262 * \retval < 0 error code
5264 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5266 struct ll_inode_info *lli = ll_i2info(inode);
5267 struct ll_sb_info *sbi = ll_i2sbi(inode);
5268 struct md_op_data *op_data;
5269 struct lookup_intent it;
5270 struct ptlrpc_request *req;
5274 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5275 0, 0, LUSTRE_OPC_ANY, NULL);
5276 if (IS_ERR(op_data))
5277 RETURN(PTR_ERR(op_data));
5279 op_data->op_data = intent;
5280 op_data->op_data_size = sizeof(*intent);
5282 memset(&it, 0, sizeof(it));
5283 it.it_op = IT_LAYOUT;
5284 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5285 intent->li_opc == LAYOUT_INTENT_TRUNC)
5286 it.it_flags = FMODE_WRITE;
5288 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5289 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5291 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5292 &ll_md_blocking_ast, 0);
5293 if (it.it_request != NULL)
5294 ptlrpc_req_finished(it.it_request);
5295 it.it_request = NULL;
5297 ll_finish_md_op_data(op_data);
5299 /* set lock data in case this is a new lock */
5301 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5303 ll_intent_drop_lock(&it);
5309 * This function checks if there exists a LAYOUT lock on the client side,
5310 * or enqueues it if it doesn't have one in cache.
* This function does not hold the layout lock, so it may be revoked any time
* after this function returns. Any operations that depend on the layout should
* be redone
* This function should be called before lov_io_init() to get an up-to-date
5317 * layout version, the caller should save the version number and after IO
5318 * is finished, this function should be called again to verify that layout
5319 * is not changed during IO time.
/* Ensure this client has an up-to-date LAYOUT lock for @inode and report the
 * current layout generation through @gen.
 *
 * NOTE(review): interior lines of this function are missing from this
 * excerpt (opening brace, "rc" declaration, the early return for the
 * nothing-to-do case, the enqueue retry loop and the close of the
 * cached-lock branch); the visible statements are kept unchanged. */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;	/* handle of a locally matched layout lock */
	/* Default intent: plain read access to the layout. */
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	enum ldlm_mode mode;
	/* Start from whatever generation is currently cached on the inode. */
	*gen = ll_layout_version_get(lli);
	/* No work needed if layout locks are disabled for this mount or the
	 * cached generation is already valid. */
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
	/* Layout locks only make sense on sane-FID regular files. */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));
	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);
	/* mostly layout lock is caching on the local side, so try to
	 * match it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		rc = ll_layout_lock_set(&lockh, mode, inode);
	/* No usable cached lock: go to the MDS with a layout intent. */
	rc = ll_layout_intent(inode, &intent);
	/* Re-read the generation now that the layout has been (re)fetched. */
	*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);
5369 * Issue layout intent RPC indicating where in a file an IO is about to write.
5371 * \param[in] inode file inode.
* \param[in] ext write range with start offset of file in bytes where
5373 * an IO is about to write, and exclusive end offset in
5376 * \retval 0 on success
5377 * \retval < 0 error code
/* Tell the MDS which byte range of the file an IO is about to modify
 * (opc selects write vs. truncate) so the covering layout components can be
 * instantiated before the IO starts.
 *
 * NOTE(review): this excerpt is missing the ".li_opc = opc" initializer
 * line, the "rc" declaration and the function's braces/return; the visible
 * statements are kept unchanged. */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
	/* Carry the caller's extent in the intent body. */
	struct layout_intent intent = {
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	rc = ll_layout_intent(inode, &intent);
5396 * This function send a restore request to the MDT
5398 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5400 struct hsm_user_request *hur;
5404 len = sizeof(struct hsm_user_request) +
5405 sizeof(struct hsm_user_item);
5406 OBD_ALLOC(hur, len);
5410 hur->hur_request.hr_action = HUA_RESTORE;
5411 hur->hur_request.hr_archive_id = 0;
5412 hur->hur_request.hr_flags = 0;
5413 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5414 sizeof(hur->hur_user_item[0].hui_fid));
5415 hur->hur_user_item[0].hui_extent.offset = offset;
5416 hur->hur_user_item[0].hui_extent.length = length;
5417 hur->hur_request.hr_itemcount = 1;
5418 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,