4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): the lines below are fragments — the enclosing struct
 * definitions and the full prototypes are elided from this listing.
 * sp_inode presumably belongs to struct split_param and pa_data_version
 * to struct pcc_param (used later by the MDS_CLOSE_LAYOUT_SPLIT and
 * MDS_PCC_ATTACH close paths) — confirm against the full file. */
57 struct inode *sp_inode;
62 __u64 pa_data_version;
/* Forward declarations for helpers defined later in this file. */
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data from its dedicated slab
 * (GFP_NOFS: may be called on paths where FS re-entry must be avoided)
 * and initialise the fields visible here.  NOTE(review): the allocation
 * NULL-check and the return statement are elided in this listing. */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
/* Release a ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
/* Fill op_data with the current VFS inode attributes so the MDT sees
 * the client's latest mode/times/size at close time. */
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
/* NOTE(review): the ia_valid mask expression continues on an elided
 * line (the closing ATTR_* flags are not visible here). */
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
/* Identify which open handle this close applies to. */
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* Bail out early if the MDC export is already disconnected. */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
/* Dispatch on the close bias; each case packs intent-specific data.
 * NOTE(review): surrounding switch statement and several break/brace
 * lines are elided from this listing — whether MERGE falls through to
 * SPLIT/SWAP intentionally cannot be confirmed from here. */
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
166 case MDS_CLOSE_LAYOUT_SPLIT:
167 case MDS_CLOSE_LAYOUT_SWAP: {
168 struct split_param *sp = data;
170 LASSERT(data != NULL);
171 op_data->op_bias |= bias;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
174 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
175 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
176 op_data->op_mirror_id = sp->sp_mirror_id;
178 op_data->op_fid2 = *ll_inode2fid(data);
/* Resync-done close: report the resynced mirror ids to the MDT. */
183 case MDS_CLOSE_RESYNC_DONE: {
184 struct ll_ioc_lease *ioc = data;
186 LASSERT(data != NULL);
187 op_data->op_attr_blocks +=
188 ioc->lil_count * op_data->op_attr_blocks;
189 op_data->op_attr.ia_valid |= ATTR_SIZE;
190 op_data->op_xvalid |= OP_XVALID_BLOCKS;
191 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_data = &ioc->lil_ids[0];
195 op_data->op_data_size =
196 ioc->lil_count * sizeof(ioc->lil_ids[0]);
/* PCC attach piggybacks on an HSM release with archive id/version. */
200 case MDS_PCC_ATTACH: {
201 struct pcc_param *param = data;
203 LASSERT(data != NULL);
204 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
205 op_data->op_archive_id = param->pa_archive_id;
206 op_data->op_data_version = param->pa_data_version;
207 op_data->op_lease_handle = och->och_lease_handle;
/* Plain HSM release: data is a pointer to the data version. */
211 case MDS_HSM_RELEASE:
212 LASSERT(data != NULL);
213 op_data->op_bias |= MDS_HSM_RELEASE;
214 op_data->op_data_version = *(__u64 *)data;
215 op_data->op_lease_handle = och->och_lease_handle;
216 op_data->op_attr.ia_valid |= ATTR_SIZE;
217 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* Default (unbiased) close carries no intent payload. */
221 LASSERT(data == NULL);
/* If size/blocks were not packed exactly, let the MDT treat them as
 * lazily-updated values rather than authoritative ones. */
225 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
226 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
227 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
228 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
230 rc = md_close(md_exp, op_data, och->och_mod, &req);
231 if (rc != 0 && rc != -EINTR)
232 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
233 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On success, check whether the server actually executed the intent. */
235 if (rc == 0 && op_data->op_bias & bias) {
236 struct mdt_body *body;
238 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
239 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 if (bias & MDS_PCC_ATTACH) {
243 struct pcc_param *param = data;
/* Return the new layout generation to the PCC caller. */
245 param->pa_layout_gen = body->mbo_layout_gen;
249 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
253 md_clear_open_replay_data(md_exp, och);
254 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
257 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given open mode (read/write/exec)
 * for @inode, unless other file descriptors still use it. */
261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
263 struct ll_inode_info *lli = ll_i2info(inode);
264 struct obd_client_handle **och_p;
265 struct obd_client_handle *och;
/* Select the per-mode open handle and its use count. */
270 if (fmode & FMODE_WRITE) {
271 och_p = &lli->lli_mds_write_och;
272 och_usecount = &lli->lli_open_fd_write_count;
273 } else if (fmode & FMODE_EXEC) {
274 och_p = &lli->lli_mds_exec_och;
275 och_usecount = &lli->lli_open_fd_exec_count;
277 LASSERT(fmode & FMODE_READ);
278 och_p = &lli->lli_mds_read_och;
279 och_usecount = &lli->lli_open_fd_read_count;
282 mutex_lock(&lli->lli_och_mutex);
283 if (*och_usecount > 0) {
284 /* There are still users of this handle, so skip
286 mutex_unlock(&lli->lli_och_mutex);
292 mutex_unlock(&lli->lli_och_mutex);
295 /* There might be a race and this handle may already
297 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: drop group lock and lease if held, close
 * any standalone handle, decrement the per-mode open counts, and close
 * the MDS handle unless a cached OPEN lock lets us skip the RPC. */
303 static int ll_md_close(struct inode *inode, struct file *file)
305 union ldlm_policy_data policy = {
306 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, don't take a ref. */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310 struct ll_inode_info *lli = ll_i2info(inode);
311 struct lustre_handle lockh;
312 enum ldlm_mode lockmode;
316 /* clear group lock, if present */
317 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
318 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
320 if (fd->fd_lease_och != NULL) {
323 /* Usually the lease is not released when the
324 * application crashed, we need to release here. */
325 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
326 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
327 PFID(&lli->lli_fid), rc, lease_broken);
329 fd->fd_lease_och = NULL;
/* Close the handle that was taken over for a lease (if any). */
332 if (fd->fd_och != NULL) {
333 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
338 /* Let's see if we have good enough OPEN lock on the file and if
339 we can skip talking to MDS */
340 mutex_lock(&lli->lli_och_mutex);
/* NOTE(review): lines assigning lockmode per branch are elided here. */
341 if (fd->fd_omode & FMODE_WRITE) {
343 LASSERT(lli->lli_open_fd_write_count);
344 lli->lli_open_fd_write_count--;
345 } else if (fd->fd_omode & FMODE_EXEC) {
347 LASSERT(lli->lli_open_fd_exec_count);
348 lli->lli_open_fd_exec_count--;
351 LASSERT(lli->lli_open_fd_read_count);
352 lli->lli_open_fd_read_count--;
354 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock: must do the real close RPC. */
356 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
357 LDLM_IBITS, &policy, lockmode, &lockh))
358 rc = ll_md_real_close(inode, fd->fd_omode);
361 LUSTRE_FPRIVATE(file) = NULL;
362 ll_file_data_put(fd);
367 /* While this returns an error code, fput() the caller does not, so we need
368 * to make every effort to clean up all of our state here. Also, applications
369 * rarely check close errors and even if an error is returned they will not
370 * re-try the close call.
372 int ll_file_release(struct inode *inode, struct file *file)
374 struct ll_file_data *fd;
375 struct ll_sb_info *sbi = ll_i2sbi(inode);
376 struct ll_inode_info *lli = ll_i2info(inode);
380 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
381 PFID(ll_inode2fid(inode)), inode);
/* Don't account releases of the filesystem root in stats. */
383 if (inode->i_sb->s_root != file_dentry(file))
384 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
385 fd = LUSTRE_FPRIVATE(file);
388 /* The last ref on @file, maybe not the the owner pid of statahead,
389 * because parent and child process can share the same file handle. */
390 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
391 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing was opened on the MDS, just free fd. */
393 if (inode->i_sb->s_root == file_dentry(file)) {
394 LUSTRE_FPRIVATE(file) = NULL;
395 ll_file_data_put(fd);
399 pcc_file_release(inode, file);
/* For regular files, fold any async write errors into lli_async_rc so
 * they can be reported; then clear for the next opener. */
401 if (!S_ISDIR(inode->i_mode)) {
402 if (lli->lli_clob != NULL)
403 lov_read_and_clear_async_rc(lli->lli_clob);
404 lli->lli_async_rc = 0;
407 rc = ll_md_close(inode, file);
409 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
410 libcfs_debug_dumplog();
/* read_cache_page() filler: copy Data-on-MDT bytes carried in the
 * niobuf_local into @page, zero the tail past lnb_len, and mark the
 * page up to date.  NOTE(review): kaddr declaration and return are on
 * elided lines. */
415 static inline int ll_dom_readpage(void *data, struct page *page)
417 struct niobuf_local *lnb = data;
420 kaddr = ll_kmap_atomic(page, KM_USER0);
421 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial page: clear the remainder so no stale data is exposed. */
422 if (lnb->lnb_len < PAGE_SIZE)
423 memset(kaddr + lnb->lnb_len, 0,
424 PAGE_SIZE - lnb->lnb_len);
425 flush_dcache_page(page);
426 SetPageUptodate(page);
427 ll_kunmap_atomic(kaddr, KM_USER0);
/* Populate the page cache with Data-on-MDT file data returned inline
 * in the open reply (@req), avoiding a separate read RPC. */
433 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
434 struct lookup_intent *it)
436 struct ll_inode_info *lli = ll_i2info(inode);
437 struct cl_object *obj = lli->lli_clob;
438 struct address_space *mapping = inode->i_mapping;
440 struct niobuf_remote *rnb;
441 struct mdt_body *body;
443 unsigned long index, start;
444 struct niobuf_local lnb;
/* No inline niobuf in the reply: nothing to do. */
451 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
455 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
456 if (rnb == NULL || rnb->rnb_len == 0)
459 /* LU-11595: Server may return whole file and that is OK always or
460 * it may return just file tail and its offset must be aligned with
461 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
462 * smaller then offset may be not aligned and that data is just ignored.
464 if (rnb->rnb_offset % PAGE_SIZE)
467 /* Server returns whole file or just file tail if it fills in reply
468 * buffer, in both cases total size should be equal to the file size.
470 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
471 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
472 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
473 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
474 rnb->rnb_len, body->mbo_dom_size);
478 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
479 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* Inline payload immediately follows the niobuf_remote descriptor. */
481 data = (char *)rnb + sizeof(*rnb);
483 lnb.lnb_file_offset = rnb->rnb_offset;
484 start = lnb.lnb_file_offset / PAGE_SIZE;
486 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
487 lnb.lnb_page_offset = 0;
/* Copy the payload page by page via the ll_dom_readpage filler.
 * NOTE(review): the do { ... } opening and index init are elided. */
489 lnb.lnb_data = data + (index << PAGE_SHIFT);
490 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
491 if (lnb.lnb_len > PAGE_SIZE)
492 lnb.lnb_len = PAGE_SIZE;
494 vmpage = read_cache_page(mapping, index + start,
495 ll_dom_readpage, &lnb);
496 if (IS_ERR(vmpage)) {
497 CWARN("%s: cannot fill page %lu for "DFID
498 " with data: rc = %li\n",
499 ll_i2sbi(inode)->ll_fsname, index + start,
500 PFID(lu_object_fid(&obj->co_lu)),
506 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Send an intent-open to the MDS for @de.  Packs the file name only
 * when the server lacks open-by-fid support (or under fault injection),
 * then sets up the inode, dentry validity and DoM data from the reply. */
510 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
511 struct lookup_intent *itp)
513 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
514 struct dentry *parent = de->d_parent;
517 struct md_op_data *op_data;
518 struct ptlrpc_request *req = NULL;
522 LASSERT(parent != NULL);
523 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
525 /* if server supports open-by-fid, or file name is invalid, don't pack
526 * name in open request */
527 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
528 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
530 len = de->d_name.len;
531 name = kmalloc(len + 1, GFP_NOFS);
/* Snapshot the name under d_lock; retry/abort if a concurrent rename
 * changed the length between the unlocked read and the copy. */
536 spin_lock(&de->d_lock);
537 if (len != de->d_name.len) {
538 spin_unlock(&de->d_lock);
542 memcpy(name, de->d_name.name, len);
544 spin_unlock(&de->d_lock);
546 if (!lu_name_is_valid_2(name, len)) {
552 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
553 name, len, 0, LUSTRE_OPC_ANY, NULL);
554 if (IS_ERR(op_data)) {
556 RETURN(PTR_ERR(op_data));
/* Optional striping (lmm) is passed through to the open request. */
558 op_data->op_data = lmm;
559 op_data->op_data_size = lmmsize;
561 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
562 &ll_md_blocking_ast, 0);
564 ll_finish_md_op_data(op_data);
566 /* reason for keep own exit path - don`t flood log
567 * with messages with -ESTALE errors.
/* NOTE(review): error-branch control lines around here are elided;
 * on enqueue failure any granted open handle is released below. */
569 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
570 it_open_error(DISP_OPEN_OPEN, itp))
572 ll_release_openhandle(de, itp);
576 if (it_disposition(itp, DISP_LOOKUP_NEG))
577 GOTO(out, rc = -ENOENT);
579 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
580 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
581 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
585 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
587 if (!rc && itp->it_lock_mode) {
588 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
589 struct ldlm_lock *lock;
590 bool has_dom_bit = false;
592 /* If we got a lock back and it has a LOOKUP bit set,
593 * make sure the dentry is marked as valid so we can find it.
594 * We don't need to care about actual hashing since other bits
595 * of kernel will deal with that later.
597 lock = ldlm_handle2lock(&handle);
599 has_dom_bit = ldlm_has_dom(lock);
600 if (lock->l_policy_data.l_inodebits.bits &
601 MDS_INODELOCK_LOOKUP)
602 d_lustre_revalidate(de);
606 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* If the reply carries inline DoM data, seed the page cache now. */
608 ll_dom_finish_open(de->d_inode, req, itp);
612 ptlrpc_req_finished(req);
613 ll_intent_drop_lock(itp);
615 /* We did open by fid, but by the time we got to the server,
616 * the object disappeared. If this is a create, we cannot really
617 * tell the userspace that the file it was trying to create
618 * does not exist. Instead let's return -ESTALE, and the VFS will
619 * retry the create with LOOKUP_REVAL that we are going to catch
620 * in ll_revalidate_dentry() and use lookup then.
622 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Fill an obd_client_handle from the MDT reply carried in @it and
 * register it for open replay on recovery. */
628 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
629 struct obd_client_handle *och)
631 struct mdt_body *body;
633 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
634 och->och_open_handle = body->mbo_open_handle;
635 och->och_fid = body->mbo_fid1;
/* The lease handle reuses the lock handle from the intent. */
636 och->och_lease_handle.cookie = it->it_lock_handle;
637 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
638 och->och_flags = it->it_flags;
640 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from
 * the intent reply, then attach @fd to the file and initialise its
 * readahead state, open mode and cl_io context bookkeeping. */
643 static int ll_local_open(struct file *file, struct lookup_intent *it,
644 struct ll_file_data *fd, struct obd_client_handle *och)
646 struct inode *inode = file_inode(file);
649 LASSERT(!LUSTRE_FPRIVATE(file));
656 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
661 LUSTRE_FPRIVATE(file) = fd;
662 ll_readahead_init(inode, &fd->fd_ras);
663 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
665 /* ll_cl_context initialize */
666 rwlock_init(&fd->fd_lock);
667 INIT_LIST_HEAD(&fd->fd_lccs);
672 /* Open a file, and (for the very first open) create objects on the OSTs at
673 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
674 * creation or open until ll_lov_setstripe() ioctl is called.
676 * If we already have the stripe MD locally then we don't request it in
677 * md_open(), by passing a lmm_size = 0.
679 * It is up to the application to ensure no other processes open this file
680 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
681 * used. We might be able to avoid races of that sort by getting lli_open_sem
682 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
683 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
685 int ll_file_open(struct inode *inode, struct file *file)
687 struct ll_inode_info *lli = ll_i2info(inode);
688 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
689 .it_flags = file->f_flags };
690 struct obd_client_handle **och_p = NULL;
691 __u64 *och_usecount = NULL;
692 struct ll_file_data *fd;
696 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
697 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been attached by the lookup path; detach it so
 * ll_local_open()'s assertion on private_data holds. */
699 it = file->private_data; /* XXX: compat macro */
700 file->private_data = NULL; /* prevent ll_local_open assertion */
702 fd = ll_file_data_get();
704 GOTO(out_nofiledata, rc = -ENOMEM);
707 if (S_ISDIR(inode->i_mode))
708 ll_authorize_statahead(inode, fd);
/* Root of the filesystem: no MDS open needed, just attach fd. */
710 if (inode->i_sb->s_root == file_dentry(file)) {
711 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
715 if (!it || !it->it_disposition) {
716 /* Convert f_flags into access mode. We cannot use file->f_mode,
717 * because everything but O_ACCMODE mask was stripped from
719 if ((oit.it_flags + 1) & O_ACCMODE)
721 if (file->f_flags & O_TRUNC)
722 oit.it_flags |= FMODE_WRITE;
724 /* kernel only call f_op->open in dentry_open. filp_open calls
725 * dentry_open after call to open_namei that checks permissions.
726 * Only nfsd_open call dentry_open directly without checking
727 * permissions and because of that this code below is safe.
729 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
730 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
732 /* We do not want O_EXCL here, presumably we opened the file
733 * already? XXX - NFS implications? */
734 oit.it_flags &= ~O_EXCL;
736 /* bug20584, if "it_flags" contains O_CREAT, the file will be
737 * created if necessary, then "IT_CREAT" should be set to keep
738 * consistent with it */
739 if (oit.it_flags & O_CREAT)
740 oit.it_op |= IT_CREAT;
746 /* Let's see if we have file open on MDS already. */
747 if (it->it_flags & FMODE_WRITE) {
748 och_p = &lli->lli_mds_write_och;
749 och_usecount = &lli->lli_open_fd_write_count;
750 } else if (it->it_flags & FMODE_EXEC) {
751 och_p = &lli->lli_mds_exec_och;
752 och_usecount = &lli->lli_open_fd_exec_count;
754 och_p = &lli->lli_mds_read_och;
755 och_usecount = &lli->lli_open_fd_read_count;
758 mutex_lock(&lli->lli_och_mutex);
759 if (*och_p) { /* Open handle is present */
760 if (it_disposition(it, DISP_OPEN_OPEN)) {
761 /* Well, there's extra open request that we do not need,
762 let's close it somehow. This will decref request. */
763 rc = it_open_error(DISP_OPEN_OPEN, it);
765 mutex_unlock(&lli->lli_och_mutex);
766 GOTO(out_openerr, rc);
769 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS handle; NULL och means "shared handle". */
773 rc = ll_local_open(file, it, fd, NULL);
776 mutex_unlock(&lli->lli_och_mutex);
777 GOTO(out_openerr, rc);
780 LASSERT(*och_usecount == 0);
781 if (!it->it_disposition) {
782 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
783 /* We cannot just request lock handle now, new ELC code
784 means that one of other OPEN locks for this file
785 could be cancelled, and since blocking ast handler
786 would attempt to grab och_mutex as well, that would
787 result in a deadlock */
788 mutex_unlock(&lli->lli_och_mutex);
790 * Normally called under two situations:
792 * 2. A race/condition on MDS resulting in no open
793 * handle to be returned from LOOKUP|OPEN request,
794 * for example if the target entry was a symlink.
796 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
797 * marked by a bit set in ll_iget_for_nfs. Clear the
798 * bit so that it's not confusing later callers.
800 * NB; when ldd is NULL, it must have come via normal
801 * lookup path only, since ll_iget_for_nfs always calls
804 if (ldd && ldd->lld_nfs_dentry) {
805 ldd->lld_nfs_dentry = 0;
806 it->it_flags |= MDS_OPEN_LOCK;
810 * Always specify MDS_OPEN_BY_FID because we don't want
811 * to get file with different fid.
813 it->it_flags |= MDS_OPEN_BY_FID;
814 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
817 GOTO(out_openerr, rc);
821 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
823 GOTO(out_och_free, rc = -ENOMEM);
827 /* md_intent_lock() didn't get a request ref if there was an
828 * open error, so don't do cleanup on the request here
830 /* XXX (green): Should not we bail out on any error here, not
831 * just open error? */
832 rc = it_open_error(DISP_OPEN_OPEN, it);
834 GOTO(out_och_free, rc);
836 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
837 "inode %p: disposition %x, status %d\n", inode,
838 it_disposition(it, ~0), it->it_status);
840 rc = ll_local_open(file, it, fd, *och_p);
842 GOTO(out_och_free, rc);
845 rc = pcc_file_open(inode, file);
847 GOTO(out_och_free, rc);
849 mutex_unlock(&lli->lli_och_mutex);
852 /* Must do this outside lli_och_mutex lock to prevent deadlock where
853 different kind of OPEN lock for this same inode gets cancelled
854 by ldlm_cancel_lru */
855 if (!S_ISREG(inode->i_mode))
856 GOTO(out_och_free, rc);
858 cl_lov_delay_create_clear(&file->f_flags);
859 GOTO(out_och_free, rc);
/* Error/cleanup paths: free a half-initialised handle, undo the
 * statahead authorization and the fd allocation. */
863 if (och_p && *och_p) {
864 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
865 *och_p = NULL; /* OBD_FREE writes some magic there */
868 mutex_unlock(&lli->lli_och_mutex);
871 if (lli->lli_opendir_key == fd)
872 ll_deauthorize_statahead(inode, fd);
875 ll_file_data_put(fd);
877 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference held by DISP_ENQ_OPEN_REF. */
881 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
882 ptlrpc_req_finished(it->it_request);
883 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously (no openhandle bookkeeping is done here — see the
 * LDLM_FL_EXCL comment in ll_lease_open()). */
889 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
890 struct ldlm_lock_desc *desc, void *data, int flag)
893 struct lustre_handle lockh;
897 case LDLM_CB_BLOCKING:
898 ldlm_lock2handle(lock, &lockh);
899 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
901 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
905 case LDLM_CB_CANCELING:
913 * When setting a lease on a file, we take ownership of the lli_mds_*_och
914 * and save it as fd->fd_och so as to force client to reopen the file even
915 * if it has an open lock in cache already.
917 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
918 struct lustre_handle *old_open_handle)
920 struct ll_inode_info *lli = ll_i2info(inode);
921 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
922 struct obd_client_handle **och_p;
927 /* Get the openhandle of the file */
928 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
929 if (fd->fd_lease_och != NULL)
930 GOTO(out_unlock, rc = -EBUSY);
932 if (fd->fd_och == NULL) {
933 if (file->f_mode & FMODE_WRITE) {
934 LASSERT(lli->lli_mds_write_och != NULL);
935 och_p = &lli->lli_mds_write_och;
936 och_usecount = &lli->lli_open_fd_write_count;
938 LASSERT(lli->lli_mds_read_och != NULL);
939 och_p = &lli->lli_mds_read_och;
940 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take ownership while other descriptors share the handle. */
943 if (*och_usecount > 1)
944 GOTO(out_unlock, rc = -EBUSY);
/* Hand the original open handle back to the caller so the MDT can
 * verify the lease request comes from the same owner. */
951 *old_open_handle = fd->fd_och->och_open_handle;
955 mutex_unlock(&lli->lli_och_mutex);
960 * Release ownership on lli_mds_*_och when putting back a file lease.
962 static int ll_lease_och_release(struct inode *inode, struct file *file)
964 struct ll_inode_info *lli = ll_i2info(inode);
965 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
966 struct obd_client_handle **och_p;
967 struct obd_client_handle *old_och = NULL;
972 mutex_lock(&lli->lli_och_mutex);
973 if (file->f_mode & FMODE_WRITE) {
974 och_p = &lli->lli_mds_write_och;
975 och_usecount = &lli->lli_open_fd_write_count;
977 och_p = &lli->lli_mds_read_och;
978 och_usecount = &lli->lli_open_fd_read_count;
981 /* The file may have been open by another process (broken lease) so
982 * *och_p is not NULL. In this case we should simply increase usecount
985 if (*och_p != NULL) {
986 old_och = fd->fd_och;
993 mutex_unlock(&lli->lli_och_mutex);
/* Close the superseded handle outside the mutex. */
996 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1002 * Acquire a lease and open the file.
1004 static struct obd_client_handle *
1005 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1008 struct lookup_intent it = { .it_op = IT_OPEN };
1009 struct ll_sb_info *sbi = ll_i2sbi(inode);
1010 struct md_op_data *op_data;
1011 struct ptlrpc_request *req = NULL;
1012 struct lustre_handle old_open_handle = { 0 };
1013 struct obd_client_handle *och = NULL;
/* A lease must be exactly read or exactly write — never both. */
1018 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1019 RETURN(ERR_PTR(-EINVAL));
/* The requested lease mode must be covered by the file's open mode. */
1022 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1023 RETURN(ERR_PTR(-EPERM));
1025 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1027 RETURN(ERR_PTR(rc));
1032 RETURN(ERR_PTR(-ENOMEM));
1034 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1035 LUSTRE_OPC_ANY, NULL);
1036 if (IS_ERR(op_data))
1037 GOTO(out, rc = PTR_ERR(op_data));
1039 /* To tell the MDT this openhandle is from the same owner */
1040 op_data->op_open_handle = old_open_handle;
1042 it.it_flags = fmode | open_flags;
1043 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1044 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1045 &ll_md_blocking_lease_ast,
1046 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1047 * it can be cancelled which may mislead applications that the lease is
1049 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1050 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1051 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1052 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1053 ll_finish_md_op_data(op_data);
1054 ptlrpc_req_finished(req);
1056 GOTO(out_release_it, rc);
1058 if (it_disposition(&it, DISP_LOOKUP_NEG))
1059 GOTO(out_release_it, rc = -ENOENT);
1061 rc = it_open_error(DISP_OPEN_OPEN, &it);
1063 GOTO(out_release_it, rc);
1065 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1066 ll_och_fill(sbi->ll_md_exp, &it, och);
1068 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1069 GOTO(out_close, rc = -EOPNOTSUPP);
1071 /* already get lease, handle lease lock */
1072 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1073 if (it.it_lock_mode == 0 ||
1074 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1075 /* open lock must return for lease */
1076 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1077 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1079 GOTO(out_close, rc = -EPROTO);
1082 ll_intent_release(&it);
/* Error path: undo the lease lock and the open handle. */
1086 /* Cancel open lock */
1087 if (it.it_lock_mode != 0) {
1088 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1090 it.it_lock_mode = 0;
1091 och->och_lease_handle.cookie = 0ULL;
1093 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1095 CERROR("%s: error closing file "DFID": %d\n",
1096 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1097 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1099 ll_intent_release(&it);
1103 RETURN(ERR_PTR(rc));
1107 * Check whether a layout swap can be done between two inodes.
1109 * \param[in] inode1 First inode to check
1110 * \param[in] inode2 Second inode to check
1112 * \retval 0 on success, layout swap can be performed between both inodes
1113 * \retval negative error code if requirements are not met
1115 static int ll_check_swap_layouts_validity(struct inode *inode1,
1116 struct inode *inode2)
/* Both must be regular files, writable by the caller, and live on the
 * same filesystem.  NOTE(review): the error-return lines for each
 * failing check are elided in this listing. */
1118 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1121 if (inode_permission(inode1, MAY_WRITE) ||
1122 inode_permission(inode2, MAY_WRITE))
1125 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias: atomically swap (or
 * merge) layouts between @inode and @inode2 on the MDT as part of the
 * close RPC.  @och is consumed on success. */
1131 static int ll_swap_layouts_close(struct obd_client_handle *och,
1132 struct inode *inode, struct inode *inode2)
1134 const struct lu_fid *fid1 = ll_inode2fid(inode);
1135 const struct lu_fid *fid2;
1139 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1140 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1142 rc = ll_check_swap_layouts_validity(inode, inode2);
1144 GOTO(out_free_och, rc);
1146 /* We now know that inode2 is a lustre inode */
1147 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is meaningless — reject it. */
1149 rc = lu_fid_cmp(fid1, fid2);
1151 GOTO(out_free_och, rc = -EINVAL);
1153 /* Close the file and {swap,merge} layouts between inode & inode2.
1154 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1155 * because we still need it to pack l_remote_handle to MDT. */
1156 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1159 och = NULL; /* freed in ll_close_inode_openhandle() */
1169 * Release lease and close the file.
1170 * It will check if the lease has ever broken.
1172 static int ll_lease_close_intent(struct obd_client_handle *och,
1173 struct inode *inode,
1174 bool *lease_broken, enum mds_op_bias bias,
1177 struct ldlm_lock *lock;
1178 bool cancelled = true;
/* Check under the lock's resource lock whether the lease lock was
 * already cancelled (i.e. the lease was broken). */
1182 lock = ldlm_handle2lock(&och->och_lease_handle);
1184 lock_res_and_lock(lock);
1185 cancelled = ldlm_is_cancel(lock);
1186 unlock_res_and_lock(lock);
1187 LDLM_LOCK_PUT(lock);
1190 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1191 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1193 if (lease_broken != NULL)
1194 *lease_broken = cancelled;
/* An intact lease with no intent is cancelled voluntarily here. */
1196 if (!cancelled && !bias)
1197 ldlm_cli_cancel(&och->och_lease_handle, 0);
1199 if (cancelled) { /* no need to excute intent */
1204 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Convenience wrapper: release a lease with no close intent. */
1208 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1211 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1215 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1217 static int ll_lease_file_resync(struct obd_client_handle *och,
1218 struct inode *inode, unsigned long arg)
1220 struct ll_sb_info *sbi = ll_i2sbi(inode);
1221 struct md_op_data *op_data;
1222 struct ll_ioc_lease_id ioc;
/* Data version is only fetched for its flush side effect below. */
1223 __u64 data_version_unused;
1227 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1228 LUSTRE_OPC_ANY, NULL);
1229 if (IS_ERR(op_data))
1230 RETURN(PTR_ERR(op_data));
/* @arg is a userspace pointer to struct ll_ioc_lease_id. */
1232 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1236 /* before starting file resync, it's necessary to clean up page cache
1237 * in client memory, otherwise once the layout version is increased,
1238 * writing back cached data will be denied the OSTs. */
1239 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1243 op_data->op_lease_handle = och->och_lease_handle;
1244 op_data->op_mirror_id = ioc.lil_mirror_id;
1245 rc = md_file_resync(sbi->ll_md_exp, op_data);
1251 ll_finish_md_op_data(op_data);
/* Merge MDS-provided inode attributes with OST-side (cl_object) attributes:
 * takes the newest of each timestamp and refreshes size/blocks under the
 * inode size lock. */
1255 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1257 struct ll_inode_info *lli = ll_i2info(inode);
1258 struct cl_object *obj = lli->lli_clob;
1259 struct cl_attr *attr = vvp_env_thread_attr(env);
1267 ll_inode_size_lock(inode);
1269 /* Merge timestamps the most recently obtained from MDS with
1270 * timestamps obtained from OSTs.
1272 * Do not overwrite atime of inode because it may be refreshed
1273 * by file_accessed() function. If the read was served by cache
1274 * data, there is no RPC to be sent so that atime may not be
1275 * transferred to OSTs at all. MDT only updates atime at close time
1276 * if it's at least 'mdd.*.atime_diff' older.
1277 * All in all, the atime in Lustre does not strictly comply with
1278 * POSIX. Solving this problem needs to send an RPC to MDT for each
1279 * read, this will hurt performance.
1281 if (inode->i_atime.tv_sec < lli->lli_atime ||
1282 lli->lli_update_atime) {
1283 inode->i_atime.tv_sec = lli->lli_atime;
1284 lli->lli_update_atime = 0;
1286 inode->i_mtime.tv_sec = lli->lli_mtime;
1287 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Snapshot inode timestamps before comparing against OST values. */
1289 mtime = inode->i_mtime.tv_sec;
1290 atime = inode->i_atime.tv_sec;
1291 ctime = inode->i_ctime.tv_sec;
1293 cl_object_attr_lock(obj);
/* Fault-injection hook for testing the merge path. */
1294 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1297 rc = cl_object_attr_get(env, obj, attr);
1298 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects) is not an error for the merge. */
1301 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1303 if (atime < attr->cat_atime)
1304 atime = attr->cat_atime;
1306 if (ctime < attr->cat_ctime)
1307 ctime = attr->cat_ctime;
1309 if (mtime < attr->cat_mtime)
1310 mtime = attr->cat_mtime;
1312 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1313 PFID(&lli->lli_fid), attr->cat_size);
1315 i_size_write(inode, attr->cat_size);
1316 inode->i_blocks = attr->cat_blocks;
1318 inode->i_mtime.tv_sec = mtime;
1319 inode->i_atime.tv_sec = atime;
1320 inode->i_ctime.tv_sec = ctime;
1323 ll_inode_size_unlock(inode);
1329 * Set designated mirror for I/O.
1331 * So far only read, write, and truncate can support to issue I/O to
1332 * designated mirror.
1334 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1336 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1338 /* clear layout version for generic(non-resync) I/O in case it carries
1339 * stale layout version due to I/O restart */
1340 io->ci_layout_version = 0;
1342 /* FLR: disable non-delay for designated mirror I/O because obviously
1343 * only one mirror is available */
1344 if (fd->fd_designated_mirror > 0) {
/* Propagate the per-fd mirror selection and its layout version. */
1346 io->ci_designated_mirror = fd->fd_designated_mirror;
1347 io->ci_layout_version = fd->fd_layout_version;
1350 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1351 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates should be suppressed for this open file;
 * mirrors the checks the kernel itself makes in file_accessed()/touch_atime().
 * (Return statements between the checks are elided in this listing.) */
1354 static bool file_is_noatime(const struct file *file)
1356 const struct vfsmount *mnt = file->f_path.mnt;
1357 const struct inode *inode = file_inode((struct file *)file);
1359 /* Adapted from file_accessed() and touch_atime().*/
1360 if (file->f_flags & O_NOATIME)
1363 if (inode->i_flags & S_NOATIME)
1366 if (IS_NOATIME(inode))
1369 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1372 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1375 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read/write request from the open-file flags:
 * sync/append semantics, lock mode, atime policy and FLR mirror choice. */
1381 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1382 struct vvp_io_args *args)
1384 struct inode *inode = file_inode(file);
1385 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1387 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1388 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1390 if (iot == CIT_WRITE) {
1391 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1392 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1393 file->f_flags & O_DIRECT ||
1395 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
/* Newer kernels carry the DSYNC request on the iocb, not f_flags. */
1396 io->u.ci_wr.wr_sync |= !!(args &&
1397 args->via_io_subtype == IO_NORMAL &&
1398 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1402 io->ci_obj = ll_i2info(inode)->lli_clob;
1403 io->ci_lockreq = CILR_MAYBE;
1404 if (ll_file_nolock(file)) {
1405 io->ci_lockreq = CILR_NEVER;
1406 io->ci_no_srvlock = 1;
1407 } else if (file->f_flags & O_APPEND) {
/* Appends must see a consistent EOF, so DLM locking is mandatory. */
1408 io->ci_lockreq = CILR_MANDATORY;
1410 io->ci_noatime = file_is_noatime(file);
1411 io->ci_async_readahead = false;
1413 /* FLR: only use non-delay I/O for read as there is only one
1414 * available mirror for write. */
1415 io->ci_ndelay = !(iot == CIT_WRITE);
1417 ll_io_set_mirror(io, file);
/* Record one I/O sample and its byte count into the per-inode "file heat"
 * accumulators; no-op if file heat is disabled globally or per inode. */
1420 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1423 struct ll_inode_info *lli = ll_i2info(inode);
1424 struct ll_sb_info *sbi = ll_i2sbi(inode);
1425 enum obd_heat_type sample_type;
1426 enum obd_heat_type iobyte_type;
1427 __u64 now = ktime_get_real_seconds();
1429 if (!ll_sbi_has_file_heat(sbi) ||
1430 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1433 if (iot == CIT_READ) {
1434 sample_type = OBD_HEAT_READSAMPLE;
1435 iobyte_type = OBD_HEAT_READBYTE;
1436 } else if (iot == CIT_WRITE) {
1437 sample_type = OBD_HEAT_WRITESAMPLE;
1438 iobyte_type = OBD_HEAT_WRITEBYTE;
/* Both instances decay with the same weight/period tunables. */
1443 spin_lock(&lli->lli_heat_lock);
1444 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1445 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1446 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1447 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1448 spin_unlock(&lli->lli_heat_lock);
/* Common engine for read/write/splice: builds a cl_io, takes the range lock
 * when required, runs cl_io_loop(), restarts the I/O on layout change/FLR
 * retry, and finally updates stats, heat, and fd_write_failed state.
 * Returns bytes transferred if any, otherwise the error code. */
1452 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1453 struct file *file, enum cl_io_type iot,
1454 loff_t *ppos, size_t count)
1456 struct vvp_io *vio = vvp_env_io(env);
1457 struct inode *inode = file_inode(file);
1458 struct ll_inode_info *lli = ll_i2info(inode);
1459 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1460 struct range_lock range;
1464 unsigned retried = 0;
1465 bool restarted = false;
1469 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1470 file_dentry(file)->d_name.name,
1471 iot == CIT_READ ? "read" : "write", *ppos, count);
1474 io = vvp_env_thread_io(env);
1475 ll_io_init(io, file, iot, args);
/* carry the FLR retry count across restarts */
1476 io->ci_ndelay_tried = retried;
1478 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1479 bool range_locked = false;
/* O_APPEND writes to EOF, so the lock must cover the whole file. */
1481 if (file->f_flags & O_APPEND)
1482 range_lock_init(&range, 0, LUSTRE_EOF);
1484 range_lock_init(&range, *ppos, *ppos + count - 1);
1486 vio->vui_fd = LUSTRE_FPRIVATE(file);
1487 vio->vui_io_subtype = args->via_io_subtype;
1489 switch (vio->vui_io_subtype) {
1491 vio->vui_iter = args->u.normal.via_iter;
1492 vio->vui_iocb = args->u.normal.via_iocb;
1493 /* Direct IO reads must also take range lock,
1494 * or multiple reads will try to work on the same pages
1495 * See LU-6227 for details. */
1496 if (((iot == CIT_WRITE) ||
1497 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1498 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1499 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1501 rc = range_lock(&lli->lli_write_tree, &range);
1505 range_locked = true;
1509 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1510 vio->u.splice.vui_flags = args->u.splice.via_flags;
1513 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1517 ll_cl_add(file, env, io, LCC_RW);
1518 rc = cl_io_loop(env, io);
1519 ll_cl_remove(file, env);
1522 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1524 range_unlock(&lli->lli_write_tree, &range);
1527 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress so a restart continues where it left off. */
1531 if (io->ci_nob > 0) {
1532 result += io->ci_nob;
1533 count -= io->ci_nob;
1534 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1536 /* prepare IO restart */
1537 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1538 args->u.normal.via_iter = vio->vui_iter;
1541 cl_io_fini(env, io);
1544 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1545 file->f_path.dentry->d_name.name,
1546 iot, rc, result, io->ci_need_restart);
1548 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1550 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1551 file_dentry(file)->d_name.name,
1552 iot == CIT_READ ? "read" : "write",
1553 *ppos, count, result, rc);
1554 /* preserve the tried count for FLR */
1555 retried = io->ci_ndelay_tried;
1560 if (iot == CIT_READ) {
1562 ll_stats_ops_tally(ll_i2sbi(inode),
1563 LPROC_LL_READ_BYTES, result);
1564 } else if (iot == CIT_WRITE) {
1566 ll_stats_ops_tally(ll_i2sbi(inode),
1567 LPROC_LL_WRITE_BYTES, result);
1568 fd->fd_write_failed = false;
1569 } else if (result == 0 && rc == 0) {
/* zero-byte write with rc==0 still counts as a failed write attempt */
1572 fd->fd_write_failed = true;
1574 fd->fd_write_failed = false;
1575 } else if (rc != -ERESTARTSYS) {
1576 fd->fd_write_failed = true;
1580 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1582 ll_heat_add(inode, iot, result);
1584 RETURN(result > 0 ? result : rc);
1588 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1589 * especially for small I/O.
1591 * To serve a read request, CLIO has to create and initialize a cl_io and
1592 * then request DLM lock. This has turned out to have significant overhead
1593 * and affects the performance of small I/O dramatically.
1595 * It's not necessary to create a cl_io for each I/O. Under the help of read
1596 * ahead, most of the pages being read are already in memory cache and we can
1597 * read those pages directly because if the pages exist, the corresponding DLM
1598 * lock must exist so that page content must be valid.
1600 * In fast read implementation, the llite speculatively finds and reads pages
1601 * in memory cache. There are three scenarios for fast read:
1602 * - If the page exists and is uptodate, kernel VM will provide the data and
1603 * CLIO won't be intervened;
1604 * - If the page was brought into memory by read ahead, it will be exported
1605 * and read ahead parameters will be updated;
1606 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1607 * it will go back and invoke normal read, i.e., a cl_io will be created
1608 * and DLM lock will be requested.
1610 * POSIX compliance: posix standard states that read is intended to be atomic.
1611 * Lustre read implementation is in line with Linux kernel read implementation
1612 * and neither of them complies with POSIX standard in this matter. Fast read
1613 * doesn't make the situation worse on single node but it may interleave write
1614 * results from multiple nodes due to short read handling in ll_file_aio_read().
1616 * \param env - lu_env
1617 * \param iocb - kiocb from kernel
1618 * \param iter - user space buffers where the data will be copied
1620 * \retval - number of bytes have been read, or error code if error occurred.
1623 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1627 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1630 /* NB: we can't do direct IO for fast read because it will need a lock
1631 * to make IO engine happy. */
1632 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Serve the read straight from the page cache via the generic VFS path. */
1635 result = generic_file_read_iter(iocb, iter);
1637 /* If the first page is not in cache, generic_file_aio_read() will be
1638 * returned with -ENODATA.
1639 * See corresponding code in ll_readpage(). */
1640 if (result == -ENODATA)
1644 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1645 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1646 LPROC_LL_READ_BYTES, result);
1653 * Read from a file (through the page cache).
1655 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1658 struct vvp_io_args *args;
/* Nothing to read: return immediately. */
1664 if (!iov_iter_count(to))
1668 * Currently when PCC read failed, we do not fall back to the
1669 * normal read path, just return the error.
1670 * The reason is that: for RW-PCC, the file data may be modified
1671 * in the PCC and inconsistent with the data on OSTs (or file
1672 * data has been removed from the Lustre file system), at this
1673 * time, fallback to the normal read path may read the wrong
1675 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1676 * path: read data from data copy on OSTs.
1678 result = pcc_file_read_iter(iocb, to, &cached);
1682 ll_ras_enter(iocb->ki_filp);
/* Try the lockless page-cache path first; fall through to CLIO only
 * for whatever bytes remain. */
1684 result = ll_do_fast_read(iocb, to);
1685 if (result < 0 || iov_iter_count(to) == 0)
1688 env = cl_env_get(&refcheck);
1690 return PTR_ERR(env);
1692 args = ll_env_args(env, IO_NORMAL);
1693 args->u.normal.via_iter = to;
1694 args->u.normal.via_iocb = iocb;
1696 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1697 &iocb->ki_pos, iov_iter_count(to));
1700 else if (result == 0)
1703 cl_env_put(env, &refcheck);
1709 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1710 * If a page is already in the page cache and dirty (and some other things -
1711 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1712 * write to it without doing a full I/O, because Lustre already knows about it
1713 * and will write it out. This saves a lot of processing time.
1715 * All writes here are within one page, so exclusion is handled by the page
1716 * lock on the vm page. We do not do tiny writes for writes which touch
1717 * multiple pages because it's very unlikely multiple sequential pages
1718 * are already dirty.
1720 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1721 * and are unlikely to be to already dirty pages.
1723 * Attribute updates are important here, we do them in ll_tiny_write_end.
1725 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1727 ssize_t count = iov_iter_count(iter);
1728 struct file *file = iocb->ki_filp;
1729 struct inode *inode = file_inode(file);
1730 bool lock_inode = !IS_NOSEC(inode);
1735 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1736 * of function for why.
1738 if (count >= PAGE_SIZE ||
1739 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* i_mutex is needed when suid/sgid stripping may apply (non-NOSEC). */
1742 if (unlikely(lock_inode))
1744 result = __generic_file_write_iter(iocb, iter);
1746 if (unlikely(lock_inode))
1747 inode_unlock(inode);
1749 /* If the page is not already dirty, ll_tiny_write_begin returns
1750 * -ENODATA. We continue on to normal write.
1752 if (result == -ENODATA)
1756 ll_heat_add(inode, CIT_WRITE, result);
1757 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1759 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1762 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1768 * Write to a file (through the page cache).
1770 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1772 struct vvp_io_args *args;
1774 ssize_t rc_tiny = 0, rc_normal;
1781 if (!iov_iter_count(from))
1782 GOTO(out, rc_normal = 0);
1785 * When PCC write failed, we usually do not fall back to the normal
1786 * write path, just return the error. But there is a special case when
1787 * returned error code is -ENOSPC due to running out of space on PCC HSM
1788 * backend. At this time, it will fall back to normal I/O path and
1789 * retry the I/O. As the file is in HSM released state, it will restore
1790 * the file data to OSTs first and redo the write again. And the
1791 * restore process will revoke the layout lock and detach the file
1792 * from PCC cache automatically.
1794 result = pcc_file_write_iter(iocb, from, &cached);
1795 if (cached && result != -ENOSPC && result != -EDQUOT)
1798 /* NB: we can't do direct IO for tiny writes because they use the page
1799 * cache, we can't do sync writes because tiny writes can't flush
1800 * pages, and we can't do append writes because we can't guarantee the
1801 * required DLM locks are held to protect file size.
1803 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1804 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1805 rc_tiny = ll_do_tiny_write(iocb, from);
1807 /* In case of error, go on and try normal write - Only stop if tiny
1808 * write completed I/O.
1810 if (iov_iter_count(from) == 0)
1811 GOTO(out, rc_normal = rc_tiny);
1813 env = cl_env_get(&refcheck);
1815 return PTR_ERR(env);
1817 args = ll_env_args(env, IO_NORMAL);
1818 args->u.normal.via_iter = from;
1819 args->u.normal.via_iocb = iocb;
1821 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1822 &iocb->ki_pos, iov_iter_count(from));
1824 /* On success, combine bytes written. */
1825 if (rc_tiny >= 0 && rc_normal > 0)
1826 rc_normal += rc_tiny;
1827 /* On error, only return error from normal write if tiny write did not
1828 * write any bytes. Otherwise return bytes written by tiny write.
1830 else if (rc_tiny > 0)
1831 rc_normal = rc_tiny;
1833 cl_env_put(env, &refcheck);
1838 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1840 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, truncating
 * *nr_segs at the first inaccessible segment (kernel-compat helper for
 * pre-read_iter kernels). */
1842 static int ll_file_get_iov_count(const struct iovec *iov,
1843 unsigned long *nr_segs, size_t *count)
1848 for (seg = 0; seg < *nr_segs; seg++) {
1849 const struct iovec *iv = &iov[seg];
1852 * If any segment has a negative length, or the cumulative
1853 * length ever wraps negative then return -EINVAL.
1856 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1858 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1863 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy aio_read entry point: wrap the iovec array in an iov_iter and
 * delegate to ll_file_read_iter(). */
1870 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1871 unsigned long nr_segs, loff_t pos)
1878 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1885 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1886 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1887 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1888 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1889 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1891 result = ll_file_read_iter(iocb, &to);
/* Legacy synchronous read entry point: build a one-segment iovec plus a
 * sync kiocb around (buf, count, *ppos) and delegate to ll_file_aio_read(),
 * writing the updated file position back through ppos. */
1896 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1899 struct iovec iov = { .iov_base = buf, .iov_len = count };
1908 init_sync_kiocb(&kiocb, file);
1909 kiocb.ki_pos = *ppos;
1910 #ifdef HAVE_KIOCB_KI_LEFT
1911 kiocb.ki_left = count;
1912 #elif defined(HAVE_KI_NBYTES)
/* Fixed: the kiocb field is ki_nbytes (was the nonexistent "i_nbytes",
 * which fails to compile on HAVE_KI_NBYTES kernels; matches ll_file_write). */
1913 kiocb.ki_nbytes = count;
1916 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1917 *ppos = kiocb.ki_pos;
1923 * Write to a file (through the page cache).
/* Legacy aio_write entry point: wrap the iovec array in an iov_iter and
 * delegate to ll_file_write_iter(). */
1926 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1927 unsigned long nr_segs, loff_t pos)
1929 struct iov_iter from;
1934 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1941 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1942 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1943 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1944 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1945 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1947 result = ll_file_write_iter(iocb, &from);
/* Legacy synchronous write entry point: build a one-segment iovec plus a
 * sync kiocb and delegate to ll_file_aio_write(), updating *ppos. */
1952 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1953 size_t count, loff_t *ppos)
1955 struct iovec iov = { .iov_base = (void __user *)buf,
1965 init_sync_kiocb(&kiocb, file);
1966 kiocb.ki_pos = *ppos;
1967 #ifdef HAVE_KIOCB_KI_LEFT
1968 kiocb.ki_left = count;
1969 #elif defined(HAVE_KI_NBYTES)
1970 kiocb.ki_nbytes = count;
1973 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1974 *ppos = kiocb.ki_pos;
1978 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1981 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: try PCC first, then run a CIT_READ through
 * ll_file_io_generic() with the IO_SPLICE subtype. */
1983 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1984 struct pipe_inode_info *pipe, size_t count,
1988 struct vvp_io_args *args;
1995 result = pcc_file_splice_read(in_file, ppos, pipe,
1996 count, flags, &cached);
2000 ll_ras_enter(in_file);
2002 env = cl_env_get(&refcheck);
2004 RETURN(PTR_ERR(env));
2006 args = ll_env_args(env, IO_SPLICE);
2007 args->u.splice.via_pipe = pipe;
2008 args->u.splice.via_flags = flags;
2010 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2011 cl_env_put(env, &refcheck);
/* Apply a striping EA (lov_user_md) to a file by opening it with an
 * open-by-FID intent carrying the layout, then releasing the handle. */
2015 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2016 __u64 flags, struct lov_user_md *lum, int lum_size)
2018 struct lookup_intent oit = {
2020 .it_flags = flags | MDS_OPEN_BY_FID,
2025 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2026 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2027 /* this code will only exist for big-endian systems */
2028 lustre_swab_lov_user_md(lum);
2031 ll_inode_size_lock(inode);
2032 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2034 GOTO(out_unlock, rc);
/* The open was only a vehicle for the layout; close the handle now. */
2036 ll_release_openhandle(dentry, &oit);
2039 ll_inode_size_unlock(inode);
2040 ll_intent_release(&oit);
/* Fetch the LOV EA (striping metadata) of 'filename' (child of 'inode')
 * from the MDS, validate its magic, byte-swap to host endianness when
 * needed, and return the lmm buffer (owned by *request) plus its size. */
2045 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2046 struct lov_mds_md **lmmp, int *lmm_size,
2047 struct ptlrpc_request **request)
2049 struct ll_sb_info *sbi = ll_i2sbi(inode);
2050 struct mdt_body *body;
2051 struct lov_mds_md *lmm = NULL;
2052 struct ptlrpc_request *req = NULL;
2053 struct md_op_data *op_data;
2056 rc = ll_get_default_mdsize(sbi, &lmmsize);
2060 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2061 strlen(filename), lmmsize,
2062 LUSTRE_OPC_ANY, NULL);
2063 if (IS_ERR(op_data))
2064 RETURN(PTR_ERR(op_data));
2066 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2067 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2068 ll_finish_md_op_data(op_data);
2070 CDEBUG(D_INFO, "md_getattr_name failed "
2071 "on %s: rc %d\n", filename, rc);
2075 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2076 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2078 lmmsize = body->mbo_eadatasize;
2080 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2082 GOTO(out, rc = -ENODATA);
2085 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2086 LASSERT(lmm != NULL);
2088 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2089 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2090 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2091 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2092 GOTO(out, rc = -EPROTO);
2095 * This is coming from the MDS, so is probably in
2096 * little endian. We convert it to host endian before
2097 * passing it to userspace.
2099 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2100 __swab32(LOV_MAGIC_MAGIC)) {
2101 int stripe_count = 0;
2103 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2104 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2105 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2106 if (le32_to_cpu(lmm->lmm_pattern) &
2107 LOV_PATTERN_F_RELEASED)
2111 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
2113 /* if function called for directory - we should
2114 * avoid swabbing non-existent lsm objects */
2115 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2116 lustre_swab_lov_user_md_objects(
2117 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2119 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2120 S_ISREG(body->mbo_mode))
2121 lustre_swab_lov_user_md_objects(
2122 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2128 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object entry)
 * from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since it specifies raw objects. */
2133 static int ll_lov_setea(struct inode *inode, struct file *file,
2136 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2137 struct lov_user_md *lump;
2138 int lum_size = sizeof(struct lov_user_md) +
2139 sizeof(struct lov_user_ost_data);
2143 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2146 OBD_ALLOC_LARGE(lump, lum_size);
2150 if (copy_from_user(lump, arg, lum_size))
2151 GOTO(out_lump, rc = -EFAULT);
2153 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear O_LOV_DELAY_CREATE regardless of the result */
2155 cl_lov_delay_create_clear(&file->f_flags);
2158 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's striping information into the userspace buffer 'lum'
 * (size bytes) via the cl_object layer. */
2162 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2169 env = cl_env_get(&refcheck);
2171 RETURN(PTR_ERR(env));
2173 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2174 cl_env_put(env, &refcheck);
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout and echo the resulting stripe info back to userspace. */
2178 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2181 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2182 struct lov_user_md *klum;
2184 __u64 flags = FMODE_WRITE;
2187 rc = ll_copy_user_md(lum, &klum);
2192 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero stripe_count first so old tools see a consistent value on error */
2197 rc = put_user(0, &lum->lmm_stripe_count);
2201 rc = ll_layout_refresh(inode, &gen);
2205 rc = ll_file_getstripe(inode, arg, lum_size);
2207 cl_lov_delay_create_clear(&file->f_flags);
2210 OBD_FREE_LARGE(klum, lum_size);
/* Acquire a group lock (gid == arg) on the file for this fd. For PFL
 * layouts all OST objects are instantiated first so the group lock covers
 * them; the fd's flags/grouplock fields are updated under lli_lock. */
2215 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2217 struct ll_inode_info *lli = ll_i2info(inode);
2218 struct cl_object *obj = lli->lli_clob;
2219 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2220 struct ll_grouplock grouplock;
2225 CWARN("group id for group lock must not be 0\n");
2229 if (ll_file_nolock(file))
2230 RETURN(-EOPNOTSUPP);
2232 spin_lock(&lli->lli_lock);
2233 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2234 CWARN("group lock already existed with gid %lu\n",
2235 fd->fd_grouplock.lg_gid);
2236 spin_unlock(&lli->lli_lock);
2239 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2240 spin_unlock(&lli->lli_lock);
2243 * XXX: group lock needs to protect all OST objects while PFL
2244 * can add new OST objects during the IO, so we'd instantiate
2245 * all OST objects before getting its group lock.
2250 struct cl_layout cl = {
2251 .cl_is_composite = false,
2253 struct lu_extent ext = {
2255 .e_end = OBD_OBJECT_EOF,
2258 env = cl_env_get(&refcheck);
2260 RETURN(PTR_ERR(env));
2262 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate every component via write intent */
2263 if (!rc && cl.cl_is_composite)
2264 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2267 cl_env_put(env, &refcheck);
2272 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2273 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have raced us here. */
2277 spin_lock(&lli->lli_lock);
2278 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2279 spin_unlock(&lli->lli_lock);
2280 CERROR("another thread just won the race\n");
2281 cl_put_grouplock(&grouplock);
2285 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2286 fd->fd_grouplock = grouplock;
2287 spin_unlock(&lli->lli_lock);
2289 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock with gid == arg held on this fd; warns and fails
 * if no group lock is held or the gid does not match. */
2293 static int ll_put_grouplock(struct inode *inode, struct file *file,
2296 struct ll_inode_info *lli = ll_i2info(inode);
2297 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2298 struct ll_grouplock grouplock;
2301 spin_lock(&lli->lli_lock);
2302 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2303 spin_unlock(&lli->lli_lock);
2304 CWARN("no group lock held\n");
2308 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2310 if (fd->fd_grouplock.lg_gid != arg) {
2311 CWARN("group lock %lu doesn't match current id %lu\n",
2312 arg, fd->fd_grouplock.lg_gid);
2313 spin_unlock(&lli->lli_lock);
/* Detach the grouplock from the fd under lli_lock, release it outside. */
2317 grouplock = fd->fd_grouplock;
2318 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2319 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2320 spin_unlock(&lli->lli_lock);
2322 cl_put_grouplock(&grouplock);
2323 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2328 * Close inode open handle
2330 * \param dentry [in] dentry which contains the inode
2331 * \param it [in,out] intent which contains open info and result
2334 * \retval <0 failure
2336 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2338 struct inode *inode = dentry->d_inode;
2339 struct obd_client_handle *och;
2345 /* Root ? Do nothing. */
2346 if (dentry->d_inode->i_sb->s_root == dentry)
2349 /* No open handle to close? Move away */
2350 if (!it_disposition(it, DISP_OPEN_OPEN))
2353 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2355 OBD_ALLOC(och, sizeof(*och));
2357 GOTO(out, rc = -ENOMEM);
/* Populate the client handle from the open reply, then close on the MDS. */
2359 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2361 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2363 /* this one is in place of ll_file_open */
2364 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2365 ptlrpc_req_finished(it->it_request);
2366 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2372 * Get size for inode for which FIEMAP mapping is requested.
2373 * Make the FIEMAP get_info call and returns the result.
2374 * \param fiemap kernel buffer to hold extents
2375 * \param num_bytes kernel buffer size
2377 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2383 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2386 /* Checks for fiemap flags */
2387 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller */
2388 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2392 /* Check for FIEMAP_FLAG_SYNC */
2393 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2394 rc = filemap_fdatawrite(inode->i_mapping);
2399 env = cl_env_get(&refcheck);
2401 RETURN(PTR_ERR(env));
/* glimpse to get an up-to-date size before building the request key */
2403 if (i_size_read(inode) == 0) {
2404 rc = ll_glimpse_size(inode);
2409 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2410 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2411 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2413 /* If filesize is 0, then there would be no objects for mapping */
2414 if (fmkey.lfik_oa.o_size == 0) {
2415 fiemap->fm_mapped_extents = 0;
2419 fmkey.lfik_fiemap = *fiemap;
2421 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2422 &fmkey, fiemap, &num_bytes);
2424 cl_env_put(env, &refcheck);
/* OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * 'arg' is a userspace getinfo_fid2path; the result is copied back. */
2428 int ll_fid2path(struct inode *inode, void __user *arg)
2430 struct obd_export *exp = ll_i2mdexp(inode);
2431 const struct getinfo_fid2path __user *gfin = arg;
2433 struct getinfo_fid2path *gfout;
/* restricted unless the fs allows user fid2path or caller has the cap */
2439 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2440 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2443 /* Only need to get the buflen */
2444 if (get_user(pathlen, &gfin->gf_pathlen))
2447 if (pathlen > PATH_MAX)
2450 outsize = sizeof(*gfout) + pathlen;
2451 OBD_ALLOC(gfout, outsize);
2455 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2456 GOTO(gf_free, rc = -EFAULT);
2457 /* append root FID after gfout to let MDT know the root FID so that it
2458 * can lookup the correct path, this is mainly for fileset.
2459 * old server without fileset mount support will ignore this. */
2460 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2462 /* Call mdc_iocontrol */
2463 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2467 if (copy_to_user(arg, gfout, outsize))
2471 OBD_FREE(gfout, outsize);
/* Run a CIT_DATA_VERSION cl_io and fill ioc->idv_version /
 * idv_layout_version; retried if the io asks for a restart. */
2476 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2478 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2486 ioc->idv_version = 0;
2487 ioc->idv_layout_version = UINT_MAX;
2489 /* If no file object initialized, we consider its version is 0. */
2493 env = cl_env_get(&refcheck);
2495 RETURN(PTR_ERR(env));
2497 io = vvp_env_thread_io(env);
2499 io->u.ci_data_version.dv_data_version = 0;
2500 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2501 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2504 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2505 result = cl_io_loop(env, io);
2507 result = io->ci_result;
2509 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2510 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2512 cl_io_fini(env, io);
/* layout changed mid-io: redo the whole data-version request */
2514 if (unlikely(io->ci_need_restart))
2517 cl_env_put(env, &refcheck);
2523 * Read the data_version for inode.
2525 * This value is computed using stripe object version on OST.
2526 * Version is computed using server side locking.
2528 * @param flags if do sync on the OST side;
2530 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2531 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper over ll_ioc_data_version() returning only idv_version. */
2533 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2535 struct ioc_data_version ioc = { .idv_flags = flags };
2538 rc = ll_ioc_data_version(inode, &ioc);
2540 *data_version = ioc.idv_version;
2546 * Trigger a HSM release request for the provided inode.
2548 int ll_hsm_release(struct inode *inode)
2551 struct obd_client_handle *och = NULL;
2552 __u64 data_version = 0;
2557 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2558 ll_i2sbi(inode)->ll_fsname,
2559 PFID(&ll_i2info(inode)->lli_fid));
/* Take a write lease so no other client can modify during release. */
2561 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2563 GOTO(out, rc = PTR_ERR(och));
2565 /* Grab latest data_version and [am]time values */
2566 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2570 env = cl_env_get(&refcheck);
2572 GOTO(out, rc = PTR_ERR(env));
2574 rc = ll_merge_attr(env, inode);
2575 cl_env_put(env, &refcheck);
2577 /* If error happen, we have the wrong size for a file.
2583 /* Release the file.
2584 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2585 * we still need it to pack l_remote_handle to MDT. */
2586 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* error cleanup: drop the lease/open handle if still held */
2592 if (och != NULL && !IS_ERR(och)) /* close the file */
2593 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts().
 * NOTE(review): ll_swap_layouts() also references dv1/dv2 and
 * check_dv1/check_dv2 members — confirm the full declaration in source.
 */
2598 struct ll_swap_stack {
2601 struct inode *inode1;
2602 struct inode *inode2;
/*
 * Swap the layouts of two files via the MDT.  Orders the pair by FID to
 * avoid lock ordering issues, optionally flushes dirty cache under a
 * grouplock, optionally verifies the supplied data versions are still
 * current (-EAGAIN otherwise), then sends LL_IOC_LOV_SWAP_LAYOUTS to
 * the MDS.  Returns 0 for a same-file no-op.
 */
2607 static int ll_swap_layouts(struct file *file1, struct file *file2,
2608 struct lustre_swap_layouts *lsl)
2610 struct mdc_swap_layouts msl;
2611 struct md_op_data *op_data;
2614 struct ll_swap_stack *llss = NULL;
2617 OBD_ALLOC_PTR(llss);
2621 llss->inode1 = file_inode(file1);
2622 llss->inode2 = file_inode(file2);
2624 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2628 /* we use 2 bool because it is easier to swap than 2 bits */
2629 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2630 llss->check_dv1 = true;
2632 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2633 llss->check_dv2 = true;
2635 /* we cannot use lsl->sl_dvX directly because we may swap them */
2636 llss->dv1 = lsl->sl_dv1;
2637 llss->dv2 = lsl->sl_dv2;
/* Canonical ordering by FID so two concurrent swaps cannot deadlock. */
2639 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2640 if (rc == 0) /* same file, done! */
2643 if (rc < 0) { /* sequentialize it */
2644 swap(llss->inode1, llss->inode2);
2646 swap(llss->dv1, llss->dv2);
2647 swap(llss->check_dv1, llss->check_dv2);
2651 if (gid != 0) { /* application asks to flush dirty cache */
2652 rc = ll_get_grouplock(llss->inode1, file1, gid);
2656 rc = ll_get_grouplock(llss->inode2, file2, gid);
2658 ll_put_grouplock(llss->inode1, file1, gid);
2663 /* ultimate check, before swapping the layouts we check if
2664 * dataversion has changed (if requested) */
2665 if (llss->check_dv1) {
2666 rc = ll_data_version(llss->inode1, &dv, 0);
2669 if (dv != llss->dv1)
2670 GOTO(putgl, rc = -EAGAIN);
2673 if (llss->check_dv2) {
2674 rc = ll_data_version(llss->inode2, &dv, 0);
2677 if (dv != llss->dv2)
2678 GOTO(putgl, rc = -EAGAIN);
2681 /* struct md_op_data is used to send the swap args to the mdt
2682 * only flags is missing, so we use struct mdc_swap_layouts
2683 * through the md_op_data->op_data */
2684 /* flags from user space have to be converted before they are sent to
2685 * the server, no flag is sent today, they are only used on the client */
2688 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2689 0, LUSTRE_OPC_ANY, &msl);
2690 if (IS_ERR(op_data))
2691 GOTO(free, rc = PTR_ERR(op_data));
2693 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2694 sizeof(*op_data), op_data, NULL);
2695 ll_finish_md_op_data(op_data);
/* Grouplocks are dropped in reverse acquisition order. */
2702 ll_put_grouplock(llss->inode2, file2, gid);
2703 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Validate and forward an HSM state-set request for @inode to the MDS.
 * Rejects masks outside HSM_FLAGS_MASK, restricts non-HSM_USER_MASK
 * flags to CAP_SYS_ADMIN, and range-checks the archive id when the
 * server does not support archive-id arrays.
 */
2713 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2715 struct obd_export *exp = ll_i2mdexp(inode);
2716 struct md_op_data *op_data;
2720 /* Detect out-of range masks */
2721 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2724 /* Non-root users are forbidden to set or clear flags which are
2725 * NOT defined in HSM_USER_MASK. */
2726 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2727 !cfs_capable(CFS_CAP_SYS_ADMIN))
2730 if (!exp_connect_archive_id_array(exp)) {
2731 /* Detect out-of range archive id */
2732 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2733 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2737 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2738 LUSTRE_OPC_ANY, hss);
2739 if (IS_ERR(op_data))
2740 RETURN(PTR_ERR(op_data));
2742 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2745 ll_finish_md_op_data(op_data);
/*
 * Import a file into HSM: mark it ARCHIVED|EXISTS|RELEASED with the
 * given archive id, then restore the saved attributes (mode, owner,
 * size, times) from the hsm_user_import record via ll_setattr_raw().
 * Regular files only.
 */
2750 static int ll_hsm_import(struct inode *inode, struct file *file,
2751 struct hsm_user_import *hui)
2753 struct hsm_state_set *hss = NULL;
2754 struct iattr *attr = NULL;
2758 if (!S_ISREG(inode->i_mode))
2764 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM state so the file is known-released on import. */
2766 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2767 hss->hss_archive_id = hui->hui_archive_id;
2768 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2769 rc = ll_hsm_state_set(inode, hss);
2773 OBD_ALLOC_PTR(attr);
2775 GOTO(out, rc = -ENOMEM);
/* Step 2: rebuild attributes from the user-supplied import record;
 * only permission bits are taken from hui_mode and S_IFREG is forced. */
2777 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2778 attr->ia_mode |= S_IFREG;
2779 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2780 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2781 attr->ia_size = hui->hui_size;
2782 attr->ia_mtime.tv_sec = hui->hui_mtime;
2783 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2784 attr->ia_atime.tv_sec = hui->hui_atime;
2785 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2787 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2788 ATTR_UID | ATTR_GID |
2789 ATTR_MTIME | ATTR_MTIME_SET |
2790 ATTR_ATIME | ATTR_ATIME_SET;
2794 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2798 inode_unlock(inode);
/* Map an fmode_t to the LL_LEASE_{RD,WR}LCK lease-type bitmask. */
2810 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2812 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2813 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 backend: set atime, mtime AND ctime from the
 * ll_futimes_3 record.  Explicit ctime setting requires CAP_SYS_ADMIN
 * and a regular file; times are applied with OP_XVALID_CTIME_SET.
 */
2816 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2818 struct inode *inode = file_inode(file);
2820 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2821 ATTR_MTIME | ATTR_MTIME_SET |
2824 .tv_sec = lfu->lfu_atime_sec,
2825 .tv_nsec = lfu->lfu_atime_nsec,
2828 .tv_sec = lfu->lfu_mtime_sec,
2829 .tv_nsec = lfu->lfu_mtime_nsec,
2832 .tv_sec = lfu->lfu_ctime_sec,
2833 .tv_nsec = lfu->lfu_ctime_nsec,
/* ctime is normally kernel-managed; allow override only for admin. */
2839 if (!capable(CAP_SYS_ADMIN))
2842 if (!S_ISREG(inode->i_mode))
2846 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2848 inode_unlock(inode);
/* Translate a userspace lockahead mode (MODE_*_USER) to cl_lock_mode. */
2853 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2856 case MODE_READ_USER:
2858 case MODE_WRITE_USER:
/* Printable names for userspace lock modes, indexed by lock_mode_user. */
2865 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2867 /* Used to allow the upper layers of the client to request an LDLM lock
2868 * without doing an actual read or write.
2870 * Used for ladvise lockahead to manually request specific locks.
2872 * \param[in] file file this ladvise lock request is on
2873 * \param[in] ladvise ladvise struct describing this lock request
2875 * \retval 0 success, no detailed result available (sync requests
2876 * and requests sent to the server [not handled locally]
2877 * cannot return detailed results)
2878 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2879 * see definitions for details.
2880 * \retval negative negative errno on error
/*
 * Request an LDLM extent lock on behalf of userspace (ladvise
 * "lockahead") without performing any read/write.  Builds a CIT_MISC
 * io, fills a cl_lock_descr for the byte range, and enqueues it.
 * -ECANCELED/-EEXIST from the enqueue are remapped to the positive
 * LLA_RESULT_{DIFFERENT,SAME} detail codes for userspace.
 */
2882 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2884 struct lu_env *env = NULL;
2885 struct cl_io *io = NULL;
2886 struct cl_lock *lock = NULL;
2887 struct cl_lock_descr *descr = NULL;
2888 struct dentry *dentry = file->f_path.dentry;
2889 struct inode *inode = dentry->d_inode;
2890 enum cl_lock_mode cl_mode;
2891 off_t start = ladvise->lla_start;
2892 off_t end = ladvise->lla_end;
2898 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2899 "start=%llu, end=%llu\n", dentry->d_name.len,
2900 dentry->d_name.name, dentry->d_inode,
2901 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2904 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2906 GOTO(out, result = cl_mode);
2908 /* Get IO environment */
2909 result = cl_io_get(inode, &env, &io, &refcheck);
2913 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2916 * nothing to do for this io. This currently happens when
2917 * stripe sub-object's are not yet created.
2919 result = io->ci_result;
2920 } else if (result == 0) {
2921 lock = vvp_env_lock(env);
2922 descr = &lock->cll_descr;
2924 descr->cld_obj = io->ci_obj;
2925 /* Convert byte offsets to pages */
2926 descr->cld_start = cl_index(io->ci_obj, start);
2927 descr->cld_end = cl_index(io->ci_obj, end);
2928 descr->cld_mode = cl_mode;
2929 /* CEF_MUST is used because we do not want to convert a
2930 * lockahead request to a lockless lock */
2931 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests a speculative (non-blocking glimpse) enqueue. */
2934 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2935 descr->cld_enq_flags |= CEF_SPECULATIVE;
2937 result = cl_lock_request(env, io, lock);
2939 /* On success, we need to release the lock */
2941 cl_lock_release(env, lock);
2943 cl_io_fini(env, io);
2944 cl_env_put(env, &refcheck);
2946 /* -ECANCELED indicates a matching lock with a different extent
2947 * was already present, and -EEXIST indicates a matching lock
2948 * on exactly the same extent was already present.
2949 * We convert them to positive values for userspace to make
2950 * recognizing true errors easier.
2951 * Note we can only return these detailed results on async requests,
2952 * as sync requests look the same as i/o requests for locking. */
2953 if (result == -ECANCELED)
2954 result = LLA_RESULT_DIFFERENT;
2955 else if (result == -EEXIST)
2956 result = LLA_RESULT_SAME;
/* Printable names for ladvise advice types, indexed by lu_ladvise_type. */
2961 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single llapi_lu_ladvise record before it is acted on:
 * advice type in range, per-advice flags within the allowed mask,
 * lockahead mode valid, and start < end for range-based advices.
 * Each rejection logs a D_VFSTRACE message with the reason.
 */
2963 static int ll_ladvise_sanity(struct inode *inode,
2964 struct llapi_lu_ladvise *ladvise)
2966 struct ll_sb_info *sbi = ll_i2sbi(inode);
2967 enum lu_ladvise_type advice = ladvise->lla_advice;
2968 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2969 * be in the first 32 bits of enum ladvise_flags */
2970 __u32 flags = ladvise->lla_peradvice_flags;
2971 /* 3 lines at 80 characters per line, should be plenty */
2974 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2976 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2977 "last supported advice is %s (value '%d'): rc = %d\n",
2978 sbi->ll_fsname, advice,
2979 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2983 /* Per-advice checks */
2985 case LU_LADVISE_LOCKNOEXPAND:
2986 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2988 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2989 "rc = %d\n", sbi->ll_fsname, flags,
2990 ladvise_names[advice], rc);
2994 case LU_LADVISE_LOCKAHEAD:
2995 /* Currently only READ and WRITE modes can be requested */
2996 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2997 ladvise->lla_lockahead_mode == 0) {
2999 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3000 "rc = %d\n", sbi->ll_fsname,
3001 ladvise->lla_lockahead_mode,
3002 ladvise_names[advice], rc);
3005 case LU_LADVISE_WILLREAD:
3006 case LU_LADVISE_DONTNEED:
3008 /* Note fall through above - These checks apply to all advices
3009 * except LOCKNOEXPAND */
3010 if (flags & ~LF_DEFAULT_MASK) {
3012 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3013 "rc = %d\n", sbi->ll_fsname, flags,
3014 ladvise_names[advice], rc);
3017 if (ladvise->lla_start >= ladvise->lla_end) {
3019 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3020 "for %s: rc = %d\n", sbi->ll_fsname,
3021 ladvise->lla_start, ladvise->lla_end,
3022 ladvise_names[advice], rc);
3034 * Give file access advices
3036 * The ladvise interface is similar to Linux fadvise() system call, except it
3037 * forwards the advices directly from the Lustre client to the server. The
3038 * server-side code will apply appropriate read-ahead and caching techniques for the
3039 * corresponding files.
3041 * A typical workload for ladvise is e.g. a bunch of different clients are
3042 * doing small random reads of a file, so prefetching pages into OSS cache
3043 * with big linear reads before the random IO is a net benefit. Fetching
3044 * all that data into each client cache with fadvise() may not be, due to
3045 * much more data being sent to the client.
/*
 * Forward one validated ladvise record to the server(s) by running a
 * CIT_LADVISE cl_io over the file's cl_object.
 */
3047 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3048 struct llapi_lu_ladvise *ladvise)
3052 struct cl_ladvise_io *lio;
3057 env = cl_env_get(&refcheck);
3059 RETURN(PTR_ERR(env));
3061 io = vvp_env_thread_io(env);
3062 io->ci_obj = ll_i2info(inode)->lli_clob;
3064 /* initialize parameters for ladvise */
3065 lio = &io->u.ci_ladvise;
3066 lio->li_start = ladvise->lla_start;
3067 lio->li_end = ladvise->lla_end;
3068 lio->li_fid = ll_inode2fid(inode);
3069 lio->li_advice = ladvise->lla_advice;
3070 lio->li_flags = flags;
3072 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3073 rc = cl_io_loop(env, io);
3077 cl_io_fini(env, io);
3078 cl_env_put(env, &refcheck);
/* Enable/disable lock expansion on this file handle: LF_UNSET clears
 * the no-expand flag, anything else sets it. */
3082 static int ll_lock_noexpand(struct file *file, int flags)
3084 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3086 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR handler: report the inode's xflags (including
 * PROJINHERIT) and project id back to userspace via struct fsxattr.
 */
3091 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3094 struct fsxattr fsxattr;
3096 if (copy_from_user(&fsxattr,
3097 (const struct fsxattr __user *)arg,
3101 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3102 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3103 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3104 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3105 if (copy_to_user((struct fsxattr __user *)arg,
3106 &fsxattr, sizeof(fsxattr)))
/*
 * Permission gate for project-quota changes: outside the init user
 * namespace, reject any request that would change the project id or
 * toggle the PROJINHERIT flag; everything else is permitted.
 */
3112 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3115 * Project Quota ID state is only allowed to change from within the init
3116 * namespace. Enforce that restriction only if we are trying to change
3117 * the quota ID state. Everything else is allowed in user namespaces.
3119 if (current_user_ns() == &init_user_ns)
3122 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3125 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3126 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3129 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR handler: after namespace permission checks, push
 * the new flags/project id to the MDS via md_setattr(), update the
 * local inode flags, and propagate the flags to the OSTs through
 * cl_setattr_ost() when a cl_object exists.
 */
3136 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3140 struct md_op_data *op_data;
3141 struct ptlrpc_request *req = NULL;
3143 struct fsxattr fsxattr;
3144 struct cl_object *obj;
3148 if (copy_from_user(&fsxattr,
3149 (const struct fsxattr __user *)arg,
3153 rc = ll_ioctl_check_project(inode, &fsxattr);
3157 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3158 LUSTRE_OPC_ANY, NULL);
3159 if (IS_ERR(op_data))
3160 RETURN(PTR_ERR(op_data));
3162 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3163 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3164 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3165 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3166 op_data->op_projid = fsxattr.fsx_projid;
3167 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3168 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3170 ptlrpc_req_finished(req);
3172 GOTO(out_fsxattr, rc);
3173 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* No data objects yet (e.g. released file): MDS update is enough. */
3174 obj = ll_i2info(inode)->lli_clob;
3176 GOTO(out_fsxattr, rc);
3178 OBD_ALLOC_PTR(attr);
3180 GOTO(out_fsxattr, rc = -ENOMEM);
3182 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3183 fsxattr.fsx_xflags);
3186 ll_finish_md_op_data(op_data);
/*
 * Release the file's lease, optionally attaching a close intent
 * (resync-done, layout merge/split, or PCC attach) that is executed by
 * the MDT as part of the close.  Returns the lease type that was held
 * (ll_lease_type_from_fmode()) on success.
 *
 * Fix: lines 3290 and 3300 contained mis-encoded HTML entities
 * ("&para;m.pa_..."), restored to the intended "&param.pa_..." —
 * the address of the local struct pcc_param members.
 */
3190 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3193 struct inode *inode = file_inode(file);
3194 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3195 struct ll_inode_info *lli = ll_i2info(inode);
3196 struct obd_client_handle *och = NULL;
3197 struct split_param sp;
3198 struct pcc_param param;
3199 bool lease_broken = false;
3201 enum mds_op_bias bias = 0;
3202 struct file *layout_file = NULL;
3204 size_t data_size = 0;
3205 bool attached = false;
/* Detach the lease handle from the file descriptor under the mutex. */
3210 mutex_lock(&lli->lli_och_mutex);
3211 if (fd->fd_lease_och != NULL) {
3212 och = fd->fd_lease_och;
3213 fd->fd_lease_och = NULL;
3215 mutex_unlock(&lli->lli_och_mutex);
3220 fmode = och->och_flags;
3222 switch (ioc->lil_flags) {
3223 case LL_LEASE_RESYNC_DONE:
3224 if (ioc->lil_count > IOC_IDS_MAX)
3225 GOTO(out_lease_close, rc = -EINVAL);
3227 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3228 OBD_ALLOC(data, data_size);
3230 GOTO(out_lease_close, rc = -ENOMEM);
3232 if (copy_from_user(data, (void __user *)arg, data_size))
3233 GOTO(out_lease_close, rc = -EFAULT);
3235 bias = MDS_CLOSE_RESYNC_DONE;
3237 case LL_LEASE_LAYOUT_MERGE: {
3240 if (ioc->lil_count != 1)
3241 GOTO(out_lease_close, rc = -EINVAL);
3243 arg += sizeof(*ioc);
3244 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3245 GOTO(out_lease_close, rc = -EFAULT);
3247 layout_file = fget(fd);
3249 GOTO(out_lease_close, rc = -EBADF);
/* Both victim and source must be open for write to merge layouts. */
3251 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3252 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3253 GOTO(out_lease_close, rc = -EPERM);
3255 data = file_inode(layout_file);
3256 bias = MDS_CLOSE_LAYOUT_MERGE;
3259 case LL_LEASE_LAYOUT_SPLIT: {
3263 if (ioc->lil_count != 2)
3264 GOTO(out_lease_close, rc = -EINVAL);
3266 arg += sizeof(*ioc);
3267 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3268 GOTO(out_lease_close, rc = -EFAULT);
3270 arg += sizeof(__u32);
3271 if (copy_from_user(&mirror_id, (void __user *)arg,
3273 GOTO(out_lease_close, rc = -EFAULT);
3275 layout_file = fget(fdv);
3277 GOTO(out_lease_close, rc = -EBADF);
3279 sp.sp_inode = file_inode(layout_file);
3280 sp.sp_mirror_id = (__u16)mirror_id;
3282 bias = MDS_CLOSE_LAYOUT_SPLIT;
3285 case LL_LEASE_PCC_ATTACH:
3286 if (ioc->lil_count != 1)
3289 arg += sizeof(*ioc);
3290 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3292 GOTO(out_lease_close, rc2 = -EFAULT);
3294 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3296 GOTO(out_lease_close, rc2);
3299 /* Grab latest data version */
3300 rc2 = ll_data_version(inode, &param.pa_data_version,
3303 GOTO(out_lease_close, rc2);
3306 bias = MDS_PCC_ATTACH;
3309 /* without close intent */
3314 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3318 rc = ll_lease_och_release(inode, file);
/* Per-intent cleanup after the close has been issued. */
3327 switch (ioc->lil_flags) {
3328 case LL_LEASE_RESYNC_DONE:
3330 OBD_FREE(data, data_size);
3332 case LL_LEASE_LAYOUT_MERGE:
3333 case LL_LEASE_LAYOUT_SPLIT:
3337 case LL_LEASE_PCC_ATTACH:
3340 rc = pcc_readwrite_attach_fini(file, inode,
3341 param.pa_layout_gen,
3348 rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: acquire a read or write lease on the file
 * (optionally with MDS_OPEN_RESYNC), or delegate to
 * ll_file_unlock_lease() for LL_LEASE_UNLCK.  The requested mode must
 * be compatible with how the file was opened.
 */
3352 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3355 struct inode *inode = file_inode(file);
3356 struct ll_inode_info *lli = ll_i2info(inode);
3357 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3358 struct obd_client_handle *och = NULL;
3359 __u64 open_flags = 0;
3365 switch (ioc->lil_mode) {
3366 case LL_LEASE_WRLCK:
3367 if (!(file->f_mode & FMODE_WRITE))
3369 fmode = FMODE_WRITE;
3371 case LL_LEASE_RDLCK:
3372 if (!(file->f_mode & FMODE_READ))
3376 case LL_LEASE_UNLCK:
3377 RETURN(ll_file_unlock_lease(file, ioc, arg));
3382 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3384 /* apply for lease */
3385 if (ioc->lil_flags & LL_LEASE_RESYNC)
3386 open_flags = MDS_OPEN_RESYNC;
3387 och = ll_lease_open(inode, file, fmode, open_flags);
3389 RETURN(PTR_ERR(och));
/* For resync, also refresh the layout version before publishing. */
3391 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3392 rc = ll_lease_file_resync(och, inode, arg);
3394 ll_lease_close(och, inode, NULL);
3397 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3399 ll_lease_close(och, inode, NULL);
/* Publish the lease on the fd unless one already exists. */
3405 mutex_lock(&lli->lli_och_mutex);
3406 if (fd->fd_lease_och == NULL) {
3407 fd->fd_lease_och = och;
3410 mutex_unlock(&lli->lli_och_mutex);
3412 /* impossible now that only excl is supported for now */
3413 ll_lease_close(och, inode, &lease_broken);
/*
 * Snapshot the inode's access-heat values into @heat (lh_count entries),
 * decayed to the current time, under lli_heat_lock.
 */
3419 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3421 struct ll_inode_info *lli = ll_i2info(inode);
3422 struct ll_sb_info *sbi = ll_i2sbi(inode);
3423 __u64 now = ktime_get_real_seconds();
3426 spin_lock(&lli->lli_heat_lock);
3427 heat->lh_flags = lli->lli_heat_flags;
3428 for (i = 0; i < heat->lh_count; i++)
3429 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3430 now, sbi->ll_heat_decay_weight,
3431 sbi->ll_heat_period_second);
3432 spin_unlock(&lli->lli_heat_lock);
/*
 * Apply heat control flags to the inode: CLEAR zeroes all heat
 * instances; OFF disables heat accounting (cleared otherwise).
 */
3435 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3437 struct ll_inode_info *lli = ll_i2info(inode);
3440 spin_lock(&lli->lli_heat_lock);
3441 if (flags & LU_HEAT_FLAG_CLEAR)
3442 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3444 if (flags & LU_HEAT_FLAG_OFF)
3445 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3447 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3449 spin_unlock(&lli->lli_heat_lock);
/*
 * Main ioctl dispatcher for regular Lustre files.  Each recognized
 * command is either handled inline (flag get/set, fd-local state) or
 * dispatched to the dedicated helper; unrecognized commands fall
 * through to obd_iocontrol() against the data export at the bottom.
 */
3455 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3457 struct inode *inode = file_inode(file);
3458 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3462 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3463 PFID(ll_inode2fid(inode)), inode, cmd);
3464 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3466 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3467 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3471 case LL_IOC_GETFLAGS:
3472 /* Get the current value of the file flags */
3473 return put_user(fd->fd_flags, (int __user *)arg);
3474 case LL_IOC_SETFLAGS:
3475 case LL_IOC_CLRFLAGS:
3476 /* Set or clear specific file flags */
3477 /* XXX This probably needs checks to ensure the flags are
3478 * not abused, and to handle any flag side effects.
3480 if (get_user(flags, (int __user *) arg))
3483 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK only makes sense for O_DIRECT I/O. */
3484 if ((flags & LL_FILE_IGNORE_LOCK) &&
3485 !(file->f_flags & O_DIRECT)) {
3486 CERROR("%s: unable to disable locking on "
3487 "non-O_DIRECT file\n", current->comm);
3491 fd->fd_flags |= flags;
3493 fd->fd_flags &= ~flags;
3496 case LL_IOC_LOV_SETSTRIPE:
3497 case LL_IOC_LOV_SETSTRIPE_NEW:
3498 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3499 case LL_IOC_LOV_SETEA:
3500 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3501 case LL_IOC_LOV_SWAP_LAYOUTS: {
3503 struct lustre_swap_layouts lsl;
3505 if (copy_from_user(&lsl, (char __user *)arg,
3506 sizeof(struct lustre_swap_layouts)))
3509 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3512 file2 = fget(lsl.sl_fd);
3516 /* O_WRONLY or O_RDWR */
3517 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3518 GOTO(out, rc = -EPERM);
/* SWAP_LAYOUTS_CLOSE swaps under the lease held on this fd. */
3520 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3521 struct inode *inode2;
3522 struct ll_inode_info *lli;
3523 struct obd_client_handle *och = NULL;
3525 lli = ll_i2info(inode);
3526 mutex_lock(&lli->lli_och_mutex);
3527 if (fd->fd_lease_och != NULL) {
3528 och = fd->fd_lease_och;
3529 fd->fd_lease_och = NULL;
3531 mutex_unlock(&lli->lli_och_mutex);
3533 GOTO(out, rc = -ENOLCK);
3534 inode2 = file_inode(file2);
3535 rc = ll_swap_layouts_close(och, inode, inode2);
3537 rc = ll_swap_layouts(file, file2, &lsl);
3543 case LL_IOC_LOV_GETSTRIPE:
3544 case LL_IOC_LOV_GETSTRIPE_NEW:
3545 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3546 case FS_IOC_GETFLAGS:
3547 case FS_IOC_SETFLAGS:
3548 RETURN(ll_iocontrol(inode, file, cmd, arg));
3549 case FSFILT_IOC_GETVERSION:
3550 case FS_IOC_GETVERSION:
3551 RETURN(put_user(inode->i_generation, (int __user *)arg));
3552 /* We need to special case any other ioctls we want to handle,
3553 * to send them to the MDS/OST as appropriate and to properly
3554 * network encode the arg field. */
3555 case FS_IOC_SETVERSION:
3558 case LL_IOC_GROUP_LOCK:
3559 RETURN(ll_get_grouplock(inode, file, arg));
3560 case LL_IOC_GROUP_UNLOCK:
3561 RETURN(ll_put_grouplock(inode, file, arg));
3562 case IOC_OBD_STATFS:
3563 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3565 case LL_IOC_FLUSHCTX:
3566 RETURN(ll_flush_ctx(inode));
3567 case LL_IOC_PATH2FID: {
3568 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3569 sizeof(struct lu_fid)))
3574 case LL_IOC_GETPARENT:
3575 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3577 case OBD_IOC_FID2PATH:
3578 RETURN(ll_fid2path(inode, (void __user *)arg));
3579 case LL_IOC_DATA_VERSION: {
3580 struct ioc_data_version idv;
3583 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are meaningful from userspace. */
3586 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3587 rc = ll_ioc_data_version(inode, &idv);
3590 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3596 case LL_IOC_GET_MDTIDX: {
3599 mdtidx = ll_get_mdt_idx(inode);
3603 if (put_user((int)mdtidx, (int __user *)arg))
3608 case OBD_IOC_GETDTNAME:
3609 case OBD_IOC_GETMDNAME:
3610 RETURN(ll_get_obd_name(inode, cmd, arg));
3611 case LL_IOC_HSM_STATE_GET: {
3612 struct md_op_data *op_data;
3613 struct hsm_user_state *hus;
3620 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3621 LUSTRE_OPC_ANY, hus);
3622 if (IS_ERR(op_data)) {
3624 RETURN(PTR_ERR(op_data));
3627 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3630 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3633 ll_finish_md_op_data(op_data);
3637 case LL_IOC_HSM_STATE_SET: {
3638 struct hsm_state_set *hss;
3645 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3650 rc = ll_hsm_state_set(inode, hss);
3655 case LL_IOC_HSM_ACTION: {
3656 struct md_op_data *op_data;
3657 struct hsm_current_action *hca;
3664 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3665 LUSTRE_OPC_ANY, hca);
3666 if (IS_ERR(op_data)) {
3668 RETURN(PTR_ERR(op_data));
3671 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3674 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3677 ll_finish_md_op_data(op_data);
3681 case LL_IOC_SET_LEASE_OLD: {
/* Legacy interface: mode comes directly in arg, no flags. */
3682 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3684 RETURN(ll_file_set_lease(file, &ioc, 0));
3686 case LL_IOC_SET_LEASE: {
3687 struct ll_ioc_lease ioc;
3689 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3692 RETURN(ll_file_set_lease(file, &ioc, arg));
3694 case LL_IOC_GET_LEASE: {
3695 struct ll_inode_info *lli = ll_i2info(inode);
3696 struct ldlm_lock *lock = NULL;
3699 mutex_lock(&lli->lli_och_mutex);
3700 if (fd->fd_lease_och != NULL) {
3701 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only if its lock is still valid. */
3703 lock = ldlm_handle2lock(&och->och_lease_handle);
3705 lock_res_and_lock(lock);
3706 if (!ldlm_is_cancel(lock))
3707 fmode = och->och_flags;
3709 unlock_res_and_lock(lock);
3710 LDLM_LOCK_PUT(lock);
3713 mutex_unlock(&lli->lli_och_mutex);
3715 RETURN(ll_lease_type_from_fmode(fmode));
3717 case LL_IOC_HSM_IMPORT: {
3718 struct hsm_user_import *hui;
3724 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3729 rc = ll_hsm_import(inode, file, hui);
3734 case LL_IOC_FUTIMES_3: {
3735 struct ll_futimes_3 lfu;
3737 if (copy_from_user(&lfu,
3738 (const struct ll_futimes_3 __user *)arg,
3742 RETURN(ll_file_futimes_3(file, &lfu));
3744 case LL_IOC_LADVISE: {
3745 struct llapi_ladvise_hdr *k_ladvise_hdr;
3746 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3749 int alloc_size = sizeof(*k_ladvise_hdr);
3752 u_ladvise_hdr = (void __user *)arg;
/* First copy just the header to learn the advice count... */
3753 OBD_ALLOC_PTR(k_ladvise_hdr);
3754 if (k_ladvise_hdr == NULL)
3757 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3758 GOTO(out_ladvise, rc = -EFAULT);
3760 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3761 k_ladvise_hdr->lah_count < 1)
3762 GOTO(out_ladvise, rc = -EINVAL);
3764 num_advise = k_ladvise_hdr->lah_count;
3765 if (num_advise >= LAH_COUNT_MAX)
3766 GOTO(out_ladvise, rc = -EFBIG);
/* ...then reallocate and copy header plus all advice records. */
3768 OBD_FREE_PTR(k_ladvise_hdr);
3769 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3770 lah_advise[num_advise]);
3771 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3772 if (k_ladvise_hdr == NULL)
3776 * TODO: submit multiple advices to one server in a single RPC
3778 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3779 GOTO(out_ladvise, rc = -EFAULT);
3781 for (i = 0; i < num_advise; i++) {
3782 struct llapi_lu_ladvise *k_ladvise =
3783 &k_ladvise_hdr->lah_advise[i];
3784 struct llapi_lu_ladvise __user *u_ladvise =
3785 &u_ladvise_hdr->lah_advise[i];
3787 rc = ll_ladvise_sanity(inode, k_ladvise);
3789 GOTO(out_ladvise, rc);
3791 switch (k_ladvise->lla_advice) {
3792 case LU_LADVISE_LOCKNOEXPAND:
3793 rc = ll_lock_noexpand(file,
3794 k_ladvise->lla_peradvice_flags);
3795 GOTO(out_ladvise, rc);
3796 case LU_LADVISE_LOCKAHEAD:
3798 rc = ll_file_lock_ahead(file, k_ladvise);
3801 GOTO(out_ladvise, rc);
3804 &u_ladvise->lla_lockahead_result))
3805 GOTO(out_ladvise, rc = -EFAULT);
3808 rc = ll_ladvise(inode, file,
3809 k_ladvise_hdr->lah_flags,
3812 GOTO(out_ladvise, rc);
3819 OBD_FREE(k_ladvise_hdr, alloc_size);
3822 case LL_IOC_FLR_SET_MIRROR: {
3823 /* mirror I/O must be direct to avoid polluting page cache
3825 if (!(file->f_flags & O_DIRECT))
3828 fd->fd_designated_mirror = (__u32)arg;
3831 case LL_IOC_FSGETXATTR:
3832 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3833 case LL_IOC_FSSETXATTR:
3834 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3836 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3837 case LL_IOC_HEAT_GET: {
3838 struct lu_heat uheat;
3839 struct lu_heat *heat;
3842 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
/* Clamp the requested count to what the client tracks. */
3845 if (uheat.lh_count > OBD_HEAT_COUNT)
3846 uheat.lh_count = OBD_HEAT_COUNT;
3848 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3849 OBD_ALLOC(heat, size);
3853 heat->lh_count = uheat.lh_count;
3854 ll_heat_get(inode, heat);
3855 rc = copy_to_user((char __user *)arg, heat, size);
3856 OBD_FREE(heat, size);
3857 RETURN(rc ? -EFAULT : 0);
3859 case LL_IOC_HEAT_SET: {
3862 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3865 rc = ll_heat_set(inode, flags);
3868 case LL_IOC_PCC_DETACH: {
3869 struct lu_pcc_detach *detach;
3871 OBD_ALLOC_PTR(detach);
3875 if (copy_from_user(detach,
3876 (const struct lu_pcc_detach __user *)arg,
3878 GOTO(out_detach_free, rc = -EFAULT);
3880 if (!S_ISREG(inode->i_mode))
3881 GOTO(out_detach_free, rc = -EINVAL);
3883 if (!inode_owner_or_capable(inode))
3884 GOTO(out_detach_free, rc = -EPERM);
3886 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3888 OBD_FREE_PTR(detach);
3891 case LL_IOC_PCC_STATE: {
3892 struct lu_pcc_state __user *ustate =
3893 (struct lu_pcc_state __user *)arg;
3894 struct lu_pcc_state *state;
3896 OBD_ALLOC_PTR(state);
3900 if (copy_from_user(state, ustate, sizeof(*state)))
3901 GOTO(out_state, rc = -EFAULT);
3903 rc = pcc_ioctl_state(file, inode, state);
3905 GOTO(out_state, rc);
3907 if (copy_to_user(ustate, state, sizeof(*state)))
3908 GOTO(out_state, rc = -EFAULT);
3911 OBD_FREE_PTR(state);
/* Unknown command: let the data stack (OSC/LOV) try to handle it. */
3915 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3916 (void __user *)arg));
/* Compat implementations, only built when the kernel does not provide
 * generic_file_llseek_size() itself. */
3920 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against sign/size limits and commit it to f_pos,
 * resetting f_version so readdir-style users notice the seek.
 */
3921 static inline loff_t
3922 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3924 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3926 if (offset > maxsize)
3929 if (offset != file->f_pos) {
3930 file->f_pos = offset;
3931 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): seek within
 * [0, maxsize] honoring SEEK_CUR/SEEK_DATA/SEEK_HOLE semantics with
 * @eof as the effective file size.
 */
3937 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3938 loff_t maxsize, loff_t eof)
3940 struct inode *inode = file_inode(file);
3948 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3949 * position-querying operation. Avoid rewriting the "same"
3950 * f_pos value back to the file because a concurrent read(),
3951 * write() or lseek() might have altered it
3956 * f_lock protects against read/modify/write race with other
3957 * SEEK_CURs. Note that parallel writes and reads behave
3961 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3962 inode_unlock(inode);
3966 * In the generic case the entire file is data, so as long as
3967 * offset isn't at the end of the file then the offset is data.
3974 * There is a virtual hole at the end of the file, so as long as
3975 * offset isn't i_size or larger, return i_size.
3983 return llseek_execute(file, offset, maxsize);
/*
 * llseek file operation.  For size-relative origins (SEEK_END,
 * SEEK_HOLE, SEEK_DATA) the size is first refreshed from the OSTs via
 * ll_glimpse_size() before delegating to the generic llseek helper.
 */
3987 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3989 struct inode *inode = file_inode(file);
3990 loff_t retval, eof = 0;
3993 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3994 (origin == SEEK_CUR) ? file->f_pos : 0);
3995 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3996 PFID(ll_inode2fid(inode)), inode, retval, retval,
3998 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
4000 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4001 retval = ll_glimpse_size(inode);
4004 eof = i_size_read(inode);
/* presumably a macro mapping to generic_file_llseek_size() above —
 * confirm ll_generic_file_llseek_size definition in full source. */
4007 retval = ll_generic_file_llseek_size(file, offset, origin,
4008 ll_file_maxbytes(inode), eof);
/*
 * flush file operation: report (and clear) async writeback errors
 * recorded on the inode and its lov object.  Errors already reported
 * to the application via fd_write_failed are not reported twice.
 */
4012 static int ll_flush(struct file *file, fl_owner_t id)
4014 struct inode *inode = file_inode(file);
4015 struct ll_inode_info *lli = ll_i2info(inode);
4016 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4019 LASSERT(!S_ISDIR(inode->i_mode));
4021 /* catch async errors that were recorded back when async writeback
4022 * failed for pages in this mapping. */
4023 rc = lli->lli_async_rc;
4024 lli->lli_async_rc = 0;
4025 if (lli->lli_clob != NULL) {
4026 err = lov_read_and_clear_async_rc(lli->lli_clob);
4031 /* The application has been told write failure already.
4032 * Do not report failure again. */
4033 if (fd->fd_write_failed)
4035 return rc ? -EIO : 0;
/*
 * \param inode         inode whose pages are to be written out
 * \param start, end    byte range to sync (inclusive)
 * \param mode          one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout set to bypass layout verification for this IO
 */
4039 * Called to make sure a portion of file has been written out.
4040 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4042 * Return how many pages have been written.
4044 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4045 enum cl_fsync_mode mode, int ignore_layout)
4049 struct cl_fsync_io *fio;
/* Reject any mode outside the four supported fsync modes. */
4054 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4055 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4058 env = cl_env_get(&refcheck);
4060 RETURN(PTR_ERR(env));
/* Build a CIT_FSYNC cl_io against the file's cl_object. */
4062 io = vvp_env_thread_io(env);
4063 io->ci_obj = ll_i2info(inode)->lli_clob;
4064 io->ci_ignore_layout = ignore_layout;
4066 /* initialize parameters for sync */
4067 fio = &io->u.ci_fsync;
4068 fio->fi_start = start;
4070 fio->fi_fid = ll_inode2fid(inode);
4071 fio->fi_mode = mode;
4072 fio->fi_nr_written = 0;
4074 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4075 result = cl_io_loop(env, io);
4077 result = io->ci_result;
/* On success report the number of pages written by the sync IO. */
4079 result = fio->fi_nr_written;
4080 cl_io_fini(env, io);
4081 cl_env_put(env, &refcheck);
/*
 * fsync()/fdatasync() method: flush dirty pages, clear recorded async
 * errors, sync metadata on the MDT, then sync file data (via PCC if the
 * file is cached there, otherwise via cl_sync_file_range()).
 */
4087 * When dentry is provided (the 'else' case), file_dentry() may be
4088 * null and dentry must be used directly rather than pulled from
4089 * file_dentry() as is done otherwise.
4092 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4094 struct dentry *dentry = file_dentry(file);
4095 struct inode *inode = dentry->d_inode;
4096 struct ll_inode_info *lli = ll_i2info(inode);
4097 struct ptlrpc_request *req;
4102 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4104 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4106 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4108 /* fsync's caller has already called _fdata{sync,write}, we want
4109 * that IO to finish before calling the osc and mdc sync methods */
4110 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4113 /* catch async errors that were recorded back when async writeback
4114 * failed for pages in this mapping. */
4115 if (!S_ISDIR(inode->i_mode)) {
/* Consume-and-clear the inode-level async error, as in ll_flush(). */
4116 err = lli->lli_async_rc;
4117 lli->lli_async_rc = 0;
4120 if (lli->lli_clob != NULL) {
4121 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata on the MDT. */
4127 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4131 ptlrpc_req_finished(req);
4133 if (S_ISREG(inode->i_mode)) {
4134 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4137 /* Sync metadata on MDT first, and then sync the cached data
/* Try the PCC copy first; fall back to OST range sync if not cached. */
4140 err = pcc_fsync(file, start, end, datasync, &cached);
4142 err = cl_sync_file_range(inode, start, end,
4144 if (rc == 0 && err < 0)
/* Track write failure so ll_flush() does not report it twice. */
4147 fd->fd_write_failed = true;
4149 fd->fd_write_failed = false;
4152 inode_unlock(inode);
/*
 * flock()/fcntl() lock method: translate the kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirror the result into the local
 * lock tables so the VFS sees a consistent state.
 * NOTE(review): this excerpt elides intermediate source lines (e.g. the
 * switch statement heads); comments describe only the visible code.
 */
4157 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4159 struct inode *inode = file_inode(file);
4160 struct ll_sb_info *sbi = ll_i2sbi(inode);
4161 struct ldlm_enqueue_info einfo = {
4162 .ei_type = LDLM_FLOCK,
4163 .ei_cb_cp = ldlm_flock_completion_ast,
4164 .ei_cbdata = file_lock,
4166 struct md_op_data *op_data;
4167 struct lustre_handle lockh = { 0 };
4168 union ldlm_policy_data flock = { { 0 } };
/* Remember the original type; einfo.ei_mode is written into fl_type
 * below and must be restored for non-TEST requests. */
4169 int fl_type = file_lock->fl_type;
4175 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4176 PFID(ll_inode2fid(inode)), file_lock);
4178 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4180 if (file_lock->fl_flags & FL_FLOCK) {
4181 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4182 /* flocks are whole-file locks */
4183 flock.l_flock.end = OFFSET_MAX;
4184 /* For flocks owner is determined by the local file desctiptor*/
4185 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4186 } else if (file_lock->fl_flags & FL_POSIX) {
/* POSIX byte-range locks keep the caller's owner and range. */
4187 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4188 flock.l_flock.start = file_lock->fl_start;
4189 flock.l_flock.end = file_lock->fl_end;
4193 flock.l_flock.pid = file_lock->fl_pid;
4195 /* Somewhat ugly workaround for svc lockd.
4196 * lockd installs custom fl_lmops->lm_compare_owner that checks
4197 * for the fl_owner to be the same (which it always is on local node
4198 * I guess between lockd processes) and then compares pid.
4199 * As such we assign pid to the owner field to make it all work,
4200 * conflict with normal locks is unlikely since pid space and
4201 * pointer space for current->files are not intersecting */
4202 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4203 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fl_type to an LDLM mode: read lock -> LCK_PR. */
4207 einfo.ei_mode = LCK_PR;
4210 /* An unlock request may or may not have any relation to
4211 * existing locks so we may not be able to pass a lock handle
4212 * via a normal ldlm_lock_cancel() request. The request may even
4213 * unlock a byte range in the middle of an existing lock. In
4214 * order to process an unlock request we need all of the same
4215 * information that is given with a normal read or write record
4216 * lock request. To avoid creating another ldlm unlock (cancel)
4217 * message we'll treat a LCK_NL flock request as an unlock. */
4218 einfo.ei_mode = LCK_NL;
/* Write lock -> LCK_PW. */
4221 einfo.ei_mode = LCK_PW;
4224 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking set requests must not wait on conflicting locks. */
4239 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style requests only test, never take, the lock. */
4245 flags = LDLM_FL_TEST_LOCK;
4248 CERROR("unknown fcntl lock command: %d\n", cmd);
4252 /* Save the old mode so that if the mode in the lock changes we
4253 * can decrement the appropriate reader or writer refcount. */
4254 file_lock->fl_type = einfo.ei_mode;
4256 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4257 LUSTRE_OPC_ANY, NULL);
4258 if (IS_ERR(op_data))
4259 RETURN(PTR_ERR(op_data));
4261 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4262 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4263 flock.l_flock.pid, flags, einfo.ei_mode,
4264 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDT. */
4266 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4269 /* Restore the file lock type if not TEST lock. */
4270 if (!(flags & LDLM_FL_TEST_LOCK))
4271 file_lock->fl_type = fl_type;
/* Mirror a granted (or unlock) result into the kernel's lock tables;
 * the API differs across kernel versions, hence the #ifdef. */
4273 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4274 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4275 !(flags & LDLM_FL_TEST_LOCK))
4276 rc2 = locks_lock_file_wait(file, file_lock);
4278 if ((file_lock->fl_flags & FL_FLOCK) &&
4279 (rc == 0 || file_lock->fl_type == F_UNLCK))
4280 rc2 = flock_lock_file_wait(file, file_lock);
4281 if ((file_lock->fl_flags & FL_POSIX) &&
4282 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4283 !(flags & LDLM_FL_TEST_LOCK))
4284 rc2 = posix_lock_file_wait(file, file_lock);
4285 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: drop the server lock again (LCK_NL acts
 * as an unlock, see the comment above) to stay consistent. */
4287 if (rc2 && file_lock->fl_type != F_UNLCK) {
4288 einfo.ei_mode = LCK_NL;
4289 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4294 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent via
 * an MDT getattr-by-name, storing it in *@fid; when @inode is non-NULL the
 * corresponding inode is also instantiated via ll_prep_inode().
 */
4299 int ll_get_fid_by_name(struct inode *parent, const char *name,
4300 int namelen, struct lu_fid *fid,
4301 struct inode **inode)
4303 struct md_op_data *op_data = NULL;
4304 struct mdt_body *body;
4305 struct ptlrpc_request *req;
4309 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4310 LUSTRE_OPC_ANY, NULL);
4311 if (IS_ERR(op_data))
4312 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are needed from the server. */
4314 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4315 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4316 ll_finish_md_op_data(op_data);
4320 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4322 GOTO(out_req, rc = -EFAULT);
4324 *fid = body->mbo_fid1;
4327 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4329 ptlrpc_req_finished(req);
/*
 * Migrate the child @name of directory @parent to another MDT, as
 * described by the lmv_user_md @lum.  Implemented as an MDS rename with
 * CLI_MIGRATE set; for regular files a write lease is taken first so the
 * data version can be validated across the migration.
 * NOTE(review): this excerpt elides intermediate source lines; comments
 * describe only the visible code.
 */
4333 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4336 struct dentry *dchild = NULL;
4337 struct inode *child_inode = NULL;
4338 struct md_op_data *op_data;
4339 struct ptlrpc_request *request = NULL;
4340 struct obd_client_handle *och = NULL;
4342 struct mdt_body *body;
4343 __u64 data_version = 0;
4344 size_t namelen = strlen(name);
4345 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4349 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4350 PFID(ll_inode2fid(parent)), name,
4351 lum->lum_stripe_offset, lum->lum_stripe_count);
/* Ensure the user-supplied LMV is in little-endian wire order. */
4353 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4354 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4355 lustre_swab_lmv_user_md(lum);
4357 /* Get child FID first */
4358 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer a cached dentry; fall back to an MDT lookup by name. */
4361 dchild = d_lookup(file_dentry(file), &qstr);
4363 if (dchild->d_inode)
4364 child_inode = igrab(dchild->d_inode);
4369 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* Striped-directory migration needs OBD_CONNECT2_DIR_MIGRATE support
 * on the server; refuse it otherwise. */
4378 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4379 OBD_CONNECT2_DIR_MIGRATE)) {
4380 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4381 ll_dir_striped(child_inode)) {
4382 CERROR("%s: MDT doesn't support stripe directory "
4383 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4384 GOTO(out_iput, rc = -EOPNOTSUPP);
4389 * lfs migrate command needs to be blocked on the client
4390 * by checking the migrate FID against the FID of the
/* Never allow migrating the filesystem root itself. */
4393 if (child_inode == parent->i_sb->s_root->d_inode)
4394 GOTO(out_iput, rc = -EINVAL);
4396 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4397 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4398 if (IS_ERR(op_data))
4399 GOTO(out_iput, rc = PTR_ERR(op_data));
4401 inode_lock(child_inode);
4402 op_data->op_fid3 = *ll_inode2fid(child_inode);
4403 if (!fid_is_sane(&op_data->op_fid3)) {
4404 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4405 ll_i2sbi(parent)->ll_fsname, name,
4406 PFID(&op_data->op_fid3));
4407 GOTO(out_unlock, rc = -EINVAL);
/* Ship the LMV layout along with the migrate rename. */
4410 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4411 op_data->op_data = lum;
4412 op_data->op_data_size = lumlen;
/* Regular files: take a write lease and record the data version so a
 * concurrent writer invalidates (and retries) the migration. */
4415 if (S_ISREG(child_inode->i_mode)) {
4416 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4420 GOTO(out_unlock, rc);
4423 rc = ll_data_version(child_inode, &data_version,
4426 GOTO(out_close, rc);
4428 op_data->op_open_handle = och->och_open_handle;
4429 op_data->op_data_version = data_version;
4430 op_data->op_lease_handle = och->och_lease_handle;
4431 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* The open must not be replayed while migration is in flight. */
4433 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4434 och->och_mod->mod_open_req->rq_replay = 0;
4435 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Same source and target name: the rename carries the migration. */
4438 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4439 name, namelen, &request);
4441 LASSERT(request != NULL);
4442 ll_update_times(request, parent);
4445 if (rc == 0 || rc == -EAGAIN) {
4446 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4447 LASSERT(body != NULL);
4449 /* If the server does release layout lock, then we cleanup
4450 * the client och here, otherwise release it in out_close: */
4451 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4452 obd_mod_put(och->och_mod);
4453 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4455 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4461 if (request != NULL) {
4462 ptlrpc_req_finished(request);
4466 /* Try again if the lease has cancelled. */
4467 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
/* Cleanup path: close the lease, drop nlink, unlock and free op_data. */
4472 ll_lease_close(och, child_inode, NULL);
4474 clear_nlink(child_inode);
4476 inode_unlock(child_inode);
4477 ll_finish_md_op_data(op_data);
/*
 * Lock method installed for '-o noflock' mounts: flock()/fcntl() locking
 * is disabled, so warn once per file descriptor and reject the request.
 */
4484 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4486 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4490 * In order to avoid flood of warning messages, only print one message
4491 * for one file. And the entire message rate on the client is limited
4492 * by CDEBUG_LIMIT too.
/* LL_FILE_FLOCK_WARNING marks that this fd has already warned. */
4494 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4495 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4496 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4497 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4503 * test if some locks matching bits and l_req_mode are acquired
4504 * - bits can be in different locks
4505 * - if found clear the common lock bits in *bits
4506 * - the bits not found, are kept in *bits
4508 * \param bits [IN] searched lock bits [IN]
4509 * \param l_req_mode [IN] searched lock mode
4510 * \retval boolean, true iff all bits are found
4512 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4514 struct lustre_handle lockh;
4515 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four read/write modes. */
4516 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4517 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4526 fid = &ll_i2info(inode)->lli_fid;
4527 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4528 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching lock, never take a reference. */
4530 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually until all are resolved. */
4531 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4532 policy.l_inodebits.bits = *bits & (1 << i);
4533 if (policy.l_inodebits.bits == 0)
4536 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4537 &policy, mode, &lockh)) {
4538 struct ldlm_lock *lock;
/* A match may cover more bits than the one probed: clear them all. */
4540 lock = ldlm_handle2lock(&lockh);
4543 ~(lock->l_policy_data.l_inodebits.bits);
4544 LDLM_LOCK_PUT(lock);
4546 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a granted MDT inodebits lock covering
 * @bits in mode @mode; the handle is returned via @lockh.  Returns the
 * matched mode (the md_lock_match() result), 0 if nothing matched.
 */
4553 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4554 struct lustre_handle *lockh, __u64 flags,
4555 enum ldlm_mode mode)
4557 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4562 fid = &ll_i2info(inode)->lli_fid;
4563 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4565 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4566 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: -ENOENT on an
 * already-unlinked inode is tolerated in specific cases, other errors are
 * logged (rate-limited) and propagated.
 */
4571 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4573 /* Already unlinked. Just update nlink and return success */
4574 if (rc == -ENOENT) {
4576 /* If it is striped directory, and there is bad stripe
4577 * Let's revalidate the dentry again, instead of returning
4579 if (ll_dir_striped(inode))
4582 /* This path cannot be hit for regular files unless in
4583 * case of obscure races, so no need to to validate
4585 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4587 } else if (rc != 0) {
/* Expected permission/identity errors log at D_INFO, others at D_ERROR. */
4588 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4589 "%s: revalidate FID "DFID" error: rc = %d\n",
4590 ll_i2sbi(inode)->ll_fsname,
4591 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDT with an intent lock of type
 * @op (e.g. IT_GETATTR, IT_LOOKUP).  The getattr is done by FID, so no
 * name is passed; unlinked dentries are invalidated from the dcache.
 */
4597 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4599 struct inode *inode = dentry->d_inode;
4600 struct obd_export *exp = ll_i2mdexp(inode);
4601 struct lookup_intent oit = {
4604 struct ptlrpc_request *req = NULL;
4605 struct md_op_data *op_data;
4609 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4610 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4612 /* Call getattr by fid, so do not provide name at all. */
4613 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4614 LUSTRE_OPC_ANY, NULL);
4615 if (IS_ERR(op_data))
4616 RETURN(PTR_ERR(op_data));
4618 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4619 ll_finish_md_op_data(op_data);
/* Map -ENOENT / expected errors, log the rest (see _fini above). */
4621 rc = ll_inode_revalidate_fini(inode, rc);
4625 rc = ll_revalidate_it_finish(req, &oit, dentry);
4627 ll_intent_release(&oit);
4631 /* Unlinked? Unhash dentry, so it is not picked up later by
4632 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4633 * here to preserve get_cwd functionality on 2.6.
4635 if (!dentry->d_inode->i_nlink) {
4636 ll_lock_dcache(inode);
4637 d_lustre_invalidate(dentry, 0);
4638 ll_unlock_dcache(inode);
4641 ll_lookup_finish_locks(&oit, dentry);
4643 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe attributes (nlink, blocks,
 * size, a/m/ctime) from all MDT stripes into the master inode.
 * A no-op for non-striped directories.
 */
4648 static int ll_merge_md_attr(struct inode *inode)
4650 struct ll_inode_info *lli = ll_i2info(inode);
4651 struct cl_attr attr = { 0 };
4654 LASSERT(lli->lli_lsm_md != NULL);
4656 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem protects the LMV stripe metadata during the merge. */
4659 down_read(&lli->lli_lsm_sem);
4660 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4661 &attr, ll_md_blocking_ast);
4662 up_read(&lli->lli_lsm_sem);
/* Publish the merged attributes on the inode / ll_inode_info. */
4666 set_nlink(inode, attr.cat_nlink);
4667 inode->i_blocks = attr.cat_blocks;
4668 i_size_write(inode, attr.cat_size);
4670 ll_i2info(inode)->lli_atime = attr.cat_atime;
4671 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4672 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core of getattr(): revalidate the inode on the MDT, refresh size (from
 * PCC or by glimpsing the OSTs for regular files; by merging stripe
 * attributes for striped directories), then fill *stat from the inode.
 */
4677 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4679 struct inode *inode = de->d_inode;
4680 struct ll_sb_info *sbi = ll_i2sbi(inode);
4681 struct ll_inode_info *lli = ll_i2info(inode);
4684 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4686 rc = ll_inode_revalidate(de, IT_GETATTR);
4690 if (S_ISREG(inode->i_mode)) {
/* A PCC-cached file answers getattr from the local cache copy. */
4693 rc = pcc_inode_getattr(inode, &cached);
4694 if (cached && rc < 0)
4697 /* In case of restore, the MDT has the right size and has
4698 * already send it back without granting the layout lock,
4699 * inode is up-to-date so glimpse is useless.
4700 * Also to glimpse we need the layout, in case of a running
4701 * restore the MDT holds the layout lock so the glimpse will
4702 * block up to the end of restore (getattr will block)
4704 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4705 rc = ll_glimpse_size(inode);
4710 /* If object isn't regular a file then don't validate size. */
4711 if (ll_dir_striped(inode)) {
4712 rc = ll_merge_md_attr(inode);
/* Copy the cached timestamps from ll_inode_info into the inode. */
4717 inode->i_atime.tv_sec = lli->lli_atime;
4718 inode->i_mtime.tv_sec = lli->lli_mtime;
4719 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Test-only delay hook for getattr races. */
4722 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs inode/device numbers squeezed into 32 bits. */
4724 if (ll_need_32bit_api(sbi)) {
4725 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4726 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4727 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4729 stat->ino = inode->i_ino;
4730 stat->dev = inode->i_sb->s_dev;
4731 stat->rdev = inode->i_rdev;
4734 stat->mode = inode->i_mode;
4735 stat->uid = inode->i_uid;
4736 stat->gid = inode->i_gid;
4737 stat->atime = inode->i_atime;
4738 stat->mtime = inode->i_mtime;
4739 stat->ctime = inode->i_ctime;
/* Prefer the tunable per-sb stat blocksize when it is configured. */
4740 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4742 stat->nlink = inode->i_nlink;
4743 stat->size = i_size_read(inode);
4744 stat->blocks = inode->i_blocks;
/*
 * getattr() inode operation: thin wrapper around ll_getattr_dentry()
 * accommodating both the old (vfsmount, dentry) and the new (path,
 * request_mask, flags) kernel signatures.
 */
4749 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4750 int ll_getattr(const struct path *path, struct kstat *stat,
4751 u32 request_mask, unsigned int flags)
4753 struct dentry *de = path->dentry;
4755 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4758 return ll_getattr_dentry(de, stat);
/*
 * FIEMAP inode operation: marshal the kernel's fiemap_extent_info into a
 * contiguous struct fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back to the user-space extent array.
 */
4761 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4762 __u64 start, __u64 len)
4766 struct fiemap *fiemap;
4767 unsigned int extent_count = fieinfo->fi_extents_max;
/* One allocation holds the header plus all requested extent slots. */
4769 num_bytes = sizeof(*fiemap) + (extent_count *
4770 sizeof(struct fiemap_extent));
4771 OBD_ALLOC_LARGE(fiemap, num_bytes);
4776 fiemap->fm_flags = fieinfo->fi_flags;
4777 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4778 fiemap->fm_start = start;
4779 fiemap->fm_length = len;
/* Only the first user extent is copied in here (one fiemap_extent);
 * NOTE(review): presumably it can seed a continuation — confirm against
 * ll_do_fiemap()'s contract. */
4780 if (extent_count > 0 &&
4781 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4782 sizeof(struct fiemap_extent)) != 0)
4783 GOTO(out, rc = -EFAULT);
4785 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Propagate result flags/counts and the mapped extents to userspace. */
4787 fieinfo->fi_flags = fiemap->fm_flags;
4788 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4789 if (extent_count > 0 &&
4790 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4791 fiemap->fm_mapped_extents *
4792 sizeof(struct fiemap_extent)) != 0)
4793 GOTO(out, rc = -EFAULT);
4795 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * get_acl() inode operation: return a referenced copy of the cached POSIX
 * ACL under lli_lock; the VFS drops the reference after its check.
 */
4799 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4801 struct ll_inode_info *lli = ll_i2info(inode);
4802 struct posix_acl *acl = NULL;
4805 spin_lock(&lli->lli_lock);
4806 /* VFS' acl_permission_check->check_acl will release the refcount */
4807 acl = posix_acl_dup(lli->lli_posix_acl);
4808 spin_unlock(&lli->lli_lock);
/*
 * set_acl() inode operation: serialize the POSIX ACL to its xattr
 * representation and store (or remove, when @acl is NULL) it on the MDT,
 * then update the local ACL cache.
 */
4813 #ifdef HAVE_IOP_SET_ACL
4814 #ifdef CONFIG_FS_POSIX_ACL
4815 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4817 struct ll_sb_info *sbi = ll_i2sbi(inode);
4818 struct ptlrpc_request *req = NULL;
4819 const char *name = NULL;
4821 size_t value_size = 0;
4826 case ACL_TYPE_ACCESS:
4827 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* Access ACLs may also rewrite the file mode bits. */
4829 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4832 case ACL_TYPE_DEFAULT:
4833 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs are only meaningful on directories. */
4834 if (!S_ISDIR(inode->i_mode))
4835 rc = acl ? -EACCES : 0;
/* Serialize the ACL into its on-wire xattr form. */
4846 value_size = posix_acl_xattr_size(acl->a_count);
4847 value = kmalloc(value_size, GFP_NOFS);
4849 GOTO(out, rc = -ENOMEM);
4851 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4853 GOTO(out_value, rc);
/* NULL value means remove the xattr rather than set it. */
4856 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4857 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4858 name, value, value_size, 0, 0, &req);
4860 ptlrpc_req_finished(req);
/* Keep the kernel's ACL cache coherent with the server state. */
4865 forget_cached_acl(inode, type);
4867 set_cached_acl(inode, type, acl);
4870 #endif /* CONFIG_FS_POSIX_ACL */
4871 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL-check callback passed to generic_permission() on kernels whose
 * generic_permission() takes a check_acl argument (pre-2ARGS kernels).
 * Checks @mask against the inode's access ACL.
 */
4873 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4875 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4876 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4878 ll_check_acl(struct inode *inode, int mask)
4881 # ifdef CONFIG_FS_POSIX_ACL
4882 struct posix_acl *acl;
4886 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* Cannot block under RCU-walk: bail out and let the VFS retry. */
4887 if (flags & IPERM_FLAG_RCU)
4890 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4895 rc = posix_acl_permission(inode, acl, mask);
4896 posix_acl_release(acl);
4899 # else /* !CONFIG_FS_POSIX_ACL */
4901 # endif /* CONFIG_FS_POSIX_ACL */
4903 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * permission() inode operation: revalidate the root inode if needed,
 * apply root squashing (replace root's fsuid/fsgid and drop filesystem
 * capabilities) when configured, then run the generic permission check.
 */
4905 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4906 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4908 # ifdef HAVE_INODE_PERMISION_2ARGS
4909 int ll_inode_permission(struct inode *inode, int mask)
4911 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4916 struct ll_sb_info *sbi;
4917 struct root_squash_info *squash;
4918 struct cred *cred = NULL;
4919 const struct cred *old_cred = NULL;
4921 bool squash_id = false;
/* Revalidation may block; refuse RCU-walk and let the VFS retry. */
4924 #ifdef MAY_NOT_BLOCK
4925 if (mask & MAY_NOT_BLOCK)
4927 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4928 if (flags & IPERM_FLAG_RCU)
4932 /* as root inode are NOT getting validated in lookup operation,
4933 * need to do it before permission check. */
4935 if (inode == inode->i_sb->s_root->d_inode) {
4936 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4941 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4942 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4944 /* squash fsuid/fsgid if needed */
4945 sbi = ll_i2sbi(inode);
4946 squash = &sbi->ll_squash;
/* Squash only real root, and only when nosquash is not set. */
4947 if (unlikely(squash->rsi_uid != 0 &&
4948 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4949 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4953 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4954 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4955 squash->rsi_uid, squash->rsi_gid);
4957 /* update current process's credentials
4958 * and FS capability */
4959 cred = prepare_creds();
4963 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4964 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability bit from the squashed creds. */
4965 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4966 if ((1 << cap) & CFS_CAP_FS_MASK)
4967 cap_lower(cred->cap_effective, cap);
4969 old_cred = override_creds(cred);
4972 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4973 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4974 /* restore current process's credentials and FS capability */
4976 revert_creds(old_cred);
/* Default file_operations table (no .flock/.lock: flock not supported). */
4983 /* -o localflock - only provides locally consistent flock locks */
4984 struct file_operations ll_file_operations = {
/* Kernel-version compatibility: iterator-based vs classic read/write. */
4985 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4986 # ifdef HAVE_SYNC_READ_WRITE
4987 .read = new_sync_read,
4988 .write = new_sync_write,
4990 .read_iter = ll_file_read_iter,
4991 .write_iter = ll_file_write_iter,
4992 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4993 .read = ll_file_read,
4994 .aio_read = ll_file_aio_read,
4995 .write = ll_file_write,
4996 .aio_write = ll_file_aio_write,
4997 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4998 .unlocked_ioctl = ll_file_ioctl,
4999 .open = ll_file_open,
5000 .release = ll_file_release,
5001 .mmap = ll_file_mmap,
5002 .llseek = ll_file_seek,
5003 .splice_read = ll_file_splice_read,
/* file_operations for '-o flock' mounts: same as the default table but
 * with .flock/.lock wired to ll_file_flock (cluster-wide locking). */
5008 struct file_operations ll_file_operations_flock = {
5009 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5010 # ifdef HAVE_SYNC_READ_WRITE
5011 .read = new_sync_read,
5012 .write = new_sync_write,
5013 # endif /* HAVE_SYNC_READ_WRITE */
5014 .read_iter = ll_file_read_iter,
5015 .write_iter = ll_file_write_iter,
5016 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5017 .read = ll_file_read,
5018 .aio_read = ll_file_aio_read,
5019 .write = ll_file_write,
5020 .aio_write = ll_file_aio_write,
5021 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5022 .unlocked_ioctl = ll_file_ioctl,
5023 .open = ll_file_open,
5024 .release = ll_file_release,
5025 .mmap = ll_file_mmap,
5026 .llseek = ll_file_seek,
5027 .splice_read = ll_file_splice_read,
5030 .flock = ll_file_flock,
5031 .lock = ll_file_flock
/* file_operations for '-o noflock' mounts: .flock/.lock point at
 * ll_file_noflock, which warns once and rejects the request. */
5034 /* These are for -o noflock - to return ENOSYS on flock calls */
5035 struct file_operations ll_file_operations_noflock = {
5036 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5037 # ifdef HAVE_SYNC_READ_WRITE
5038 .read = new_sync_read,
5039 .write = new_sync_write,
5040 # endif /* HAVE_SYNC_READ_WRITE */
5041 .read_iter = ll_file_read_iter,
5042 .write_iter = ll_file_write_iter,
5043 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5044 .read = ll_file_read,
5045 .aio_read = ll_file_aio_read,
5046 .write = ll_file_write,
5047 .aio_write = ll_file_aio_write,
5048 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5049 .unlocked_ioctl = ll_file_ioctl,
5050 .open = ll_file_open,
5051 .release = ll_file_release,
5052 .mmap = ll_file_mmap,
5053 .llseek = ll_file_seek,
5054 .splice_read = ll_file_splice_read,
5057 .flock = ll_file_noflock,
5058 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * gated on the corresponding kernel-API availability macros. */
5061 struct inode_operations ll_file_inode_operations = {
5062 .setattr = ll_setattr,
5063 .getattr = ll_getattr,
5064 .permission = ll_inode_permission,
5065 #ifdef HAVE_IOP_XATTR
5066 .setxattr = ll_setxattr,
5067 .getxattr = ll_getxattr,
5068 .removexattr = ll_removexattr,
5070 .listxattr = ll_listxattr,
5071 .fiemap = ll_fiemap,
5072 #ifdef HAVE_IOP_GET_ACL
5073 .get_acl = ll_get_acl,
5075 #ifdef HAVE_IOP_SET_ACL
5076 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration to the inode's cl_object via cl_conf_set().
 * For OBJECT_CONF_SET the new layout comes from the layout lock's LVB; the
 * lock is allowed to match only after the layout has been applied, and the
 * cached layout generation is updated from the resulting cl_layout.
 */
5080 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5082 struct ll_inode_info *lli = ll_i2info(inode);
5083 struct cl_object *obj = lli->lli_clob;
5092 env = cl_env_get(&refcheck);
5094 RETURN(PTR_ERR(env));
5096 rc = cl_conf_set(env, lli->lli_clob, conf);
5100 if (conf->coc_opc == OBJECT_CONF_SET) {
5101 struct ldlm_lock *lock = conf->coc_lock;
5102 struct cl_layout cl = {
5106 LASSERT(lock != NULL);
5107 LASSERT(ldlm_has_layout(lock));
5109 /* it can only be allowed to match after layout is
5110 * applied to inode otherwise false layout would be
5111 * seen. Applying layout shoud happen before dropping
5112 * the intent lock. */
5113 ldlm_lock_allow_match(lock);
/* Read back the applied layout to learn its generation number. */
5115 rc = cl_object_layout_get(env, obj, &cl);
5120 DFID": layout version change: %u -> %u\n",
5121 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5123 ll_layout_version_set(lli, cl.cl_layout_gen);
5127 cl_env_put(env, &refcheck);
/*
 * Ensure @lock carries the layout LVB: when the layout lock was granted
 * via a blocked completion AST its LVB buffer is missing, so fetch the
 * LOV EA from the MDT with a getxattr and attach it to the lock.
 */
5132 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5133 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5136 struct ll_sb_info *sbi = ll_i2sbi(inode);
5137 struct ptlrpc_request *req;
5144 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5145 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5146 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do when the lock already carries LVB data. */
5148 if (lock->l_lvb_data != NULL)
5151 /* if layout lock was granted right away, the layout is returned
5152 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5153 * blocked and then granted via completion ast, we have to fetch
5154 * layout here. Please note that we can't use the LVB buffer in
5155 * completion AST because it doesn't have a large enough buffer */
5156 rc = ll_get_default_mdsize(sbi, &lmmsize);
5160 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5161 XATTR_NAME_LOV, lmmsize, &req);
5164 GOTO(out, rc = 0); /* empty layout */
5171 if (lmmsize == 0) /* empty layout */
5174 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5176 GOTO(out, rc = -EFAULT);
/* Copy the EA into a buffer we can hand over to the lock. */
5178 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5179 if (lvbdata == NULL)
5180 GOTO(out, rc = -ENOMEM);
5182 memcpy(lvbdata, lmm, lmmsize);
/* Install the buffer only if no one raced us; otherwise free ours. */
5183 lock_res_and_lock(lock);
5184 if (unlikely(lock->l_lvb_data == NULL)) {
5185 lock->l_lvb_type = LVB_T_LAYOUT;
5186 lock->l_lvb_data = lvbdata;
5187 lock->l_lvb_len = lmmsize;
5190 unlock_res_and_lock(lock);
5193 OBD_FREE_LARGE(lvbdata, lmmsize);
5198 ptlrpc_req_finished(req);
/*
 * \param lockh  handle of the granted layout lock (reference is dropped
 *               before returning)
 * \param mode   lock mode held on @lockh
 * \param inode  inode to apply the layout to
 */
5203 * Apply the layout to the inode. Layout lock is held and will be released
5206 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5207 struct inode *inode)
5209 struct ll_inode_info *lli = ll_i2info(inode);
5210 struct ll_sb_info *sbi = ll_i2sbi(inode);
5211 struct ldlm_lock *lock;
5212 struct cl_object_conf conf;
5215 bool wait_layout = false;
5218 LASSERT(lustre_handle_is_used(lockh));
5220 lock = ldlm_handle2lock(lockh);
5221 LASSERT(lock != NULL);
5222 LASSERT(ldlm_has_layout(lock));
5224 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5225 PFID(&lli->lli_fid), inode);
5227 /* in case this is a caching lock and reinstate with new inode */
5228 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5230 lock_res_and_lock(lock);
5231 lvb_ready = ldlm_is_lvb_ready(lock);
5232 unlock_res_and_lock(lock);
5234 /* checking lvb_ready is racy but this is okay. The worst case is
5235 * that multi processes may configure the file on the same time. */
/* Make sure the layout blob is present in the lock's LVB. */
5239 rc = ll_layout_fetch(inode, lock);
5243 /* for layout lock, lmm is stored in lock's lvb.
5244 * lvb_data is immutable if the lock is held so it's safe to access it
5247 * set layout to file. Unlikely this will fail as old layout was
5248 * surely eliminated */
5249 memset(&conf, 0, sizeof conf);
5250 conf.coc_opc = OBJECT_CONF_SET;
5251 conf.coc_inode = inode;
5252 conf.coc_lock = lock;
5253 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5254 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5255 rc = ll_layout_conf(inode, &conf);
5257 /* refresh layout failed, need to wait */
5258 wait_layout = rc == -EBUSY;
/* Done with the lock either way: drop our reference and the mode ref. */
5261 LDLM_LOCK_PUT(lock);
5262 ldlm_lock_decref(lockh, mode);
5264 /* wait for IO to complete if it's still being used. */
5266 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5267 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO on the old layout drains. */
5269 memset(&conf, 0, sizeof conf);
5270 conf.coc_opc = OBJECT_CONF_WAIT;
5271 conf.coc_inode = inode;
5272 rc = ll_layout_conf(inode, &conf);
5276 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5277 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
/*
 * Write-style intents (LAYOUT_INTENT_WRITE/TRUNC) request FMODE_WRITE so
 * the MDT can instantiate the needed layout components.
 */
5283 * Issue layout intent RPC to MDS.
5284 * \param inode [in] file inode
5285 * \param intent [in] layout intent
5287 * \retval 0 on success
5288 * \retval < 0 error code
5290 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5292 struct ll_inode_info *lli = ll_i2info(inode);
5293 struct ll_sb_info *sbi = ll_i2sbi(inode);
5294 struct md_op_data *op_data;
5295 struct lookup_intent it;
5296 struct ptlrpc_request *req;
5300 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5301 0, 0, LUSTRE_OPC_ANY, NULL);
5302 if (IS_ERR(op_data))
5303 RETURN(PTR_ERR(op_data));
/* The layout intent itself travels as opaque op_data. */
5305 op_data->op_data = intent;
5306 op_data->op_data_size = sizeof(*intent);
5308 memset(&it, 0, sizeof(it));
5309 it.it_op = IT_LAYOUT;
5310 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5311 intent->li_opc == LAYOUT_INTENT_TRUNC)
5312 it.it_flags = FMODE_WRITE;
5314 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5315 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5317 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5318 &ll_md_blocking_ast, 0);
5319 if (it.it_request != NULL)
5320 ptlrpc_req_finished(it.it_request);
5321 it.it_request = NULL;
5323 ll_finish_md_op_data(op_data);
5325 /* set lock data in case this is a new lock */
5327 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5329 ll_intent_drop_lock(&it);
5335 * This function checks if there exists a LAYOUT lock on the client side,
5336 * or enqueues it if it doesn't have one in cache.
5338 * This function will not hold layout lock so it may be revoked any time after
5339 * this function returns. Any operation that depends on the layout should be redone
5342 * This function should be called before lov_io_init() to get an uptodate
5343 * layout version, the caller should save the version number and after IO
5344 * is finished, this function should be called again to verify that layout
5345 * is not changed during IO time.
5347 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5349 struct ll_inode_info *lli = ll_i2info(inode);
5350 struct ll_sb_info *sbi = ll_i2sbi(inode);
5351 struct lustre_handle lockh;
/* Default intent: read access; ll_layout_intent() upgrades the lock
 * mode only for write/truncate opcodes. */
5352 struct layout_intent intent = {
5353 .li_opc = LAYOUT_INTENT_ACCESS,
5355 enum ldlm_mode mode;
/* Fast path: if layout locks are disabled for this mount, or a valid
 * layout generation is already cached, there is nothing to refresh. */
5359 *gen = ll_layout_version_get(lli);
5360 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* Layout locks only apply to sane FIDs on regular files. */
5364 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5365 LASSERT(S_ISREG(inode->i_mode));
5367 /* take layout lock mutex to enqueue layout lock exclusively. */
5368 mutex_lock(&lli->lli_layout_mutex);
/* NOTE(review): upstream wraps the match/enqueue sequence below in a
 * retry loop (match cached lock, else issue the intent RPC and retry);
 * the loop construct line seems to be missing from this listing --
 * confirm against the tree. */
5371 /* mostly layout lock is caching on the local side, so try to
5372 * match it before grabbing layout lock mutex. */
5373 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5374 LCK_CR | LCK_CW | LCK_PR |
5376 if (mode != 0) { /* hit cached lock */
/* Apply the layout carried by the cached lock to the inode. */
5377 rc = ll_layout_lock_set(&lockh, mode, inode);
/* No cached lock: enqueue one from the MDS via an intent RPC. */
5383 rc = ll_layout_intent(inode, &intent);
/* Re-read the generation now that the layout is (re)configured. */
5389 *gen = ll_layout_version_get(lli);
5390 mutex_unlock(&lli->lli_layout_mutex);
5396 * Issue layout intent RPC indicating where in a file an IO is about to write.
5398 * \param[in] inode file inode.
5399 * \param[in] ext write range with start offset of file in bytes where
5400 * an IO is about to write, and exclusive end offset in
5403 * \retval 0 on success
5404 * \retval < 0 error code
5406 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5407 struct lu_extent *ext)
/* Copy the caller's write range into the on-the-wire intent.
 * NOTE(review): upstream also sets ".li_opc = opc," in this
 * initializer; that line appears to be missing from this listing --
 * confirm against the tree. */
5409 struct layout_intent intent = {
5411 .li_extent.e_start = ext->e_start,
5412 .li_extent.e_end = ext->e_end,
/* Hand the intent to the common enqueue path. */
5417 rc = ll_layout_intent(inode, &intent);
5423 * This function sends a restore request to the MDT
5425 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5427 struct hsm_user_request *hur;
5431 len = sizeof(struct hsm_user_request) +
5432 sizeof(struct hsm_user_item);
5433 OBD_ALLOC(hur, len);
5437 hur->hur_request.hr_action = HUA_RESTORE;
5438 hur->hur_request.hr_archive_id = 0;
5439 hur->hur_request.hr_flags = 0;
5440 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5441 sizeof(hur->hur_user_item[0].hui_fid));
5442 hur->hur_user_item[0].hui_extent.offset = offset;
5443 hur->hur_user_item[0].hui_extent.length = length;
5444 hur->hur_request.hr_itemcount = 1;
5445 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,