4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak the openhandle and request here on error, but there is not much
155 * to be done in the OOM case since the app won't retry the close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
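/* fall through: the MERGE case shares the split/swap handling below */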
166 case MDS_CLOSE_LAYOUT_SPLIT:
167 case MDS_CLOSE_LAYOUT_SWAP: {
168 struct split_param *sp = data;
170 LASSERT(data != NULL);
171 op_data->op_bias |= bias;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
174 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
175 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
176 op_data->op_mirror_id = sp->sp_mirror_id;
178 op_data->op_fid2 = *ll_inode2fid(data);
183 case MDS_CLOSE_RESYNC_DONE: {
184 struct ll_ioc_lease *ioc = data;
186 LASSERT(data != NULL);
187 op_data->op_attr_blocks +=
188 ioc->lil_count * op_data->op_attr_blocks;
189 op_data->op_attr.ia_valid |= ATTR_SIZE;
190 op_data->op_xvalid |= OP_XVALID_BLOCKS;
191 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_data = &ioc->lil_ids[0];
195 op_data->op_data_size =
196 ioc->lil_count * sizeof(ioc->lil_ids[0]);
200 case MDS_PCC_ATTACH: {
201 struct pcc_param *param = data;
203 LASSERT(data != NULL);
204 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
205 op_data->op_archive_id = param->pa_archive_id;
206 op_data->op_data_version = param->pa_data_version;
207 op_data->op_lease_handle = och->och_lease_handle;
211 case MDS_HSM_RELEASE:
212 LASSERT(data != NULL);
213 op_data->op_bias |= MDS_HSM_RELEASE;
214 op_data->op_data_version = *(__u64 *)data;
215 op_data->op_lease_handle = och->och_lease_handle;
216 op_data->op_attr.ia_valid |= ATTR_SIZE;
217 op_data->op_xvalid |= OP_XVALID_BLOCKS;
221 LASSERT(data == NULL);
225 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
226 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
227 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
228 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
230 rc = md_close(md_exp, op_data, och->och_mod, &req);
231 if (rc != 0 && rc != -EINTR)
232 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
233 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
235 if (rc == 0 && op_data->op_bias & bias) {
236 struct mdt_body *body;
238 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
239 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 if (bias & MDS_PCC_ATTACH) {
243 struct pcc_param *param = data;
245 param->pa_layout_gen = body->mbo_layout_gen;
249 ll_finish_md_op_data(op_data);
253 md_clear_open_replay_data(md_exp, och);
254 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
257 ptlrpc_req_finished(req); /* This is close request */
261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
263 struct ll_inode_info *lli = ll_i2info(inode);
264 struct obd_client_handle **och_p;
265 struct obd_client_handle *och;
270 if (fmode & FMODE_WRITE) {
271 och_p = &lli->lli_mds_write_och;
272 och_usecount = &lli->lli_open_fd_write_count;
273 } else if (fmode & FMODE_EXEC) {
274 och_p = &lli->lli_mds_exec_och;
275 och_usecount = &lli->lli_open_fd_exec_count;
277 LASSERT(fmode & FMODE_READ);
278 och_p = &lli->lli_mds_read_och;
279 och_usecount = &lli->lli_open_fd_read_count;
282 mutex_lock(&lli->lli_och_mutex);
283 if (*och_usecount > 0) {
284 /* There are still users of this handle, so skip
286 mutex_unlock(&lli->lli_och_mutex);
292 mutex_unlock(&lli->lli_och_mutex);
295 /* There might be a race and this handle may already
297 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
303 static int ll_md_close(struct inode *inode, struct file *file)
305 union ldlm_policy_data policy = {
306 .l_inodebits = { MDS_INODELOCK_OPEN },
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310 struct ll_inode_info *lli = ll_i2info(inode);
311 struct lustre_handle lockh;
312 enum ldlm_mode lockmode;
316 /* clear group lock, if present */
317 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
318 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
320 if (fd->fd_lease_och != NULL) {
323 /* Usually the lease is not released when the
324 * application crashes, so we need to release it here. */
325 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
326 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
327 PFID(&lli->lli_fid), rc, lease_broken);
329 fd->fd_lease_och = NULL;
332 if (fd->fd_och != NULL) {
333 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
338 /* Let's see if we have a good enough OPEN lock on the file and if
339 we can skip talking to the MDS */
340 mutex_lock(&lli->lli_och_mutex);
341 if (fd->fd_omode & FMODE_WRITE) {
343 LASSERT(lli->lli_open_fd_write_count);
344 lli->lli_open_fd_write_count--;
345 } else if (fd->fd_omode & FMODE_EXEC) {
347 LASSERT(lli->lli_open_fd_exec_count);
348 lli->lli_open_fd_exec_count--;
351 LASSERT(lli->lli_open_fd_read_count);
352 lli->lli_open_fd_read_count--;
354 mutex_unlock(&lli->lli_och_mutex);
356 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
357 LDLM_IBITS, &policy, lockmode, &lockh))
358 rc = ll_md_real_close(inode, fd->fd_omode);
361 LUSTRE_FPRIVATE(file) = NULL;
362 ll_file_data_put(fd);
367 /* While this returns an error code, the caller (fput()) ignores it, so we need
368 * to make every effort to clean up all of our state here. Also, applications
369 * rarely check close errors, and even if an error is returned they will not
370 * retry the close call.
372 int ll_file_release(struct inode *inode, struct file *file)
374 struct ll_file_data *fd;
375 struct ll_sb_info *sbi = ll_i2sbi(inode);
376 struct ll_inode_info *lli = ll_i2info(inode);
380 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
381 PFID(ll_inode2fid(inode)), inode);
383 if (inode->i_sb->s_root != file_dentry(file))
384 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
385 fd = LUSTRE_FPRIVATE(file);
388 /* The last ref on @file, maybe not the owner pid of statahead,
389 * because parent and child processes can share the same file handle. */
390 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
391 ll_deauthorize_statahead(inode, fd);
393 if (inode->i_sb->s_root == file_dentry(file)) {
394 LUSTRE_FPRIVATE(file) = NULL;
395 ll_file_data_put(fd);
399 pcc_file_release(inode, file);
401 if (!S_ISDIR(inode->i_mode)) {
402 if (lli->lli_clob != NULL)
403 lov_read_and_clear_async_rc(lli->lli_clob);
404 lli->lli_async_rc = 0;
407 rc = ll_md_close(inode, file);
409 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
410 libcfs_debug_dumplog();
415 static inline int ll_dom_readpage(void *data, struct page *page)
417 struct niobuf_local *lnb = data;
420 kaddr = ll_kmap_atomic(page, KM_USER0);
421 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
422 if (lnb->lnb_len < PAGE_SIZE)
423 memset(kaddr + lnb->lnb_len, 0,
424 PAGE_SIZE - lnb->lnb_len);
425 flush_dcache_page(page);
426 SetPageUptodate(page);
427 ll_kunmap_atomic(kaddr, KM_USER0);
433 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
434 struct lookup_intent *it)
436 struct ll_inode_info *lli = ll_i2info(inode);
437 struct cl_object *obj = lli->lli_clob;
438 struct address_space *mapping = inode->i_mapping;
440 struct niobuf_remote *rnb;
441 struct mdt_body *body;
443 unsigned long index, start;
444 struct niobuf_local lnb;
451 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
455 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
456 if (rnb == NULL || rnb->rnb_len == 0)
459 /* LU-11595: The server may return the whole file, which is always usable, or
460 * it may return just the file tail, whose offset must be aligned with the
461 * client PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE is
462 * smaller, the offset may be unaligned and that data is simply ignored.
464 if (rnb->rnb_offset % PAGE_SIZE)
467 /* The server returns the whole file, or just the file tail if it fills the reply
468 * buffer; in both cases the total size should be equal to the file size.
470 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
471 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
472 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
473 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
474 rnb->rnb_len, body->mbo_dom_size);
478 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
479 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
481 data = (char *)rnb + sizeof(*rnb);
483 lnb.lnb_file_offset = rnb->rnb_offset;
484 start = lnb.lnb_file_offset / PAGE_SIZE;
486 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
487 lnb.lnb_page_offset = 0;
489 lnb.lnb_data = data + (index << PAGE_SHIFT);
490 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
491 if (lnb.lnb_len > PAGE_SIZE)
492 lnb.lnb_len = PAGE_SIZE;
494 vmpage = read_cache_page(mapping, index + start,
495 ll_dom_readpage, &lnb);
496 if (IS_ERR(vmpage)) {
497 CWARN("%s: cannot fill page %lu for "DFID
498 " with data: rc = %li\n",
499 ll_i2sbi(inode)->ll_fsname, index + start,
500 PFID(lu_object_fid(&obj->co_lu)),
506 } while (rnb->rnb_len > (index << PAGE_SHIFT));
510 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
511 struct lookup_intent *itp)
513 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
514 struct dentry *parent = de->d_parent;
517 struct md_op_data *op_data;
518 struct ptlrpc_request *req = NULL;
522 LASSERT(parent != NULL);
523 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
525 /* if the server supports open-by-fid, or the file name is invalid, don't pack
526 * the name in the open request */
527 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
528 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
530 len = de->d_name.len;
531 name = kmalloc(len + 1, GFP_NOFS);
536 spin_lock(&de->d_lock);
537 if (len != de->d_name.len) {
538 spin_unlock(&de->d_lock);
542 memcpy(name, de->d_name.name, len);
544 spin_unlock(&de->d_lock);
546 if (!lu_name_is_valid_2(name, len)) {
552 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
553 name, len, 0, LUSTRE_OPC_ANY, NULL);
554 if (IS_ERR(op_data)) {
556 RETURN(PTR_ERR(op_data));
558 op_data->op_data = lmm;
559 op_data->op_data_size = lmmsize;
561 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
562 &ll_md_blocking_ast, 0);
564 ll_finish_md_op_data(op_data);
566 /* reason for keeping our own exit path: don't flood the log
567 * with -ESTALE error messages.
569 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
570 it_open_error(DISP_OPEN_OPEN, itp))
572 ll_release_openhandle(de, itp);
576 if (it_disposition(itp, DISP_LOOKUP_NEG))
577 GOTO(out, rc = -ENOENT);
579 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
580 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
581 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
585 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
587 if (!rc && itp->it_lock_mode) {
588 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
589 struct ldlm_lock *lock;
590 bool has_dom_bit = false;
592 /* If we got a lock back and it has a LOOKUP bit set,
593 * make sure the dentry is marked as valid so we can find it.
594 * We don't need to care about actual hashing since other parts
595 * of the kernel will deal with that later.
597 lock = ldlm_handle2lock(&handle);
599 has_dom_bit = ldlm_has_dom(lock);
600 if (lock->l_policy_data.l_inodebits.bits &
601 MDS_INODELOCK_LOOKUP)
602 d_lustre_revalidate(de);
606 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
608 ll_dom_finish_open(de->d_inode, req, itp);
612 ptlrpc_req_finished(req);
613 ll_intent_drop_lock(itp);
615 /* We did open by fid, but by the time we got to the server,
616 * the object disappeared. If this is a create, we cannot really
617 * tell userspace that the file it was trying to create
618 * does not exist. Instead let's return -ESTALE, and the VFS will
619 * retry the create with LOOKUP_REVAL, which we are going to catch
620 * in ll_revalidate_dentry() and handle with a lookup.
622 if (rc == -ENOENT && itp->it_op & IT_CREAT)
628 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
629 struct obd_client_handle *och)
631 struct mdt_body *body;
633 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
634 och->och_open_handle = body->mbo_open_handle;
635 och->och_fid = body->mbo_fid1;
636 och->och_lease_handle.cookie = it->it_lock_handle;
637 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
638 och->och_flags = it->it_flags;
640 return md_set_open_replay_data(md_exp, och, it);
643 static int ll_local_open(struct file *file, struct lookup_intent *it,
644 struct ll_file_data *fd, struct obd_client_handle *och)
646 struct inode *inode = file_inode(file);
649 LASSERT(!LUSTRE_FPRIVATE(file));
656 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
661 LUSTRE_FPRIVATE(file) = fd;
662 ll_readahead_init(inode, &fd->fd_ras);
663 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
665 /* initialize the ll_cl_context */
666 rwlock_init(&fd->fd_lock);
667 INIT_LIST_HEAD(&fd->fd_lccs);
672 /* Open a file, and (for the very first open) create objects on the OSTs at
673 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
674 * creation or open until ll_lov_setstripe() ioctl is called.
676 * If we already have the stripe MD locally then we don't request it in
677 * md_open(), by passing a lmm_size = 0.
679 * It is up to the application to ensure no other processes open this file
680 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
681 * used. We might be able to avoid races of that sort by getting lli_open_sem
682 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
683 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
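/*
 * For illustration only (not part of this file): a hedged sketch of the
 * userspace pattern the O_LOV_DELAY_CREATE note above describes, assuming
 * the O_LOV_DELAY_CREATE flag, the LL_IOC_LOV_SETSTRIPE ioctl and
 * struct lov_user_md from lustre_user.h:
 *
 *	struct lov_user_md_v1 lum = {
 *		.lmm_magic = LOV_USER_MAGIC_V1,
 *		.lmm_pattern = LOV_PATTERN_RAID0,
 *		.lmm_stripe_size = 1048576,		// 1 MiB stripes
 *		.lmm_stripe_count = 4,
 *		.lmm_stripe_offset = (__u16)-1,		// let the MDS choose
 *	};
 *	int fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
 *
 *	// OST objects are only created once the layout is set (or on first
 *	// write with the default striping if the ioctl is skipped)
 *	if (fd >= 0)
 *		ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
 */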
685 int ll_file_open(struct inode *inode, struct file *file)
687 struct ll_inode_info *lli = ll_i2info(inode);
688 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
689 .it_flags = file->f_flags };
690 struct obd_client_handle **och_p = NULL;
691 __u64 *och_usecount = NULL;
692 struct ll_file_data *fd;
696 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
697 PFID(ll_inode2fid(inode)), inode, file->f_flags);
699 it = file->private_data; /* XXX: compat macro */
700 file->private_data = NULL; /* prevent ll_local_open assertion */
702 fd = ll_file_data_get();
704 GOTO(out_nofiledata, rc = -ENOMEM);
707 if (S_ISDIR(inode->i_mode))
708 ll_authorize_statahead(inode, fd);
710 if (inode->i_sb->s_root == file_dentry(file)) {
711 LUSTRE_FPRIVATE(file) = fd;
715 if (!it || !it->it_disposition) {
716 /* Convert f_flags into access mode. We cannot use file->f_mode,
717 * because everything but O_ACCMODE mask was stripped from
719 if ((oit.it_flags + 1) & O_ACCMODE)
721 if (file->f_flags & O_TRUNC)
722 oit.it_flags |= FMODE_WRITE;
724 /* the kernel only calls f_op->open in dentry_open(). filp_open() calls
725 * dentry_open() after a call to open_namei() that checks permissions.
726 * Only nfsd_open() calls dentry_open() directly without checking
727 * permissions, and because of that the code below is safe.
729 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
730 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
732 /* We do not want O_EXCL here, presumably we opened the file
733 * already? XXX - NFS implications? */
734 oit.it_flags &= ~O_EXCL;
736 /* bug20584: if "it_flags" contains O_CREAT, the file will be
737 * created if necessary, so "IT_CREAT" should be set to stay
738 * consistent with it */
739 if (oit.it_flags & O_CREAT)
740 oit.it_op |= IT_CREAT;
746 /* Let's see if we have file open on MDS already. */
747 if (it->it_flags & FMODE_WRITE) {
748 och_p = &lli->lli_mds_write_och;
749 och_usecount = &lli->lli_open_fd_write_count;
750 } else if (it->it_flags & FMODE_EXEC) {
751 och_p = &lli->lli_mds_exec_och;
752 och_usecount = &lli->lli_open_fd_exec_count;
754 och_p = &lli->lli_mds_read_och;
755 och_usecount = &lli->lli_open_fd_read_count;
758 mutex_lock(&lli->lli_och_mutex);
759 if (*och_p) { /* Open handle is present */
760 if (it_disposition(it, DISP_OPEN_OPEN)) {
761 /* Well, there's an extra open request that we do not need,
762 let's close it somehow. This will decref the request. */
763 rc = it_open_error(DISP_OPEN_OPEN, it);
765 mutex_unlock(&lli->lli_och_mutex);
766 GOTO(out_openerr, rc);
769 ll_release_openhandle(file_dentry(file), it);
773 rc = ll_local_open(file, it, fd, NULL);
776 mutex_unlock(&lli->lli_och_mutex);
777 GOTO(out_openerr, rc);
780 LASSERT(*och_usecount == 0);
781 if (!it->it_disposition) {
782 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
783 /* We cannot just request a lock handle now; the new ELC code
784 means that one of the other OPEN locks for this file
785 could be cancelled, and since the blocking AST handler
786 would attempt to grab och_mutex as well, that would
787 result in a deadlock */
788 mutex_unlock(&lli->lli_och_mutex);
790 * Normally called under two situations:
792 * 2. A race/condition on MDS resulting in no open
793 * handle to be returned from LOOKUP|OPEN request,
794 * for example if the target entry was a symlink.
796 * Only fetch MDS_OPEN_LOCK if this is in the NFS path,
797 * marked by a bit set in ll_iget_for_nfs. Clear the
798 * bit so that it doesn't confuse later callers.
800 * NB: when ldd is NULL, it must have come via the normal
801 * lookup path only, since ll_iget_for_nfs always calls
804 if (ldd && ldd->lld_nfs_dentry) {
805 ldd->lld_nfs_dentry = 0;
806 it->it_flags |= MDS_OPEN_LOCK;
810 * Always specify MDS_OPEN_BY_FID because we don't want
811 * to get a file with a different fid.
813 it->it_flags |= MDS_OPEN_BY_FID;
814 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
817 GOTO(out_openerr, rc);
821 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
823 GOTO(out_och_free, rc = -ENOMEM);
827 /* md_intent_lock() didn't get a request ref if there was an
828 * open error, so don't do cleanup on the request here
830 /* XXX (green): Shouldn't we bail out on any error here, not
831 * just an open error? */
832 rc = it_open_error(DISP_OPEN_OPEN, it);
834 GOTO(out_och_free, rc);
836 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
837 "inode %p: disposition %x, status %d\n", inode,
838 it_disposition(it, ~0), it->it_status);
840 rc = ll_local_open(file, it, fd, *och_p);
842 GOTO(out_och_free, rc);
845 rc = pcc_file_open(inode, file);
847 GOTO(out_och_free, rc);
849 mutex_unlock(&lli->lli_och_mutex);
852 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where
853 a different kind of OPEN lock for this same inode gets cancelled
854 by ldlm_cancel_lru */
855 if (!S_ISREG(inode->i_mode))
856 GOTO(out_och_free, rc);
858 cl_lov_delay_create_clear(&file->f_flags);
859 GOTO(out_och_free, rc);
863 if (och_p && *och_p) {
864 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
865 *och_p = NULL; /* OBD_FREE writes some magic there */
868 mutex_unlock(&lli->lli_och_mutex);
871 if (lli->lli_opendir_key == fd)
872 ll_deauthorize_statahead(inode, fd);
875 ll_file_data_put(fd);
877 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
881 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
882 ptlrpc_req_finished(it->it_request);
883 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
889 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
890 struct ldlm_lock_desc *desc, void *data, int flag)
893 struct lustre_handle lockh;
897 case LDLM_CB_BLOCKING:
898 ldlm_lock2handle(lock, &lockh);
899 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
901 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
905 case LDLM_CB_CANCELING:
913 * When setting a lease on a file, we take ownership of the lli_mds_*_och
914 * and save it as fd->fd_och so as to force the client to reopen the file even
915 * if it already has an open lock in cache.
917 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
918 struct lustre_handle *old_open_handle)
920 struct ll_inode_info *lli = ll_i2info(inode);
921 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
922 struct obd_client_handle **och_p;
927 /* Get the openhandle of the file */
928 mutex_lock(&lli->lli_och_mutex);
929 if (fd->fd_lease_och != NULL)
930 GOTO(out_unlock, rc = -EBUSY);
932 if (fd->fd_och == NULL) {
933 if (file->f_mode & FMODE_WRITE) {
934 LASSERT(lli->lli_mds_write_och != NULL);
935 och_p = &lli->lli_mds_write_och;
936 och_usecount = &lli->lli_open_fd_write_count;
938 LASSERT(lli->lli_mds_read_och != NULL);
939 och_p = &lli->lli_mds_read_och;
940 och_usecount = &lli->lli_open_fd_read_count;
943 if (*och_usecount > 1)
944 GOTO(out_unlock, rc = -EBUSY);
951 *old_open_handle = fd->fd_och->och_open_handle;
955 mutex_unlock(&lli->lli_och_mutex);
960 * Release ownership on lli_mds_*_och when putting back a file lease.
962 static int ll_lease_och_release(struct inode *inode, struct file *file)
964 struct ll_inode_info *lli = ll_i2info(inode);
965 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
966 struct obd_client_handle **och_p;
967 struct obd_client_handle *old_och = NULL;
972 mutex_lock(&lli->lli_och_mutex);
973 if (file->f_mode & FMODE_WRITE) {
974 och_p = &lli->lli_mds_write_och;
975 och_usecount = &lli->lli_open_fd_write_count;
977 och_p = &lli->lli_mds_read_och;
978 och_usecount = &lli->lli_open_fd_read_count;
981 /* The file may have been opened by another process (broken lease) so
982 * *och_p is not NULL. In this case we should simply increase the usecount
985 if (*och_p != NULL) {
986 old_och = fd->fd_och;
993 mutex_unlock(&lli->lli_och_mutex);
996 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1002 * Acquire a lease and open the file.
1004 static struct obd_client_handle *
1005 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1008 struct lookup_intent it = { .it_op = IT_OPEN };
1009 struct ll_sb_info *sbi = ll_i2sbi(inode);
1010 struct md_op_data *op_data;
1011 struct ptlrpc_request *req = NULL;
1012 struct lustre_handle old_open_handle = { 0 };
1013 struct obd_client_handle *och = NULL;
1018 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1019 RETURN(ERR_PTR(-EINVAL));
1022 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1023 RETURN(ERR_PTR(-EPERM));
1025 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1027 RETURN(ERR_PTR(rc));
1032 RETURN(ERR_PTR(-ENOMEM));
1034 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1035 LUSTRE_OPC_ANY, NULL);
1036 if (IS_ERR(op_data))
1037 GOTO(out, rc = PTR_ERR(op_data));
1039 /* To tell the MDT this openhandle is from the same owner */
1040 op_data->op_open_handle = old_open_handle;
1042 it.it_flags = fmode | open_flags;
1043 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1044 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1045 &ll_md_blocking_lease_ast,
1046 /* LDLM_FL_NO_LRU: To not put the lease lock into the LRU list, otherwise
1047 * it can be cancelled, which may mislead applications into thinking the lease is
1049 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
1050 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast()
1051 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1052 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1053 ll_finish_md_op_data(op_data);
1054 ptlrpc_req_finished(req);
1056 GOTO(out_release_it, rc);
1058 if (it_disposition(&it, DISP_LOOKUP_NEG))
1059 GOTO(out_release_it, rc = -ENOENT);
1061 rc = it_open_error(DISP_OPEN_OPEN, &it);
1063 GOTO(out_release_it, rc);
1065 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1066 ll_och_fill(sbi->ll_md_exp, &it, och);
1068 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1069 GOTO(out_close, rc = -EOPNOTSUPP);
1071 /* lease already acquired, handle the lease lock */
1072 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1073 if (it.it_lock_mode == 0 ||
1074 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1075 /* an open lock must be returned for a lease */
1076 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1077 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1079 GOTO(out_close, rc = -EPROTO);
1082 ll_intent_release(&it);
1086 /* Cancel open lock */
1087 if (it.it_lock_mode != 0) {
1088 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1090 it.it_lock_mode = 0;
1091 och->och_lease_handle.cookie = 0ULL;
1093 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1095 CERROR("%s: error closing file "DFID": %d\n",
1096 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1097 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1099 ll_intent_release(&it);
1103 RETURN(ERR_PTR(rc));
1107 * Check whether a layout swap can be done between two inodes.
1109 * \param[in] inode1 First inode to check
1110 * \param[in] inode2 Second inode to check
1112 * \retval 0 on success, layout swap can be performed between both inodes
1113 * \retval negative error code if requirements are not met
1115 static int ll_check_swap_layouts_validity(struct inode *inode1,
1116 struct inode *inode2)
1118 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1121 if (inode_permission(inode1, MAY_WRITE) ||
1122 inode_permission(inode2, MAY_WRITE))
1125 if (inode1->i_sb != inode2->i_sb)
1131 static int ll_swap_layouts_close(struct obd_client_handle *och,
1132 struct inode *inode, struct inode *inode2)
1134 const struct lu_fid *fid1 = ll_inode2fid(inode);
1135 const struct lu_fid *fid2;
1139 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1140 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1142 rc = ll_check_swap_layouts_validity(inode, inode2);
1144 GOTO(out_free_och, rc);
1146 /* We now know that inode2 is a lustre inode */
1147 fid2 = ll_inode2fid(inode2);
1149 rc = lu_fid_cmp(fid1, fid2);
1151 GOTO(out_free_och, rc = -EINVAL);
1153 /* Close the file and {swap,merge} layouts between inode & inode2.
1154 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1155 * because we still need it to pack l_remote_handle to MDT. */
1156 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1159 och = NULL; /* freed in ll_close_inode_openhandle() */
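/*
 * For illustration only: a hedged userspace sketch of how a layout swap is
 * typically driven (roughly what "lfs swap_layouts" does), assuming the
 * LL_IOC_LOV_SWAP_LAYOUTS ioctl and struct lustre_swap_layouts from
 * lustre_user.h:
 *
 *	struct lustre_swap_layouts sl = {
 *		.sl_fd    = fd2,	// descriptor of the second file
 *		.sl_flags = 0,
 *		.sl_gid   = 0,
 *	};
 *	// swaps the layouts of fd1 and fd2; both must be regular files on
 *	// the same filesystem and writable by the caller (see
 *	// ll_check_swap_layouts_validity() above)
 *	ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &sl);
 */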
1169 * Release the lease and close the file.
1170 * It will check whether the lease was ever broken.
1172 static int ll_lease_close_intent(struct obd_client_handle *och,
1173 struct inode *inode,
1174 bool *lease_broken, enum mds_op_bias bias,
1177 struct ldlm_lock *lock;
1178 bool cancelled = true;
1182 lock = ldlm_handle2lock(&och->och_lease_handle);
1184 lock_res_and_lock(lock);
1185 cancelled = ldlm_is_cancel(lock);
1186 unlock_res_and_lock(lock);
1187 LDLM_LOCK_PUT(lock);
1190 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1191 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1193 if (lease_broken != NULL)
1194 *lease_broken = cancelled;
1196 if (!cancelled && !bias)
1197 ldlm_cli_cancel(&och->och_lease_handle, 0);
1199 if (cancelled) { /* no need to execute intent */
1204 rc = ll_close_inode_openhandle(inode, och, bias, data);
1208 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1211 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1215 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1217 static int ll_lease_file_resync(struct obd_client_handle *och,
1218 struct inode *inode, unsigned long arg)
1220 struct ll_sb_info *sbi = ll_i2sbi(inode);
1221 struct md_op_data *op_data;
1222 struct ll_ioc_lease_id ioc;
1223 __u64 data_version_unused;
1227 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1228 LUSTRE_OPC_ANY, NULL);
1229 if (IS_ERR(op_data))
1230 RETURN(PTR_ERR(op_data));
1232 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1236 /* before starting file resync, it's necessary to clean up the page cache
1237 * in client memory, otherwise once the layout version is increased,
1238 * writing back cached data will be denied by the OSTs. */
1239 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1243 op_data->op_lease_handle = och->och_lease_handle;
1244 op_data->op_mirror_id = ioc.lil_mirror_id;
1245 rc = md_file_resync(sbi->ll_md_exp, op_data);
1251 ll_finish_md_op_data(op_data);
1255 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1257 struct ll_inode_info *lli = ll_i2info(inode);
1258 struct cl_object *obj = lli->lli_clob;
1259 struct cl_attr *attr = vvp_env_thread_attr(env);
1267 ll_inode_size_lock(inode);
1269 /* Merge timestamps most recently obtained from the MDS with
1270 * timestamps obtained from the OSTs.
1272 * Do not overwrite the inode's atime because it may be refreshed
1273 * by the file_accessed() function. If the read was served from cached
1274 * data, there is no RPC to be sent, so the atime may not be
1275 * transferred to the OSTs at all. The MDT only updates atime at close time
1276 * if it's at least 'mdd.*.atime_diff' older.
1277 * All in all, atime in Lustre does not strictly comply with
1278 * POSIX. Solving this would require sending an RPC to the MDT for each
1279 * read, which would hurt performance.
1281 if (inode->i_atime.tv_sec < lli->lli_atime ||
1282 lli->lli_update_atime) {
1283 inode->i_atime.tv_sec = lli->lli_atime;
1284 lli->lli_update_atime = 0;
1286 inode->i_mtime.tv_sec = lli->lli_mtime;
1287 inode->i_ctime.tv_sec = lli->lli_ctime;
1289 mtime = inode->i_mtime.tv_sec;
1290 atime = inode->i_atime.tv_sec;
1291 ctime = inode->i_ctime.tv_sec;
1293 cl_object_attr_lock(obj);
1294 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1297 rc = cl_object_attr_get(env, obj, attr);
1298 cl_object_attr_unlock(obj);
1301 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1303 if (atime < attr->cat_atime)
1304 atime = attr->cat_atime;
1306 if (ctime < attr->cat_ctime)
1307 ctime = attr->cat_ctime;
1309 if (mtime < attr->cat_mtime)
1310 mtime = attr->cat_mtime;
1312 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1313 PFID(&lli->lli_fid), attr->cat_size);
1315 i_size_write(inode, attr->cat_size);
1316 inode->i_blocks = attr->cat_blocks;
1318 inode->i_mtime.tv_sec = mtime;
1319 inode->i_atime.tv_sec = atime;
1320 inode->i_ctime.tv_sec = ctime;
1323 ll_inode_size_unlock(inode);
1329 * Set the designated mirror for I/O.
1331 * So far only read, write, and truncate can issue I/O to a
1332 * designated mirror.
1334 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1336 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1338 /* clear the layout version for generic (non-resync) I/O in case it carries
1339 * a stale layout version due to an I/O restart */
1340 io->ci_layout_version = 0;
1342 /* FLR: disable non-delay for designated mirror I/O because obviously
1343 * only one mirror is available */
1344 if (fd->fd_designated_mirror > 0) {
1346 io->ci_designated_mirror = fd->fd_designated_mirror;
1347 io->ci_layout_version = fd->fd_layout_version;
1350 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1351 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1354 static bool file_is_noatime(const struct file *file)
1356 const struct vfsmount *mnt = file->f_path.mnt;
1357 const struct inode *inode = file_inode((struct file *)file);
1359 /* Adapted from file_accessed() and touch_atime().*/
1360 if (file->f_flags & O_NOATIME)
1363 if (inode->i_flags & S_NOATIME)
1366 if (IS_NOATIME(inode))
1369 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1372 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1375 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1381 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1383 struct inode *inode = file_inode(file);
1384 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1386 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1387 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1389 if (iot == CIT_WRITE) {
1390 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1391 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1392 file->f_flags & O_DIRECT ||
1395 io->ci_obj = ll_i2info(inode)->lli_clob;
1396 io->ci_lockreq = CILR_MAYBE;
1397 if (ll_file_nolock(file)) {
1398 io->ci_lockreq = CILR_NEVER;
1399 io->ci_no_srvlock = 1;
1400 } else if (file->f_flags & O_APPEND) {
1401 io->ci_lockreq = CILR_MANDATORY;
1403 io->ci_noatime = file_is_noatime(file);
1404 io->ci_async_readahead = false;
1406 /* FLR: only use non-delay I/O for read as there is only one
1407 * available mirror for write. */
1408 io->ci_ndelay = !(iot == CIT_WRITE);
1410 ll_io_set_mirror(io, file);
1413 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1416 struct ll_inode_info *lli = ll_i2info(inode);
1417 struct ll_sb_info *sbi = ll_i2sbi(inode);
1418 enum obd_heat_type sample_type;
1419 enum obd_heat_type iobyte_type;
1420 __u64 now = ktime_get_real_seconds();
1422 if (!ll_sbi_has_file_heat(sbi) ||
1423 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1426 if (iot == CIT_READ) {
1427 sample_type = OBD_HEAT_READSAMPLE;
1428 iobyte_type = OBD_HEAT_READBYTE;
1429 } else if (iot == CIT_WRITE) {
1430 sample_type = OBD_HEAT_WRITESAMPLE;
1431 iobyte_type = OBD_HEAT_WRITEBYTE;
1436 spin_lock(&lli->lli_heat_lock);
1437 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1438 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1439 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1440 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1441 spin_unlock(&lli->lli_heat_lock);
1445 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1446 struct file *file, enum cl_io_type iot,
1447 loff_t *ppos, size_t count)
1449 struct vvp_io *vio = vvp_env_io(env);
1450 struct inode *inode = file_inode(file);
1451 struct ll_inode_info *lli = ll_i2info(inode);
1452 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1453 struct range_lock range;
1457 unsigned retried = 0;
1458 bool restarted = false;
1462 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1463 file_dentry(file)->d_name.name,
1464 iot == CIT_READ ? "read" : "write", *ppos, count);
1467 io = vvp_env_thread_io(env);
1468 ll_io_init(io, file, iot);
1469 io->ci_ndelay_tried = retried;
1471 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1472 bool range_locked = false;
1474 if (file->f_flags & O_APPEND)
1475 range_lock_init(&range, 0, LUSTRE_EOF);
1477 range_lock_init(&range, *ppos, *ppos + count - 1);
1479 vio->vui_fd = LUSTRE_FPRIVATE(file);
1480 vio->vui_io_subtype = args->via_io_subtype;
1482 switch (vio->vui_io_subtype) {
1484 vio->vui_iter = args->u.normal.via_iter;
1485 vio->vui_iocb = args->u.normal.via_iocb;
1486 /* Direct IO reads must also take the range lock,
1487 * or multiple reads will try to work on the same pages.
1488 * See LU-6227 for details. */
1489 if (((iot == CIT_WRITE) ||
1490 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1491 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1492 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1494 rc = range_lock(&lli->lli_write_tree, &range);
1498 range_locked = true;
1502 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1503 vio->u.splice.vui_flags = args->u.splice.via_flags;
1506 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1510 ll_cl_add(file, env, io, LCC_RW);
1511 rc = cl_io_loop(env, io);
1512 ll_cl_remove(file, env);
1515 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1517 range_unlock(&lli->lli_write_tree, &range);
1520 /* cl_io_rw_init() handled IO */
1524 if (io->ci_nob > 0) {
1525 result += io->ci_nob;
1526 count -= io->ci_nob;
1527 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1529 /* prepare IO restart */
1530 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1531 args->u.normal.via_iter = vio->vui_iter;
1534 cl_io_fini(env, io);
1537 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1538 file->f_path.dentry->d_name.name,
1539 iot, rc, result, io->ci_need_restart);
1541 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1543 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1544 file_dentry(file)->d_name.name,
1545 iot == CIT_READ ? "read" : "write",
1546 *ppos, count, result, rc);
1547 /* preserve the tried count for FLR */
1548 retried = io->ci_ndelay_tried;
1553 if (iot == CIT_READ) {
1555 ll_stats_ops_tally(ll_i2sbi(inode),
1556 LPROC_LL_READ_BYTES, result);
1557 } else if (iot == CIT_WRITE) {
1559 ll_stats_ops_tally(ll_i2sbi(inode),
1560 LPROC_LL_WRITE_BYTES, result);
1561 fd->fd_write_failed = false;
1562 } else if (result == 0 && rc == 0) {
1565 fd->fd_write_failed = true;
1567 fd->fd_write_failed = false;
1568 } else if (rc != -ERESTARTSYS) {
1569 fd->fd_write_failed = true;
1573 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1575 ll_heat_add(inode, iot, result);
1577 RETURN(result > 0 ? result : rc);
1581 * The purpose of fast read is to overcome the per-I/O overhead and improve IOPS,
1582 * especially for small I/O.
1584 * To serve a read request, CLIO has to create and initialize a cl_io and
1585 * then request a DLM lock. This has turned out to have significant overhead
1586 * and affects the performance of small I/O dramatically.
1588 * It's not necessary to create a cl_io for each I/O. With the help of read
1589 * ahead, most of the pages being read are already in the memory cache and we can
1590 * read those pages directly because if the pages exist, the corresponding DLM
1591 * lock must exist, so the page content must be valid.
1593 * In the fast read implementation, llite speculatively finds and reads pages
1594 * in the memory cache. There are three scenarios for fast read:
1595 * - If the page exists and is uptodate, the kernel VM will provide the data and
1596 * CLIO won't be involved;
1597 * - If the page was brought into memory by read ahead, it will be exported
1598 * and the read ahead parameters will be updated;
1599 * - Otherwise the page is not in memory and we can't do fast read. Therefore,
1600 * it will go back and invoke a normal read, i.e., a cl_io will be created
1601 * and a DLM lock will be requested.
1603 * POSIX compliance: the POSIX standard states that read is intended to be atomic.
1604 * The Lustre read implementation is in line with the Linux kernel read implementation,
1605 * and neither of them complies with the POSIX standard in this matter. Fast read
1606 * doesn't make the situation worse on a single node, but it may interleave write
1607 * results from multiple nodes due to short read handling in ll_file_aio_read().
1609 * \param env - lu_env
1610 * \param iocb - kiocb from kernel
1611 * \param iter - user space buffers where the data will be copied
1613 * \retval - number of bytes read, or an error code if an error occurred.
1616 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1620 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1623 /* NB: we can't do direct IO for fast read because it will need a lock
1624 * to make IO engine happy. */
1625 if (iocb->ki_filp->f_flags & O_DIRECT)
1628 result = generic_file_read_iter(iocb, iter);
1630 /* If the first page is not in the cache, generic_file_aio_read() will
1631 * return -ENODATA.
1632 * See the corresponding code in ll_readpage(). */
1633 if (result == -ENODATA)
1637 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1638 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1639 LPROC_LL_READ_BYTES, result);
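/*
 * Illustration (hedged paraphrase of the caller below): ll_file_read_iter()
 * first tries the fast path and only builds a full cl_io when the fast path
 * could not consume the whole iterator:
 *
 *	result = ll_do_fast_read(iocb, to);
 *	if (result < 0 || iov_iter_count(to) == 0)
 *		return result;	// fast path satisfied the read (or failed)
 *	// otherwise fall through to the normal cl_io read path
 */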
1646 * Read from a file (through the page cache).
1648 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1651 struct vvp_io_args *args;
1657 if (!iov_iter_count(to))
1661 * Currently, when a PCC read fails, we do not fall back to the
1662 * normal read path, but just return the error.
1663 * The reason is that, for RW-PCC, the file data may have been modified
1664 * in the PCC and be inconsistent with the data on the OSTs (or the file
1665 * data may have been removed from the Lustre file system); in that
1666 * case, falling back to the normal read path may read the wrong
1668 * TODO: for RO-PCC (readonly PCC), fall back to the normal read
1669 * path: read data from the data copy on the OSTs.
1671 result = pcc_file_read_iter(iocb, to, &cached);
1675 ll_ras_enter(iocb->ki_filp);
1677 result = ll_do_fast_read(iocb, to);
1678 if (result < 0 || iov_iter_count(to) == 0)
1681 env = cl_env_get(&refcheck);
1683 return PTR_ERR(env);
1685 args = ll_env_args(env, IO_NORMAL);
1686 args->u.normal.via_iter = to;
1687 args->u.normal.via_iocb = iocb;
1689 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1690 &iocb->ki_pos, iov_iter_count(to));
1693 else if (result == 0)
1696 cl_env_put(env, &refcheck);
1702 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1703 * If a page is already in the page cache and dirty (and some other things -
1704 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1705 * write to it without doing a full I/O, because Lustre already knows about it
1706 * and will write it out. This saves a lot of processing time.
1708 * All writes here are within one page, so exclusion is handled by the page
1709 * lock on the vm page. We do not do tiny writes for writes which touch
1710 * multiple pages because it's very unlikely that multiple sequential pages
1711 * are already dirty.
1713 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1714 * and are unlikely to target already-dirty pages.
1716 * Attribute updates are important here, we do them in ll_tiny_write_end.
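/*
 * Worked example of the size restriction enforced below (assuming
 * PAGE_SIZE is 4096): a 100-byte write at offset 8000 stays within one
 * page ((8000 & 4095) + 100 = 4004 <= 4096) and may use the tiny-write
 * path, while the same write at offset 4090 crosses into the next page
 * ((4090 & 4095) + 100 = 4190 > 4096) and falls back to the normal path.
 */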
1718 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1720 ssize_t count = iov_iter_count(iter);
1721 struct file *file = iocb->ki_filp;
1722 struct inode *inode = file_inode(file);
1723 bool lock_inode = !IS_NOSEC(inode);
1728 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1729 * of function for why.
1731 if (count >= PAGE_SIZE ||
1732 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1735 if (unlikely(lock_inode))
1737 result = __generic_file_write_iter(iocb, iter);
1739 if (unlikely(lock_inode))
1740 inode_unlock(inode);
1742 /* If the page is not already dirty, ll_tiny_write_begin returns
1743 * -ENODATA. We continue on to normal write.
1745 if (result == -ENODATA)
1749 ll_heat_add(inode, CIT_WRITE, result);
1750 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1752 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1755 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1761 * Write to a file (through the page cache).
1763 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1765 struct vvp_io_args *args;
1767 ssize_t rc_tiny = 0, rc_normal;
1774 if (!iov_iter_count(from))
1775 GOTO(out, rc_normal = 0);
1778 * When a PCC write fails, we usually do not fall back to the normal
1779 * write path, but just return the error. There is a special case, however,
1780 * when the returned error code is -ENOSPC due to running out of space on the
1781 * PCC HSM backend. In that case, it falls back to the normal I/O path and
1782 * retries the I/O. As the file is in the HSM released state, it will restore
1783 * the file data to the OSTs first and then redo the write. The
1784 * restore process will revoke the layout lock and detach the file
1785 * from the PCC cache automatically.
1787 result = pcc_file_write_iter(iocb, from, &cached);
1788 if (cached && result != -ENOSPC && result != -EDQUOT)
1791 /* NB: we can't do direct IO for tiny writes because they use the page
1792 * cache, we can't do sync writes because tiny writes can't flush
1793 * pages, and we can't do append writes because we can't guarantee the
1794 * required DLM locks are held to protect file size.
1796 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1797 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1798 rc_tiny = ll_do_tiny_write(iocb, from);
1800 /* In case of error, go on and try the normal write; only stop if the tiny
1801 * write completed the I/O.
1803 if (iov_iter_count(from) == 0)
1804 GOTO(out, rc_normal = rc_tiny);
1806 env = cl_env_get(&refcheck);
1808 return PTR_ERR(env);
1810 args = ll_env_args(env, IO_NORMAL);
1811 args->u.normal.via_iter = from;
1812 args->u.normal.via_iocb = iocb;
1814 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1815 &iocb->ki_pos, iov_iter_count(from));
1817 /* On success, combine bytes written. */
1818 if (rc_tiny >= 0 && rc_normal > 0)
1819 rc_normal += rc_tiny;
1820 /* On error, only return error from normal write if tiny write did not
1821 * write any bytes. Otherwise return bytes written by tiny write.
1823 else if (rc_tiny > 0)
1824 rc_normal = rc_tiny;
1826 cl_env_put(env, &refcheck);
1831 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1833 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1835 static int ll_file_get_iov_count(const struct iovec *iov,
1836 unsigned long *nr_segs, size_t *count)
1841 for (seg = 0; seg < *nr_segs; seg++) {
1842 const struct iovec *iv = &iov[seg];
1845 * If any segment has a negative length, or the cumulative
1846 * length ever wraps negative then return -EINVAL.
1849 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1851 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1856 cnt -= iv->iov_len; /* This segment is no good */
1863 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1864 unsigned long nr_segs, loff_t pos)
1871 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1878 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1879 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1880 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1881 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1882 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1884 result = ll_file_read_iter(iocb, &to);
1889 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1892 struct iovec iov = { .iov_base = buf, .iov_len = count };
1901 init_sync_kiocb(&kiocb, file);
1902 kiocb.ki_pos = *ppos;
1903 #ifdef HAVE_KIOCB_KI_LEFT
1904 kiocb.ki_left = count;
1905 #elif defined(HAVE_KI_NBYTES)
1906 kiocb.ki_nbytes = count;
1909 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1910 *ppos = kiocb.ki_pos;
1916 * Write to a file (through the page cache).
1919 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1920 unsigned long nr_segs, loff_t pos)
1922 struct iov_iter from;
1927 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1934 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1935 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1936 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1937 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1938 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1940 result = ll_file_write_iter(iocb, &from);
1945 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1946 size_t count, loff_t *ppos)
1948 struct iovec iov = { .iov_base = (void __user *)buf,
1958 init_sync_kiocb(&kiocb, file);
1959 kiocb.ki_pos = *ppos;
1960 #ifdef HAVE_KIOCB_KI_LEFT
1961 kiocb.ki_left = count;
1962 #elif defined(HAVE_KI_NBYTES)
1963 kiocb.ki_nbytes = count;
1966 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1967 *ppos = kiocb.ki_pos;
1971 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1974 * Send file content (through pagecache) somewhere with helper
1976 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1977 struct pipe_inode_info *pipe, size_t count,
1981 struct vvp_io_args *args;
1988 result = pcc_file_splice_read(in_file, ppos, pipe,
1989 count, flags, &cached);
1993 ll_ras_enter(in_file);
1995 env = cl_env_get(&refcheck);
1997 RETURN(PTR_ERR(env));
1999 args = ll_env_args(env, IO_SPLICE);
2000 args->u.splice.via_pipe = pipe;
2001 args->u.splice.via_flags = flags;
2003 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2004 cl_env_put(env, &refcheck);
2008 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2009 __u64 flags, struct lov_user_md *lum, int lum_size)
2011 struct lookup_intent oit = {
2013 .it_flags = flags | MDS_OPEN_BY_FID,
2018 ll_inode_size_lock(inode);
2019 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2021 GOTO(out_unlock, rc);
2023 ll_release_openhandle(dentry, &oit);
2026 ll_inode_size_unlock(inode);
2027 ll_intent_release(&oit);
2032 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2033 struct lov_mds_md **lmmp, int *lmm_size,
2034 struct ptlrpc_request **request)
2036 struct ll_sb_info *sbi = ll_i2sbi(inode);
2037 struct mdt_body *body;
2038 struct lov_mds_md *lmm = NULL;
2039 struct ptlrpc_request *req = NULL;
2040 struct md_op_data *op_data;
2043 rc = ll_get_default_mdsize(sbi, &lmmsize);
2047 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2048 strlen(filename), lmmsize,
2049 LUSTRE_OPC_ANY, NULL);
2050 if (IS_ERR(op_data))
2051 RETURN(PTR_ERR(op_data));
2053 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2054 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2055 ll_finish_md_op_data(op_data);
2057 CDEBUG(D_INFO, "md_getattr_name failed "
2058 "on %s: rc %d\n", filename, rc);
2062 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2063 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2065 lmmsize = body->mbo_eadatasize;
2067 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2069 GOTO(out, rc = -ENODATA);
2072 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2073 LASSERT(lmm != NULL);
2075 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2076 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2077 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2078 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2079 GOTO(out, rc = -EPROTO);
2082 * This is coming from the MDS, so is probably in
2083 * little endian. We convert it to host endian before
2084 * passing it to userspace.
2086 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2089 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2090 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2091 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2092 if (le32_to_cpu(lmm->lmm_pattern) &
2093 LOV_PATTERN_F_RELEASED)
2097 /* if the function is called for a directory, we should
2098 * avoid swabbing non-existent lsm objects */
2099 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2100 lustre_swab_lov_user_md_v1(
2101 (struct lov_user_md_v1 *)lmm);
2102 if (S_ISREG(body->mbo_mode))
2103 lustre_swab_lov_user_md_objects(
2104 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2106 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2107 lustre_swab_lov_user_md_v3(
2108 (struct lov_user_md_v3 *)lmm);
2109 if (S_ISREG(body->mbo_mode))
2110 lustre_swab_lov_user_md_objects(
2111 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2113 } else if (lmm->lmm_magic ==
2114 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2115 lustre_swab_lov_comp_md_v1(
2116 (struct lov_comp_md_v1 *)lmm);
2117 } else if (lmm->lmm_magic ==
2118 cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2119 struct lov_foreign_md *lfm;
2121 lfm = (struct lov_foreign_md *)lmm;
2122 __swab32s(&lfm->lfm_magic);
2123 __swab32s(&lfm->lfm_length);
2124 __swab32s(&lfm->lfm_type);
2125 __swab32s(&lfm->lfm_flags);
2131 *lmm_size = lmmsize;
2136 static int ll_lov_setea(struct inode *inode, struct file *file,
2139 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2140 struct lov_user_md *lump;
2141 int lum_size = sizeof(struct lov_user_md) +
2142 sizeof(struct lov_user_ost_data);
2146 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2149 OBD_ALLOC_LARGE(lump, lum_size);
2153 if (copy_from_user(lump, arg, lum_size))
2154 GOTO(out_lump, rc = -EFAULT);
2156 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2158 cl_lov_delay_create_clear(&file->f_flags);
2161 OBD_FREE_LARGE(lump, lum_size);
2165 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2172 env = cl_env_get(&refcheck);
2174 RETURN(PTR_ERR(env));
2176 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2177 cl_env_put(env, &refcheck);
2181 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2184 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2185 struct lov_user_md *klum;
2187 __u64 flags = FMODE_WRITE;
2190 rc = ll_copy_user_md(lum, &klum);
2195 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2200 rc = put_user(0, &lum->lmm_stripe_count);
2204 rc = ll_layout_refresh(inode, &gen);
2208 rc = ll_file_getstripe(inode, arg, lum_size);
2210 cl_lov_delay_create_clear(&file->f_flags);
2213 OBD_FREE(klum, lum_size);
2218 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2220 struct ll_inode_info *lli = ll_i2info(inode);
2221 struct cl_object *obj = lli->lli_clob;
2222 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2223 struct ll_grouplock grouplock;
2228 CWARN("group id for group lock must not be 0\n");
2232 if (ll_file_nolock(file))
2233 RETURN(-EOPNOTSUPP);
2235 spin_lock(&lli->lli_lock);
2236 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2237 CWARN("group lock already existed with gid %lu\n",
2238 fd->fd_grouplock.lg_gid);
2239 spin_unlock(&lli->lli_lock);
2242 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2243 spin_unlock(&lli->lli_lock);
2246 * XXX: the group lock needs to protect all OST objects while PFL
2247 * can add new OST objects during the IO, so we instantiate
2248 * all OST objects before taking the group lock.
2253 struct cl_layout cl = {
2254 .cl_is_composite = false,
2256 struct lu_extent ext = {
2258 .e_end = OBD_OBJECT_EOF,
2261 env = cl_env_get(&refcheck);
2263 RETURN(PTR_ERR(env));
2265 rc = cl_object_layout_get(env, obj, &cl);
2266 if (!rc && cl.cl_is_composite)
2267 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2270 cl_env_put(env, &refcheck);
2275 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2276 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2280 spin_lock(&lli->lli_lock);
2281 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2282 spin_unlock(&lli->lli_lock);
2283 CERROR("another thread just won the race\n");
2284 cl_put_grouplock(&grouplock);
2288 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2289 fd->fd_grouplock = grouplock;
2290 spin_unlock(&lli->lli_lock);
2292 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2296 static int ll_put_grouplock(struct inode *inode, struct file *file,
2299 struct ll_inode_info *lli = ll_i2info(inode);
2300 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2301 struct ll_grouplock grouplock;
2304 spin_lock(&lli->lli_lock);
2305 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2306 spin_unlock(&lli->lli_lock);
2307 CWARN("no group lock held\n");
2311 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2313 if (fd->fd_grouplock.lg_gid != arg) {
2314 CWARN("group lock %lu doesn't match current id %lu\n",
2315 arg, fd->fd_grouplock.lg_gid);
2316 spin_unlock(&lli->lli_lock);
2320 grouplock = fd->fd_grouplock;
2321 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2322 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2323 spin_unlock(&lli->lli_lock);
2325 cl_put_grouplock(&grouplock);
2326 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2331 * Close inode open handle
2333 * \param dentry [in] dentry which contains the inode
2334 * \param it [in,out] intent which contains open info and result
2337 * \retval <0 failure
2339 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2341 struct inode *inode = dentry->d_inode;
2342 struct obd_client_handle *och;
2348 /* Root ? Do nothing. */
2349 if (dentry->d_inode->i_sb->s_root == dentry)
2352 /* No open handle to close? Move away */
2353 if (!it_disposition(it, DISP_OPEN_OPEN))
2356 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2358 OBD_ALLOC(och, sizeof(*och));
2360 GOTO(out, rc = -ENOMEM);
2362 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2364 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2366 /* this one is in place of ll_file_open */
2367 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2368 ptlrpc_req_finished(it->it_request);
2369 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2375 * Get size for the inode for which the FIEMAP mapping is requested.
2376 * Make the FIEMAP get_info call and return the result.
2377 * \param fiemap kernel buffer to hold the extents
2378 * \param num_bytes kernel buffer size
2380 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2386 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2389 /* Checks for fiemap flags */
2390 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2391 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2395 /* Check for FIEMAP_FLAG_SYNC */
2396 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2397 rc = filemap_fdatawrite(inode->i_mapping);
2402 env = cl_env_get(&refcheck);
2404 RETURN(PTR_ERR(env));
2406 if (i_size_read(inode) == 0) {
2407 rc = ll_glimpse_size(inode);
2412 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2413 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2414 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2416 /* If filesize is 0, then there would be no objects for mapping */
2417 if (fmkey.lfik_oa.o_size == 0) {
2418 fiemap->fm_mapped_extents = 0;
2422 fmkey.lfik_fiemap = *fiemap;
2424 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2425 &fmkey, fiemap, &num_bytes);
2427 cl_env_put(env, &refcheck);
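/*
 * Handle OBD_IOC_FID2PATH: ask the MDT to translate a FID into a path
 * and copy the result back to the userspace buffer @arg.
 */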
2431 int ll_fid2path(struct inode *inode, void __user *arg)
2433 struct obd_export *exp = ll_i2mdexp(inode);
2434 const struct getinfo_fid2path __user *gfin = arg;
2436 struct getinfo_fid2path *gfout;
2442 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2443 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2446 /* Only need to get the buflen */
2447 if (get_user(pathlen, &gfin->gf_pathlen))
2450 if (pathlen > PATH_MAX)
2453 outsize = sizeof(*gfout) + pathlen;
2454 OBD_ALLOC(gfout, outsize);
2458 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2459 GOTO(gf_free, rc = -EFAULT);
2460 /* append the root FID after gfout to let the MDT know the root FID so
2461 * that it can look up the correct path; this is mainly for filesets.
2462 * Old servers without fileset mount support will ignore this. */
2463 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2465 /* Call mdc_iocontrol */
2466 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2470 if (copy_to_user(arg, gfout, outsize))
2474 OBD_FREE(gfout, outsize);
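/*
 * Retrieve the current data version and layout version of the file by
 * running a CIT_DATA_VERSION io against its cl_object.
 */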
2479 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2481 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2489 ioc->idv_version = 0;
2490 ioc->idv_layout_version = UINT_MAX;
2492 /* If no file object has been initialized, we consider its version to be 0. */
2496 env = cl_env_get(&refcheck);
2498 RETURN(PTR_ERR(env));
2500 io = vvp_env_thread_io(env);
2502 io->u.ci_data_version.dv_data_version = 0;
2503 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2504 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2507 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2508 result = cl_io_loop(env, io);
2510 result = io->ci_result;
2512 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2513 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2515 cl_io_fini(env, io);
2517 if (unlikely(io->ci_need_restart))
2520 cl_env_put(env, &refcheck);
2526 * Read the data_version for the inode.
2528 * This value is computed using the stripe object versions on the OSTs.
2529 * The version is computed using server-side locking.
2531 * @param flags whether to sync on the OST side:
2533 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2534 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2536 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2538 struct ioc_data_version ioc = { .idv_flags = flags };
2541 rc = ll_ioc_data_version(inode, &ioc);
2543 *data_version = ioc.idv_version;
2549 * Trigger a HSM release request for the provided inode.
2551 int ll_hsm_release(struct inode *inode)
2554 struct obd_client_handle *och = NULL;
2555 __u64 data_version = 0;
2560 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2561 ll_i2sbi(inode)->ll_fsname,
2562 PFID(&ll_i2info(inode)->lli_fid));
2564 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2566 GOTO(out, rc = PTR_ERR(och));
2568 /* Grab latest data_version and [am]time values */
2569 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2573 env = cl_env_get(&refcheck);
2575 GOTO(out, rc = PTR_ERR(env));
2577 rc = ll_merge_attr(env, inode);
2578 cl_env_put(env, &refcheck);
2580 /* If an error happens, we have the wrong size for the file.
2586 /* Release the file.
2587 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2588 * we still need it to pack l_remote_handle to MDT. */
2589 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2595 if (och != NULL && !IS_ERR(och)) /* close the file */
2596 ll_lease_close(och, inode, NULL);
2601 struct ll_swap_stack {
2604 struct inode *inode1;
2605 struct inode *inode2;
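/*
 * Swap the layouts of @file1 and @file2. A group lock is taken when a
 * non-zero group id is given, and the data versions are verified when
 * SWAP_LAYOUTS_CHECK_DV{1,2} is requested, before the MDT performs the
 * actual swap.
 */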
2610 static int ll_swap_layouts(struct file *file1, struct file *file2,
2611 struct lustre_swap_layouts *lsl)
2613 struct mdc_swap_layouts msl;
2614 struct md_op_data *op_data;
2617 struct ll_swap_stack *llss = NULL;
2620 OBD_ALLOC_PTR(llss);
2624 llss->inode1 = file_inode(file1);
2625 llss->inode2 = file_inode(file2);
2627 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2631 /* we use 2 bools because they are easier to swap than 2 bits */
2632 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2633 llss->check_dv1 = true;
2635 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2636 llss->check_dv2 = true;
2638 /* we cannot use lsl->sl_dvX directly because we may swap them */
2639 llss->dv1 = lsl->sl_dv1;
2640 llss->dv2 = lsl->sl_dv2;
2642 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2643 if (rc == 0) /* same file, done! */
2646 if (rc < 0) { /* sequentialize it */
2647 swap(llss->inode1, llss->inode2);
2649 swap(llss->dv1, llss->dv2);
2650 swap(llss->check_dv1, llss->check_dv2);
2654 if (gid != 0) { /* application asks to flush dirty cache */
2655 rc = ll_get_grouplock(llss->inode1, file1, gid);
2659 rc = ll_get_grouplock(llss->inode2, file2, gid);
2661 ll_put_grouplock(llss->inode1, file1, gid);
2666 /* ultimate check: before swapping the layouts we check whether the
2667 * data version has changed (if requested) */
2668 if (llss->check_dv1) {
2669 rc = ll_data_version(llss->inode1, &dv, 0);
2672 if (dv != llss->dv1)
2673 GOTO(putgl, rc = -EAGAIN);
2676 if (llss->check_dv2) {
2677 rc = ll_data_version(llss->inode2, &dv, 0);
2680 if (dv != llss->dv2)
2681 GOTO(putgl, rc = -EAGAIN);
2684 /* struct md_op_data is used to send the swap args to the mdt;
2685 * only flags is missing, so we use struct mdc_swap_layouts
2686 * through md_op_data->op_data */
2687 /* flags from user space have to be converted before they are sent to
2688 * the server; no flag is sent today, they are only used on the client */
2691 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2692 0, LUSTRE_OPC_ANY, &msl);
2693 if (IS_ERR(op_data))
2694 GOTO(free, rc = PTR_ERR(op_data));
2696 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2697 sizeof(*op_data), op_data, NULL);
2698 ll_finish_md_op_data(op_data);
2705 ll_put_grouplock(llss->inode2, file2, gid);
2706 ll_put_grouplock(llss->inode1, file1, gid);
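/*
 * Set and/or clear HSM flags on @inode as described by @hss, after
 * validating the flag masks and archive id, by sending
 * LL_IOC_HSM_STATE_SET to the MDT.
 */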
2716 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2718 struct obd_export *exp = ll_i2mdexp(inode);
2719 struct md_op_data *op_data;
2723 /* Detect out-of-range masks */
2724 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2727 /* Non-root users are forbidden to set or clear flags which are
2728 * NOT defined in HSM_USER_MASK. */
2729 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2730 !cfs_capable(CFS_CAP_SYS_ADMIN))
2733 if (!exp_connect_archive_id_array(exp)) {
2734 /* Detect out-of-range archive id */
2735 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2736 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2740 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2741 LUSTRE_OPC_ANY, hss);
2742 if (IS_ERR(op_data))
2743 RETURN(PTR_ERR(op_data));
2745 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2748 ll_finish_md_op_data(op_data);
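/*
 * HSM import: mark the file as archived, existing and released, then
 * restore the mode, ownership, size and timestamps recorded in @hui.
 */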
2753 static int ll_hsm_import(struct inode *inode, struct file *file,
2754 struct hsm_user_import *hui)
2756 struct hsm_state_set *hss = NULL;
2757 struct iattr *attr = NULL;
2761 if (!S_ISREG(inode->i_mode))
2767 GOTO(out, rc = -ENOMEM);
2769 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2770 hss->hss_archive_id = hui->hui_archive_id;
2771 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2772 rc = ll_hsm_state_set(inode, hss);
2776 OBD_ALLOC_PTR(attr);
2778 GOTO(out, rc = -ENOMEM);
2780 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2781 attr->ia_mode |= S_IFREG;
2782 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2783 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2784 attr->ia_size = hui->hui_size;
2785 attr->ia_mtime.tv_sec = hui->hui_mtime;
2786 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2787 attr->ia_atime.tv_sec = hui->hui_atime;
2788 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2790 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2791 ATTR_UID | ATTR_GID |
2792 ATTR_MTIME | ATTR_MTIME_SET |
2793 ATTR_ATIME | ATTR_ATIME_SET;
2797 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2801 inode_unlock(inode);
2813 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2815 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2816 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
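/*
 * Set the atime, mtime and ctime of the file from @lfu. Restricted to
 * regular files and callers with CAP_SYS_ADMIN.
 */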
2819 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2821 struct inode *inode = file_inode(file);
2823 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2824 ATTR_MTIME | ATTR_MTIME_SET |
2827 .tv_sec = lfu->lfu_atime_sec,
2828 .tv_nsec = lfu->lfu_atime_nsec,
2831 .tv_sec = lfu->lfu_mtime_sec,
2832 .tv_nsec = lfu->lfu_mtime_nsec,
2835 .tv_sec = lfu->lfu_ctime_sec,
2836 .tv_nsec = lfu->lfu_ctime_nsec,
2842 if (!capable(CAP_SYS_ADMIN))
2845 if (!S_ISREG(inode->i_mode))
2849 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2851 inode_unlock(inode);
2856 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2859 case MODE_READ_USER:
2861 case MODE_WRITE_USER:
2868 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2870 /* Used to allow the upper layers of the client to request an LDLM lock
2871 * without doing an actual read or write.
2873 * Used for ladvise lockahead to manually request specific locks.
2875 * \param[in] file file this ladvise lock request is on
2876 * \param[in] ladvise ladvise struct describing this lock request
2878 * \retval 0 success, no detailed result available (sync requests
2879 * and requests sent to the server [not handled locally]
2880 * cannot return detailed results)
2881 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2882 * see definitions for details.
2883 * \retval negative negative errno on error
2885 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2887 struct lu_env *env = NULL;
2888 struct cl_io *io = NULL;
2889 struct cl_lock *lock = NULL;
2890 struct cl_lock_descr *descr = NULL;
2891 struct dentry *dentry = file->f_path.dentry;
2892 struct inode *inode = dentry->d_inode;
2893 enum cl_lock_mode cl_mode;
2894 off_t start = ladvise->lla_start;
2895 off_t end = ladvise->lla_end;
2901 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2902 "start=%llu, end=%llu\n", dentry->d_name.len,
2903 dentry->d_name.name, dentry->d_inode,
2904 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2907 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2909 GOTO(out, result = cl_mode);
2911 /* Get IO environment */
2912 result = cl_io_get(inode, &env, &io, &refcheck);
2916 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2919 * nothing to do for this io. This currently happens when
2920 * stripe sub-objects are not yet created.
2922 result = io->ci_result;
2923 } else if (result == 0) {
2924 lock = vvp_env_lock(env);
2925 descr = &lock->cll_descr;
2927 descr->cld_obj = io->ci_obj;
2928 /* Convert byte offsets to pages */
2929 descr->cld_start = cl_index(io->ci_obj, start);
2930 descr->cld_end = cl_index(io->ci_obj, end);
2931 descr->cld_mode = cl_mode;
2932 /* CEF_MUST is used because we do not want to convert a
2933 * lockahead request to a lockless lock */
2934 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2937 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2938 descr->cld_enq_flags |= CEF_SPECULATIVE;
2940 result = cl_lock_request(env, io, lock);
2942 /* On success, we need to release the lock */
2944 cl_lock_release(env, lock);
2946 cl_io_fini(env, io);
2947 cl_env_put(env, &refcheck);
2949 /* -ECANCELED indicates a matching lock with a different extent
2950 * was already present, and -EEXIST indicates a matching lock
2951 * on exactly the same extent was already present.
2952 * We convert them to positive values for userspace to make
2953 * recognizing true errors easier.
2954 * Note we can only return these detailed results on async requests,
2955 * as sync requests look the same as i/o requests for locking. */
2956 if (result == -ECANCELED)
2957 result = LLA_RESULT_DIFFERENT;
2958 else if (result == -EEXIST)
2959 result = LLA_RESULT_SAME;
2964 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2966 static int ll_ladvise_sanity(struct inode *inode,
2967 struct llapi_lu_ladvise *ladvise)
2969 struct ll_sb_info *sbi = ll_i2sbi(inode);
2970 enum lu_ladvise_type advice = ladvise->lla_advice;
2971 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2972 * must be in the first 32 bits of enum ladvise_flags */
2973 __u32 flags = ladvise->lla_peradvice_flags;
2974 /* 3 lines at 80 characters per line, should be plenty */
2977 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2979 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2980 "last supported advice is %s (value '%d'): rc = %d\n",
2981 sbi->ll_fsname, advice,
2982 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2986 /* Per-advice checks */
2988 case LU_LADVISE_LOCKNOEXPAND:
2989 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2991 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2992 "rc = %d\n", sbi->ll_fsname, flags,
2993 ladvise_names[advice], rc);
2997 case LU_LADVISE_LOCKAHEAD:
2998 /* Currently only READ and WRITE modes can be requested */
2999 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3000 ladvise->lla_lockahead_mode == 0) {
3002 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3003 "rc = %d\n", sbi->ll_fsname,
3004 ladvise->lla_lockahead_mode,
3005 ladvise_names[advice], rc);
3008 case LU_LADVISE_WILLREAD:
3009 case LU_LADVISE_DONTNEED:
3011 /* Note the fall-through above - these checks apply to all advices
3012 * except LOCKNOEXPAND */
3013 if (flags & ~LF_DEFAULT_MASK) {
3015 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3016 "rc = %d\n", sbi->ll_fsname, flags,
3017 ladvise_names[advice], rc);
3020 if (ladvise->lla_start >= ladvise->lla_end) {
3022 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3023 "for %s: rc = %d\n", sbi->ll_fsname,
3024 ladvise->lla_start, ladvise->lla_end,
3025 ladvise_names[advice], rc);
3037 * Give file access advices
3039 * The ladvise interface is similar to the Linux fadvise() system call, except
3040 * that it forwards the advices directly from the Lustre client to the server.
3041 * The server-side code will apply appropriate read-ahead and caching
3042 * techniques for the corresponding files.
3044 * A typical workload for ladvise is, e.g., a bunch of different clients
3045 * doing small random reads of a file, so prefetching pages into OSS cache
3046 * with big linear reads before the random IO is a net benefit. Fetching
3047 * all that data into each client cache with fadvise() may not be, due to
3048 * much more data being sent to the client.
3050 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3051 struct llapi_lu_ladvise *ladvise)
3055 struct cl_ladvise_io *lio;
3060 env = cl_env_get(&refcheck);
3062 RETURN(PTR_ERR(env));
3064 io = vvp_env_thread_io(env);
3065 io->ci_obj = ll_i2info(inode)->lli_clob;
3067 /* initialize parameters for ladvise */
3068 lio = &io->u.ci_ladvise;
3069 lio->li_start = ladvise->lla_start;
3070 lio->li_end = ladvise->lla_end;
3071 lio->li_fid = ll_inode2fid(inode);
3072 lio->li_advice = ladvise->lla_advice;
3073 lio->li_flags = flags;
3075 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3076 rc = cl_io_loop(env, io);
3080 cl_io_fini(env, io);
3081 cl_env_put(env, &refcheck);
3085 static int ll_lock_noexpand(struct file *file, int flags)
3087 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3089 fd->ll_lock_no_expand = !(flags & LF_UNSET);
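/*
 * Handle LL_IOC_FSGETXATTR: report the inode flags and project id to
 * userspace in a struct fsxattr.
 */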
3094 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3097 struct fsxattr fsxattr;
3099 if (copy_from_user(&fsxattr,
3100 (const struct fsxattr __user *)arg,
3104 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3105 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3106 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3107 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3108 if (copy_to_user((struct fsxattr __user *)arg,
3109 &fsxattr, sizeof(fsxattr)))
3115 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3118 * Project Quota ID state is only allowed to change from within the init
3119 * namespace. Enforce that restriction only if we are trying to change
3120 * the quota ID state. Everything else is allowed in user namespaces.
3122 if (current_user_ns() == &init_user_ns)
3125 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3128 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3129 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3132 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3139 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3143 struct md_op_data *op_data;
3144 struct ptlrpc_request *req = NULL;
3146 struct fsxattr fsxattr;
3147 struct cl_object *obj;
3151 if (copy_from_user(&fsxattr,
3152 (const struct fsxattr __user *)arg,
3156 rc = ll_ioctl_check_project(inode, &fsxattr);
3160 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3161 LUSTRE_OPC_ANY, NULL);
3162 if (IS_ERR(op_data))
3163 RETURN(PTR_ERR(op_data));
3165 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3166 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3167 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3168 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3169 op_data->op_projid = fsxattr.fsx_projid;
3170 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3171 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3173 ptlrpc_req_finished(req);
3175 GOTO(out_fsxattr, rc);
3176 ll_update_inode_flags(inode, op_data->op_attr_flags);
3177 obj = ll_i2info(inode)->lli_clob;
3179 GOTO(out_fsxattr, rc);
3181 OBD_ALLOC_PTR(attr);
3183 GOTO(out_fsxattr, rc = -ENOMEM);
3185 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3186 fsxattr.fsx_xflags);
3189 ll_finish_md_op_data(op_data);
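/*
 * Release the lease held on this file. Depending on @ioc->lil_flags the
 * close may also carry an intent: resync done, layout merge or split,
 * or PCC attach.
 */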
3193 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3196 struct inode *inode = file_inode(file);
3197 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3198 struct ll_inode_info *lli = ll_i2info(inode);
3199 struct obd_client_handle *och = NULL;
3200 struct split_param sp;
3201 struct pcc_param param;
3202 bool lease_broken = false;
3204 enum mds_op_bias bias = 0;
3205 struct file *layout_file = NULL;
3207 size_t data_size = 0;
3208 bool attached = false;
3213 mutex_lock(&lli->lli_och_mutex);
3214 if (fd->fd_lease_och != NULL) {
3215 och = fd->fd_lease_och;
3216 fd->fd_lease_och = NULL;
3218 mutex_unlock(&lli->lli_och_mutex);
3223 fmode = och->och_flags;
3225 switch (ioc->lil_flags) {
3226 case LL_LEASE_RESYNC_DONE:
3227 if (ioc->lil_count > IOC_IDS_MAX)
3228 GOTO(out_lease_close, rc = -EINVAL);
3230 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3231 OBD_ALLOC(data, data_size);
3233 GOTO(out_lease_close, rc = -ENOMEM);
3235 if (copy_from_user(data, (void __user *)arg, data_size))
3236 GOTO(out_lease_close, rc = -EFAULT);
3238 bias = MDS_CLOSE_RESYNC_DONE;
3240 case LL_LEASE_LAYOUT_MERGE: {
3243 if (ioc->lil_count != 1)
3244 GOTO(out_lease_close, rc = -EINVAL);
3246 arg += sizeof(*ioc);
3247 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3248 GOTO(out_lease_close, rc = -EFAULT);
3250 layout_file = fget(fd);
3252 GOTO(out_lease_close, rc = -EBADF);
3254 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3255 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3256 GOTO(out_lease_close, rc = -EPERM);
3258 data = file_inode(layout_file);
3259 bias = MDS_CLOSE_LAYOUT_MERGE;
3262 case LL_LEASE_LAYOUT_SPLIT: {
3266 if (ioc->lil_count != 2)
3267 GOTO(out_lease_close, rc = -EINVAL);
3269 arg += sizeof(*ioc);
3270 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3271 GOTO(out_lease_close, rc = -EFAULT);
3273 arg += sizeof(__u32);
3274 if (copy_from_user(&mirror_id, (void __user *)arg,
3276 GOTO(out_lease_close, rc = -EFAULT);
3278 layout_file = fget(fdv);
3280 GOTO(out_lease_close, rc = -EBADF);
3282 sp.sp_inode = file_inode(layout_file);
3283 sp.sp_mirror_id = (__u16)mirror_id;
3285 bias = MDS_CLOSE_LAYOUT_SPLIT;
3288 case LL_LEASE_PCC_ATTACH:
3289 if (ioc->lil_count != 1)
3292 arg += sizeof(*ioc);
3293 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3295 GOTO(out_lease_close, rc2 = -EFAULT);
3297 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3299 GOTO(out_lease_close, rc2);
3302 /* Grab latest data version */
3303 rc2 = ll_data_version(inode, &param.pa_data_version,
3306 GOTO(out_lease_close, rc2);
3309 bias = MDS_PCC_ATTACH;
3312 /* without close intent */
3317 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3321 rc = ll_lease_och_release(inode, file);
3330 switch (ioc->lil_flags) {
3331 case LL_LEASE_RESYNC_DONE:
3333 OBD_FREE(data, data_size);
3335 case LL_LEASE_LAYOUT_MERGE:
3336 case LL_LEASE_LAYOUT_SPLIT:
3340 case LL_LEASE_PCC_ATTACH:
3343 rc = pcc_readwrite_attach_fini(file, inode,
3344 param.pa_layout_gen,
3351 rc = ll_lease_type_from_fmode(fmode);
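/*
 * Handle LL_IOC_SET_LEASE[_OLD]: acquire a read or write lease on the
 * file (or release it for LL_LEASE_UNLCK) and remember the open handle
 * in the per-file data.
 */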
3355 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3358 struct inode *inode = file_inode(file);
3359 struct ll_inode_info *lli = ll_i2info(inode);
3360 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3361 struct obd_client_handle *och = NULL;
3362 __u64 open_flags = 0;
3368 switch (ioc->lil_mode) {
3369 case LL_LEASE_WRLCK:
3370 if (!(file->f_mode & FMODE_WRITE))
3372 fmode = FMODE_WRITE;
3374 case LL_LEASE_RDLCK:
3375 if (!(file->f_mode & FMODE_READ))
3379 case LL_LEASE_UNLCK:
3380 RETURN(ll_file_unlock_lease(file, ioc, arg));
3385 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3387 /* apply for lease */
3388 if (ioc->lil_flags & LL_LEASE_RESYNC)
3389 open_flags = MDS_OPEN_RESYNC;
3390 och = ll_lease_open(inode, file, fmode, open_flags);
3392 RETURN(PTR_ERR(och));
3394 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3395 rc = ll_lease_file_resync(och, inode, arg);
3397 ll_lease_close(och, inode, NULL);
3400 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3402 ll_lease_close(och, inode, NULL);
3408 mutex_lock(&lli->lli_och_mutex);
3409 if (fd->fd_lease_och == NULL) {
3410 fd->fd_lease_och = och;
3413 mutex_unlock(&lli->lli_och_mutex);
3415 /* should be impossible for now, since only exclusive leases are supported */
3416 ll_lease_close(och, inode, &lease_broken);
3422 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3424 struct ll_inode_info *lli = ll_i2info(inode);
3425 struct ll_sb_info *sbi = ll_i2sbi(inode);
3426 __u64 now = ktime_get_real_seconds();
3429 spin_lock(&lli->lli_heat_lock);
3430 heat->lh_flags = lli->lli_heat_flags;
3431 for (i = 0; i < heat->lh_count; i++)
3432 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3433 now, sbi->ll_heat_decay_weight,
3434 sbi->ll_heat_period_second);
3435 spin_unlock(&lli->lli_heat_lock);
3438 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3440 struct ll_inode_info *lli = ll_i2info(inode);
3443 spin_lock(&lli->lli_heat_lock);
3444 if (flags & LU_HEAT_FLAG_CLEAR)
3445 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3447 if (flags & LU_HEAT_FLAG_OFF)
3448 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3450 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3452 spin_unlock(&lli->lli_heat_lock);
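/*
 * Main ioctl entry point for regular files. Lustre-specific commands
 * are handled here; anything unrecognized is passed to obd_iocontrol()
 * on the data export.
 */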
3458 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3460 struct inode *inode = file_inode(file);
3461 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3465 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3466 PFID(ll_inode2fid(inode)), inode, cmd);
3467 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3469 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3470 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3474 case LL_IOC_GETFLAGS:
3475 /* Get the current value of the file flags */
3476 return put_user(fd->fd_flags, (int __user *)arg);
3477 case LL_IOC_SETFLAGS:
3478 case LL_IOC_CLRFLAGS:
3479 /* Set or clear specific file flags */
3480 /* XXX This probably needs checks to ensure the flags are
3481 * not abused, and to handle any flag side effects.
3483 if (get_user(flags, (int __user *) arg))
3486 if (cmd == LL_IOC_SETFLAGS) {
3487 if ((flags & LL_FILE_IGNORE_LOCK) &&
3488 !(file->f_flags & O_DIRECT)) {
3489 CERROR("%s: unable to disable locking on "
3490 "non-O_DIRECT file\n", current->comm);
3494 fd->fd_flags |= flags;
3496 fd->fd_flags &= ~flags;
3499 case LL_IOC_LOV_SETSTRIPE:
3500 case LL_IOC_LOV_SETSTRIPE_NEW:
3501 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3502 case LL_IOC_LOV_SETEA:
3503 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3504 case LL_IOC_LOV_SWAP_LAYOUTS: {
3506 struct lustre_swap_layouts lsl;
3508 if (copy_from_user(&lsl, (char __user *)arg,
3509 sizeof(struct lustre_swap_layouts)))
3512 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3515 file2 = fget(lsl.sl_fd);
3519 /* O_WRONLY or O_RDWR */
3520 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3521 GOTO(out, rc = -EPERM);
3523 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3524 struct inode *inode2;
3525 struct ll_inode_info *lli;
3526 struct obd_client_handle *och = NULL;
3528 lli = ll_i2info(inode);
3529 mutex_lock(&lli->lli_och_mutex);
3530 if (fd->fd_lease_och != NULL) {
3531 och = fd->fd_lease_och;
3532 fd->fd_lease_och = NULL;
3534 mutex_unlock(&lli->lli_och_mutex);
3536 GOTO(out, rc = -ENOLCK);
3537 inode2 = file_inode(file2);
3538 rc = ll_swap_layouts_close(och, inode, inode2);
3540 rc = ll_swap_layouts(file, file2, &lsl);
3546 case LL_IOC_LOV_GETSTRIPE:
3547 case LL_IOC_LOV_GETSTRIPE_NEW:
3548 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3549 case FS_IOC_GETFLAGS:
3550 case FS_IOC_SETFLAGS:
3551 RETURN(ll_iocontrol(inode, file, cmd, arg));
3552 case FSFILT_IOC_GETVERSION:
3553 case FS_IOC_GETVERSION:
3554 RETURN(put_user(inode->i_generation, (int __user *)arg));
3555 /* We need to special case any other ioctls we want to handle,
3556 * to send them to the MDS/OST as appropriate and to properly
3557 * network encode the arg field. */
3558 case FS_IOC_SETVERSION:
3561 case LL_IOC_GROUP_LOCK:
3562 RETURN(ll_get_grouplock(inode, file, arg));
3563 case LL_IOC_GROUP_UNLOCK:
3564 RETURN(ll_put_grouplock(inode, file, arg));
3565 case IOC_OBD_STATFS:
3566 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3568 case LL_IOC_FLUSHCTX:
3569 RETURN(ll_flush_ctx(inode));
3570 case LL_IOC_PATH2FID: {
3571 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3572 sizeof(struct lu_fid)))
3577 case LL_IOC_GETPARENT:
3578 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3580 case OBD_IOC_FID2PATH:
3581 RETURN(ll_fid2path(inode, (void __user *)arg));
3582 case LL_IOC_DATA_VERSION: {
3583 struct ioc_data_version idv;
3586 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3589 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3590 rc = ll_ioc_data_version(inode, &idv);
3593 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3599 case LL_IOC_GET_MDTIDX: {
3602 mdtidx = ll_get_mdt_idx(inode);
3606 if (put_user((int)mdtidx, (int __user *)arg))
3611 case OBD_IOC_GETDTNAME:
3612 case OBD_IOC_GETMDNAME:
3613 RETURN(ll_get_obd_name(inode, cmd, arg));
3614 case LL_IOC_HSM_STATE_GET: {
3615 struct md_op_data *op_data;
3616 struct hsm_user_state *hus;
3623 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3624 LUSTRE_OPC_ANY, hus);
3625 if (IS_ERR(op_data)) {
3627 RETURN(PTR_ERR(op_data));
3630 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3633 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3636 ll_finish_md_op_data(op_data);
3640 case LL_IOC_HSM_STATE_SET: {
3641 struct hsm_state_set *hss;
3648 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3653 rc = ll_hsm_state_set(inode, hss);
3658 case LL_IOC_HSM_ACTION: {
3659 struct md_op_data *op_data;
3660 struct hsm_current_action *hca;
3667 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3668 LUSTRE_OPC_ANY, hca);
3669 if (IS_ERR(op_data)) {
3671 RETURN(PTR_ERR(op_data));
3674 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3677 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3680 ll_finish_md_op_data(op_data);
3684 case LL_IOC_SET_LEASE_OLD: {
3685 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3687 RETURN(ll_file_set_lease(file, &ioc, 0));
3689 case LL_IOC_SET_LEASE: {
3690 struct ll_ioc_lease ioc;
3692 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3695 RETURN(ll_file_set_lease(file, &ioc, arg));
3697 case LL_IOC_GET_LEASE: {
3698 struct ll_inode_info *lli = ll_i2info(inode);
3699 struct ldlm_lock *lock = NULL;
3702 mutex_lock(&lli->lli_och_mutex);
3703 if (fd->fd_lease_och != NULL) {
3704 struct obd_client_handle *och = fd->fd_lease_och;
3706 lock = ldlm_handle2lock(&och->och_lease_handle);
3708 lock_res_and_lock(lock);
3709 if (!ldlm_is_cancel(lock))
3710 fmode = och->och_flags;
3712 unlock_res_and_lock(lock);
3713 LDLM_LOCK_PUT(lock);
3716 mutex_unlock(&lli->lli_och_mutex);
3718 RETURN(ll_lease_type_from_fmode(fmode));
3720 case LL_IOC_HSM_IMPORT: {
3721 struct hsm_user_import *hui;
3727 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3732 rc = ll_hsm_import(inode, file, hui);
3737 case LL_IOC_FUTIMES_3: {
3738 struct ll_futimes_3 lfu;
3740 if (copy_from_user(&lfu,
3741 (const struct ll_futimes_3 __user *)arg,
3745 RETURN(ll_file_futimes_3(file, &lfu));
3747 case LL_IOC_LADVISE: {
3748 struct llapi_ladvise_hdr *k_ladvise_hdr;
3749 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3752 int alloc_size = sizeof(*k_ladvise_hdr);
3755 u_ladvise_hdr = (void __user *)arg;
3756 OBD_ALLOC_PTR(k_ladvise_hdr);
3757 if (k_ladvise_hdr == NULL)
3760 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3761 GOTO(out_ladvise, rc = -EFAULT);
3763 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3764 k_ladvise_hdr->lah_count < 1)
3765 GOTO(out_ladvise, rc = -EINVAL);
3767 num_advise = k_ladvise_hdr->lah_count;
3768 if (num_advise >= LAH_COUNT_MAX)
3769 GOTO(out_ladvise, rc = -EFBIG);
3771 OBD_FREE_PTR(k_ladvise_hdr);
3772 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3773 lah_advise[num_advise]);
3774 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3775 if (k_ladvise_hdr == NULL)
3779 * TODO: submit multiple advices to one server in a single RPC
3781 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3782 GOTO(out_ladvise, rc = -EFAULT);
3784 for (i = 0; i < num_advise; i++) {
3785 struct llapi_lu_ladvise *k_ladvise =
3786 &k_ladvise_hdr->lah_advise[i];
3787 struct llapi_lu_ladvise __user *u_ladvise =
3788 &u_ladvise_hdr->lah_advise[i];
3790 rc = ll_ladvise_sanity(inode, k_ladvise);
3792 GOTO(out_ladvise, rc);
3794 switch (k_ladvise->lla_advice) {
3795 case LU_LADVISE_LOCKNOEXPAND:
3796 rc = ll_lock_noexpand(file,
3797 k_ladvise->lla_peradvice_flags);
3798 GOTO(out_ladvise, rc);
3799 case LU_LADVISE_LOCKAHEAD:
3801 rc = ll_file_lock_ahead(file, k_ladvise);
3804 GOTO(out_ladvise, rc);
3807 &u_ladvise->lla_lockahead_result))
3808 GOTO(out_ladvise, rc = -EFAULT);
3811 rc = ll_ladvise(inode, file,
3812 k_ladvise_hdr->lah_flags,
3815 GOTO(out_ladvise, rc);
3822 OBD_FREE(k_ladvise_hdr, alloc_size);
3825 case LL_IOC_FLR_SET_MIRROR: {
3826 /* mirror I/O must be direct to avoid polluting page cache
3828 if (!(file->f_flags & O_DIRECT))
3831 fd->fd_designated_mirror = (__u32)arg;
3834 case LL_IOC_FSGETXATTR:
3835 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3836 case LL_IOC_FSSETXATTR:
3837 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3839 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3840 case LL_IOC_HEAT_GET: {
3841 struct lu_heat uheat;
3842 struct lu_heat *heat;
3845 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3848 if (uheat.lh_count > OBD_HEAT_COUNT)
3849 uheat.lh_count = OBD_HEAT_COUNT;
3851 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3852 OBD_ALLOC(heat, size);
3856 heat->lh_count = uheat.lh_count;
3857 ll_heat_get(inode, heat);
3858 rc = copy_to_user((char __user *)arg, heat, size);
3859 OBD_FREE(heat, size);
3860 RETURN(rc ? -EFAULT : 0);
3862 case LL_IOC_HEAT_SET: {
3865 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3868 rc = ll_heat_set(inode, flags);
3871 case LL_IOC_PCC_DETACH: {
3872 struct lu_pcc_detach *detach;
3874 OBD_ALLOC_PTR(detach);
3878 if (copy_from_user(detach,
3879 (const struct lu_pcc_detach __user *)arg,
3881 GOTO(out_detach_free, rc = -EFAULT);
3883 if (!S_ISREG(inode->i_mode))
3884 GOTO(out_detach_free, rc = -EINVAL);
3886 if (!inode_owner_or_capable(inode))
3887 GOTO(out_detach_free, rc = -EPERM);
3889 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3891 OBD_FREE_PTR(detach);
3894 case LL_IOC_PCC_STATE: {
3895 struct lu_pcc_state __user *ustate =
3896 (struct lu_pcc_state __user *)arg;
3897 struct lu_pcc_state *state;
3899 OBD_ALLOC_PTR(state);
3903 if (copy_from_user(state, ustate, sizeof(*state)))
3904 GOTO(out_state, rc = -EFAULT);
3906 rc = pcc_ioctl_state(file, inode, state);
3908 GOTO(out_state, rc);
3910 if (copy_to_user(ustate, state, sizeof(*state)))
3911 GOTO(out_state, rc = -EFAULT);
3914 OBD_FREE_PTR(state);
3918 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3919 (void __user *)arg));
3923 #ifndef HAVE_FILE_LLSEEK_SIZE
3924 static inline loff_t
3925 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3927 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3929 if (offset > maxsize)
3932 if (offset != file->f_pos) {
3933 file->f_pos = offset;
3934 file->f_version = 0;
3940 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3941 loff_t maxsize, loff_t eof)
3943 struct inode *inode = file_inode(file);
3951 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3952 * position-querying operation. Avoid rewriting the "same"
3953 * f_pos value back to the file because a concurrent read(),
3954 * write() or lseek() might have altered it
3959 * f_lock protects against read/modify/write race with other
3960 * SEEK_CURs. Note that parallel writes and reads behave
3964 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3965 inode_unlock(inode);
3969 * In the generic case the entire file is data, so as long as
3970 * offset isn't at the end of the file then the offset is data.
3977 * There is a virtual hole at the end of the file, so as long as
3978 * offset isn't i_size or larger, return i_size.
3986 return llseek_execute(file, offset, maxsize);
3990 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3992 struct inode *inode = file_inode(file);
3993 loff_t retval, eof = 0;
3996 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3997 (origin == SEEK_CUR) ? file->f_pos : 0);
3998 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3999 PFID(ll_inode2fid(inode)), inode, retval, retval,
4001 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
4003 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4004 retval = ll_glimpse_size(inode);
4007 eof = i_size_read(inode);
4010 retval = ll_generic_file_llseek_size(file, offset, origin,
4011 ll_file_maxbytes(inode), eof);
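/*
 * ->flush() handler: report asynchronous write errors recorded for this
 * mapping back to the application as -EIO, unless it was already told
 * about the failure.
 */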
4015 static int ll_flush(struct file *file, fl_owner_t id)
4017 struct inode *inode = file_inode(file);
4018 struct ll_inode_info *lli = ll_i2info(inode);
4019 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4022 LASSERT(!S_ISDIR(inode->i_mode));
4024 /* catch async errors that were recorded back when async writeback
4025 * failed for pages in this mapping. */
4026 rc = lli->lli_async_rc;
4027 lli->lli_async_rc = 0;
4028 if (lli->lli_clob != NULL) {
4029 err = lov_read_and_clear_async_rc(lli->lli_clob);
4034 /* The application has already been told about the write failure.
4035 * Do not report the failure again. */
4036 if (fd->fd_write_failed)
4038 return rc ? -EIO : 0;
4042 * Called to make sure a portion of the file has been written out.
4043 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
4045 * Return how many pages have been written.
4047 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4048 enum cl_fsync_mode mode, int ignore_layout)
4052 struct cl_fsync_io *fio;
4057 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4058 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4061 env = cl_env_get(&refcheck);
4063 RETURN(PTR_ERR(env));
4065 io = vvp_env_thread_io(env);
4066 io->ci_obj = ll_i2info(inode)->lli_clob;
4067 io->ci_ignore_layout = ignore_layout;
4069 /* initialize parameters for sync */
4070 fio = &io->u.ci_fsync;
4071 fio->fi_start = start;
4073 fio->fi_fid = ll_inode2fid(inode);
4074 fio->fi_mode = mode;
4075 fio->fi_nr_written = 0;
4077 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4078 result = cl_io_loop(env, io);
4080 result = io->ci_result;
4082 result = fio->fi_nr_written;
4083 cl_io_fini(env, io);
4084 cl_env_put(env, &refcheck);
4090 * When dentry is provided (the 'else' case), file_dentry() may be
4091 * null and dentry must be used directly rather than pulled from
4092 * file_dentry() as is done otherwise.
4095 #ifdef HAVE_FILE_FSYNC_4ARGS
4096 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4098 struct dentry *dentry = file_dentry(file);
4099 #elif defined(HAVE_FILE_FSYNC_2ARGS)
4100 int ll_fsync(struct file *file, int datasync)
4102 struct dentry *dentry = file_dentry(file);
4104 loff_t end = LLONG_MAX;
4106 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
4109 loff_t end = LLONG_MAX;
4111 struct inode *inode = dentry->d_inode;
4112 struct ll_inode_info *lli = ll_i2info(inode);
4113 struct ptlrpc_request *req;
4118 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
4119 PFID(ll_inode2fid(inode)), inode);
4120 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4122 #ifdef HAVE_FILE_FSYNC_4ARGS
4123 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4126 /* fsync's caller has already called _fdata{sync,write}, we want
4127 * that IO to finish before calling the osc and mdc sync methods */
4128 rc = filemap_fdatawait(inode->i_mapping);
4131 /* catch async errors that were recorded back when async writeback
4132 * failed for pages in this mapping. */
4133 if (!S_ISDIR(inode->i_mode)) {
4134 err = lli->lli_async_rc;
4135 lli->lli_async_rc = 0;
4138 if (lli->lli_clob != NULL) {
4139 err = lov_read_and_clear_async_rc(lli->lli_clob);
4145 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4149 ptlrpc_req_finished(req);
4151 if (S_ISREG(inode->i_mode)) {
4152 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4155 /* Sync metadata on MDT first, and then sync the cached data
4158 err = pcc_fsync(file, start, end, datasync, &cached);
4160 err = cl_sync_file_range(inode, start, end,
4162 if (rc == 0 && err < 0)
4165 fd->fd_write_failed = true;
4167 fd->fd_write_failed = false;
4170 #ifdef HAVE_FILE_FSYNC_4ARGS
4171 inode_unlock(inode);
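/*
 * ->lock()/flock() handler: translate the VFS file_lock into an LDLM
 * flock enqueue on the MDT and keep the local lock state in sync with
 * the result.
 */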
4177 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4179 struct inode *inode = file_inode(file);
4180 struct ll_sb_info *sbi = ll_i2sbi(inode);
4181 struct ldlm_enqueue_info einfo = {
4182 .ei_type = LDLM_FLOCK,
4183 .ei_cb_cp = ldlm_flock_completion_ast,
4184 .ei_cbdata = file_lock,
4186 struct md_op_data *op_data;
4187 struct lustre_handle lockh = { 0 };
4188 union ldlm_policy_data flock = { { 0 } };
4189 int fl_type = file_lock->fl_type;
4195 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4196 PFID(ll_inode2fid(inode)), file_lock);
4198 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4200 if (file_lock->fl_flags & FL_FLOCK) {
4201 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4202 /* flocks are whole-file locks */
4203 flock.l_flock.end = OFFSET_MAX;
4204 /* For flocks the owner is determined by the local file descriptor */
4205 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4206 } else if (file_lock->fl_flags & FL_POSIX) {
4207 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4208 flock.l_flock.start = file_lock->fl_start;
4209 flock.l_flock.end = file_lock->fl_end;
4213 flock.l_flock.pid = file_lock->fl_pid;
4215 /* Somewhat ugly workaround for svc lockd.
4216 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4217 * that the fl_owner is the same (which it always is on the local node,
4218 * I guess, between lockd processes) and then compares the pid.
4219 * As such we assign the pid to the owner field to make it all work;
4220 * a conflict with normal locks is unlikely since the pid space and the
4221 * pointer space for current->files do not intersect */
4222 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4223 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4227 einfo.ei_mode = LCK_PR;
4230 /* An unlock request may or may not have any relation to
4231 * existing locks so we may not be able to pass a lock handle
4232 * via a normal ldlm_lock_cancel() request. The request may even
4233 * unlock a byte range in the middle of an existing lock. In
4234 * order to process an unlock request we need all of the same
4235 * information that is given with a normal read or write record
4236 * lock request. To avoid creating another ldlm unlock (cancel)
4237 * message we'll treat a LCK_NL flock request as an unlock. */
4238 einfo.ei_mode = LCK_NL;
4241 einfo.ei_mode = LCK_PW;
4244 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4259 flags = LDLM_FL_BLOCK_NOWAIT;
4265 flags = LDLM_FL_TEST_LOCK;
4268 CERROR("unknown fcntl lock command: %d\n", cmd);
4272 /* Save the old mode so that if the mode in the lock changes we
4273 * can decrement the appropriate reader or writer refcount. */
4274 file_lock->fl_type = einfo.ei_mode;
4276 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4277 LUSTRE_OPC_ANY, NULL);
4278 if (IS_ERR(op_data))
4279 RETURN(PTR_ERR(op_data));
4281 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4282 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4283 flock.l_flock.pid, flags, einfo.ei_mode,
4284 flock.l_flock.start, flock.l_flock.end);
4286 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4289 /* Restore the file lock type if not TEST lock. */
4290 if (!(flags & LDLM_FL_TEST_LOCK))
4291 file_lock->fl_type = fl_type;
4293 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4294 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4295 !(flags & LDLM_FL_TEST_LOCK))
4296 rc2 = locks_lock_file_wait(file, file_lock);
4298 if ((file_lock->fl_flags & FL_FLOCK) &&
4299 (rc == 0 || file_lock->fl_type == F_UNLCK))
4300 rc2 = flock_lock_file_wait(file, file_lock);
4301 if ((file_lock->fl_flags & FL_POSIX) &&
4302 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4303 !(flags & LDLM_FL_TEST_LOCK))
4304 rc2 = posix_lock_file_wait(file, file_lock);
4305 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4307 if (rc2 && file_lock->fl_type != F_UNLCK) {
4308 einfo.ei_mode = LCK_NL;
4309 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4314 ll_finish_md_op_data(op_data);
4319 int ll_get_fid_by_name(struct inode *parent, const char *name,
4320 int namelen, struct lu_fid *fid,
4321 struct inode **inode)
4323 struct md_op_data *op_data = NULL;
4324 struct mdt_body *body;
4325 struct ptlrpc_request *req;
4329 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4330 LUSTRE_OPC_ANY, NULL);
4331 if (IS_ERR(op_data))
4332 RETURN(PTR_ERR(op_data));
4334 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4335 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4336 ll_finish_md_op_data(op_data);
4340 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4342 GOTO(out_req, rc = -EFAULT);
4344 *fid = body->mbo_fid1;
4347 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4349 ptlrpc_req_finished(req);
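/*
 * Migrate the entry @name under @parent to the MDT(s) described by
 * @lum. Regular files are closed and a write lease is taken so their
 * data version can be carried in the migration (rename) request.
 */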
4353 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4356 struct dentry *dchild = NULL;
4357 struct inode *child_inode = NULL;
4358 struct md_op_data *op_data;
4359 struct ptlrpc_request *request = NULL;
4360 struct obd_client_handle *och = NULL;
4362 struct mdt_body *body;
4363 __u64 data_version = 0;
4364 size_t namelen = strlen(name);
4365 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4369 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4370 PFID(ll_inode2fid(parent)), name,
4371 lum->lum_stripe_offset, lum->lum_stripe_count);
4373 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4374 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4375 lustre_swab_lmv_user_md(lum);
4377 /* Get child FID first */
4378 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4381 dchild = d_lookup(file_dentry(file), &qstr);
4383 if (dchild->d_inode)
4384 child_inode = igrab(dchild->d_inode);
4389 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4398 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4399 OBD_CONNECT2_DIR_MIGRATE)) {
4400 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4401 ll_dir_striped(child_inode)) {
4402 CERROR("%s: MDT doesn't support stripe directory "
4403 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4404 GOTO(out_iput, rc = -EOPNOTSUPP);
4409 * lfs migrate command needs to be blocked on the client
4410 * by checking the migrate FID against the FID of the
4413 if (child_inode == parent->i_sb->s_root->d_inode)
4414 GOTO(out_iput, rc = -EINVAL);
4416 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4417 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4418 if (IS_ERR(op_data))
4419 GOTO(out_iput, rc = PTR_ERR(op_data));
4421 inode_lock(child_inode);
4422 op_data->op_fid3 = *ll_inode2fid(child_inode);
4423 if (!fid_is_sane(&op_data->op_fid3)) {
4424 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4425 ll_i2sbi(parent)->ll_fsname, name,
4426 PFID(&op_data->op_fid3));
4427 GOTO(out_unlock, rc = -EINVAL);
4430 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4431 op_data->op_data = lum;
4432 op_data->op_data_size = lumlen;
4435 if (S_ISREG(child_inode->i_mode)) {
4436 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4440 GOTO(out_unlock, rc);
4443 rc = ll_data_version(child_inode, &data_version,
4446 GOTO(out_close, rc);
4448 op_data->op_open_handle = och->och_open_handle;
4449 op_data->op_data_version = data_version;
4450 op_data->op_lease_handle = och->och_lease_handle;
4451 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4453 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4454 och->och_mod->mod_open_req->rq_replay = 0;
4455 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4458 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4459 name, namelen, &request);
4461 LASSERT(request != NULL);
4462 ll_update_times(request, parent);
4465 if (rc == 0 || rc == -EAGAIN) {
4466 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4467 LASSERT(body != NULL);
4469 /* If the server does release the layout lock, then we clean up
4470 * the client och here, otherwise release it in out_close: */
4471 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4472 obd_mod_put(och->och_mod);
4473 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4475 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4481 if (request != NULL) {
4482 ptlrpc_req_finished(request);
4486 /* Try again if the lease has been cancelled. */
4487 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4492 ll_lease_close(och, child_inode, NULL);
4494 clear_nlink(child_inode);
4496 inode_unlock(child_inode);
4497 ll_finish_md_op_data(op_data);
4504 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4506 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4510 * In order to avoid a flood of warning messages, only print one message
4511 * per file. The entire message rate on the client is also limited
4512 * by CDEBUG_LIMIT.
4514 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4515 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4516 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4517 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4523 * Test if some locks matching bits and l_req_mode are acquired
4524 * - the bits can be in different locks
4525 * - if found, clear the common lock bits in *bits
4526 * - the bits not found are kept in *bits
4528 * \param bits [IN] searched lock bits
4529 * \param l_req_mode [IN] searched lock mode
4530 * \retval boolean, true iff all bits are found
4532 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4534 struct lustre_handle lockh;
4535 union ldlm_policy_data policy;
4536 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4537 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4546 fid = &ll_i2info(inode)->lli_fid;
4547 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4548 ldlm_lockname[mode]);
4550 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4551 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4552 policy.l_inodebits.bits = *bits & (1 << i);
4553 if (policy.l_inodebits.bits == 0)
4556 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4557 &policy, mode, &lockh)) {
4558 struct ldlm_lock *lock;
4560 lock = ldlm_handle2lock(&lockh);
4563 ~(lock->l_policy_data.l_inodebits.bits);
4564 LDLM_LOCK_PUT(lock);
4566 *bits &= ~policy.l_inodebits.bits;
4573 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4574 struct lustre_handle *lockh, __u64 flags,
4575 enum ldlm_mode mode)
4577 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4582 fid = &ll_i2info(inode)->lli_fid;
4583 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4585 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4586 fid, LDLM_IBITS, &policy, mode, lockh);
4591 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4593 /* Already unlinked. Just update nlink and return success */
4594 if (rc == -ENOENT) {
4596 /* If it is a striped directory and there is a bad stripe,
4597 * let's revalidate the dentry again, instead of returning
4599 if (ll_dir_striped(inode))
4602 /* This path cannot be hit for regular files unless in
4603 * case of obscure races, so no need to validate
4605 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4607 } else if (rc != 0) {
4608 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4609 "%s: revalidate FID "DFID" error: rc = %d\n",
4610 ll_i2sbi(inode)->ll_fsname,
4611 PFID(ll_inode2fid(inode)), rc);
4617 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4619 struct inode *inode = dentry->d_inode;
4620 struct obd_export *exp = ll_i2mdexp(inode);
4621 struct lookup_intent oit = {
4624 struct ptlrpc_request *req = NULL;
4625 struct md_op_data *op_data;
4629 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4630 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4632 /* Call getattr by fid, so do not provide name at all. */
4633 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4634 LUSTRE_OPC_ANY, NULL);
4635 if (IS_ERR(op_data))
4636 RETURN(PTR_ERR(op_data));
4638 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4639 ll_finish_md_op_data(op_data);
4641 rc = ll_inode_revalidate_fini(inode, rc);
4645 rc = ll_revalidate_it_finish(req, &oit, dentry);
4647 ll_intent_release(&oit);
4651 /* Unlinked? Unhash dentry, so it is not picked up later by
4652 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4653 * here to preserve get_cwd functionality on 2.6.
4655 if (!dentry->d_inode->i_nlink) {
4656 ll_lock_dcache(inode);
4657 d_lustre_invalidate(dentry, 0);
4658 ll_unlock_dcache(inode);
4661 ll_lookup_finish_locks(&oit, dentry);
4663 ptlrpc_req_finished(req);
4668 static int ll_merge_md_attr(struct inode *inode)
4670 struct ll_inode_info *lli = ll_i2info(inode);
4671 struct cl_attr attr = { 0 };
4674 LASSERT(lli->lli_lsm_md != NULL);
4676 if (!lmv_dir_striped(lli->lli_lsm_md))
4679 down_read(&lli->lli_lsm_sem);
4680 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4681 &attr, ll_md_blocking_ast);
4682 up_read(&lli->lli_lsm_sem);
4686 set_nlink(inode, attr.cat_nlink);
4687 inode->i_blocks = attr.cat_blocks;
4688 i_size_write(inode, attr.cat_size);
4690 ll_i2info(inode)->lli_atime = attr.cat_atime;
4691 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4692 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
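/*
 * Fill in @stat for @de: revalidate the inode with the MDS, glimpse the
 * size from the OSTs for regular files, and merge striped-directory
 * attributes when needed.
 */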
4697 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4699 struct inode *inode = de->d_inode;
4700 struct ll_sb_info *sbi = ll_i2sbi(inode);
4701 struct ll_inode_info *lli = ll_i2info(inode);
4704 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4706 rc = ll_inode_revalidate(de, IT_GETATTR);
4710 if (S_ISREG(inode->i_mode)) {
4713 rc = pcc_inode_getattr(inode, &cached);
4714 if (cached && rc < 0)
4717 /* In case of restore, the MDT has the right size and has
4718 * already sent it back without granting the layout lock, so the
4719 * inode is up-to-date and a glimpse is useless.
4720 * Also, to glimpse we need the layout; in case of a running
4721 * restore the MDT holds the layout lock so the glimpse will
4722 * block up to the end of restore (getattr will block)
4724 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4725 rc = ll_glimpse_size(inode);
4730 /* If object isn't regular a file then don't validate size. */
4731 if (ll_dir_striped(inode)) {
4732 rc = ll_merge_md_attr(inode);
4737 inode->i_atime.tv_sec = lli->lli_atime;
4738 inode->i_mtime.tv_sec = lli->lli_mtime;
4739 inode->i_ctime.tv_sec = lli->lli_ctime;
4742 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4744 if (ll_need_32bit_api(sbi)) {
4745 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4746 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4747 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4749 stat->ino = inode->i_ino;
4750 stat->dev = inode->i_sb->s_dev;
4751 stat->rdev = inode->i_rdev;
4754 stat->mode = inode->i_mode;
4755 stat->uid = inode->i_uid;
4756 stat->gid = inode->i_gid;
4757 stat->atime = inode->i_atime;
4758 stat->mtime = inode->i_mtime;
4759 stat->ctime = inode->i_ctime;
4760 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4762 stat->nlink = inode->i_nlink;
4763 stat->size = i_size_read(inode);
4764 stat->blocks = inode->i_blocks;
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
int ll_getattr(const struct path *path, struct kstat *stat,
               u32 request_mask, unsigned int flags)
{
        struct dentry *de = path->dentry;
#else
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
#endif
        return ll_getattr_dentry(de, stat);
}
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                     __u64 start, __u64 len)
{
        int rc;
        size_t num_bytes;
        struct fiemap *fiemap;
        unsigned int extent_count = fieinfo->fi_extents_max;

        num_bytes = sizeof(*fiemap) + (extent_count *
                                       sizeof(struct fiemap_extent));
        OBD_ALLOC_LARGE(fiemap, num_bytes);
        if (fiemap == NULL)
                RETURN(-ENOMEM);

        fiemap->fm_flags = fieinfo->fi_flags;
        fiemap->fm_extent_count = fieinfo->fi_extents_max;
        fiemap->fm_start = start;
        fiemap->fm_length = len;
        if (extent_count > 0 &&
            copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
                           sizeof(struct fiemap_extent)) != 0)
                GOTO(out, rc = -EFAULT);

        rc = ll_do_fiemap(inode, fiemap, num_bytes);

        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
        if (extent_count > 0 &&
            copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
                         fiemap->fm_mapped_extents *
                         sizeof(struct fiemap_extent)) != 0)
                GOTO(out, rc = -EFAULT);
out:
        OBD_FREE_LARGE(fiemap, num_bytes);
        return rc;
}
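/*
 * Illustrative userspace sketch (not built as part of this file): ll_fiemap()
 * above is reached through the kernel's generic FS_IOC_FIEMAP ioctl path.
 * Assuming <linux/fiemap.h> and FS_IOC_FIEMAP are available, the snippet
 * below sizes the extent buffer the same way this handler does. The helper
 * name dump_extents() is hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

static int dump_extents(const char *path, unsigned int max_extents)
{
        /* same sizing as ll_fiemap(): header plus max_extents extent slots */
        size_t len = sizeof(struct fiemap) +
                     max_extents * sizeof(struct fiemap_extent);
        struct fiemap *fm = calloc(1, len);
        unsigned int i;
        int fd;

        if (fm == NULL)
                return -1;
        fd = open(path, O_RDONLY);
        if (fd < 0) {
                free(fm);
                return -1;
        }
        fm->fm_start = 0;
        fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
        fm->fm_extent_count = max_extents;
        if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
                for (i = 0; i < fm->fm_mapped_extents; i++)
                        printf("extent %u: logical %llu len %llu\n", i,
                               (unsigned long long)fm->fm_extents[i].fe_logical,
                               (unsigned long long)fm->fm_extents[i].fe_length);
        close(fd);
        free(fm);
        return 0;
}
#endif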
struct posix_acl *ll_get_acl(struct inode *inode, int type)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl = NULL;
        ENTRY;

        spin_lock(&lli->lli_lock);
        /* VFS' acl_permission_check->check_acl will release the refcount */
        acl = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        RETURN(acl);
}
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *name = NULL;
        char *value = NULL;
        size_t value_size = 0;
        int rc = 0;

        switch (type) {
        case ACL_TYPE_ACCESS:
                name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                break;
        case ACL_TYPE_DEFAULT:
                name = XATTR_NAME_POSIX_ACL_DEFAULT;
                if (!S_ISDIR(inode->i_mode))
                        rc = acl ? -EACCES : 0;
                break;
        default:
                rc = -EINVAL;
                break;
        }
        if (rc)
                return rc;

        if (acl) {
                value_size = posix_acl_xattr_size(acl->a_count);
                value = kmalloc(value_size, GFP_NOFS);
                if (value == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         name, value, value_size, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(value);
out:
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
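/*
 * Illustrative userspace sketch (not built as part of this file, assuming
 * libacl's <sys/acl.h> is available): on kernels where the generic POSIX ACL
 * xattr handlers call ->set_acl, a plain acl_set_file() from userspace ends
 * up in ll_set_acl() above, which converts the ACL to the
 * "system.posix_acl_access" xattr and forwards it to the MDT. The helper
 * name set_simple_access_acl() is hypothetical.
 */
#if 0
#include <sys/acl.h>

static int set_simple_access_acl(const char *path)
{
        /* owner rw-, group r--, other r-- */
        acl_t acl = acl_from_text("u::rw-,g::r--,o::r--");
        int rc;

        if (acl == NULL)
                return -1;
        rc = acl_set_file(path, ACL_TYPE_ACCESS, acl);
        acl_free(acl);
        return rc;
}
#endif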
#ifndef HAVE_GENERIC_PERMISSION_2ARGS
static int
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
ll_check_acl(struct inode *inode, int mask, unsigned int flags)
# else
ll_check_acl(struct inode *inode, int mask)
# endif
{
# ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int rc;

# ifdef HAVE_GENERIC_PERMISSION_4ARGS
        if (flags & IPERM_FLAG_RCU)
                return -ECHILD;
# endif
        acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
        if (!acl)
                RETURN(-EAGAIN);

        rc = posix_acl_permission(inode, acl, mask);
        posix_acl_release(acl);
        RETURN(rc);
# else /* !CONFIG_FS_POSIX_ACL */
        return -EAGAIN;
# endif /* CONFIG_FS_POSIX_ACL */
}
#endif /* HAVE_GENERIC_PERMISSION_2ARGS */
#ifdef HAVE_GENERIC_PERMISSION_4ARGS
int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
#else
# ifdef HAVE_INODE_PERMISION_2ARGS
int ll_inode_permission(struct inode *inode, int mask)
# else
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
# endif
#endif
{
        int rc = 0;
        struct ll_sb_info *sbi;
        struct root_squash_info *squash;
        struct cred *cred = NULL;
        const struct cred *old_cred = NULL;
        cfs_cap_t cap;
        bool squash_id = false;
        ENTRY;

#ifdef MAY_NOT_BLOCK
        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
#elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
        if (flags & IPERM_FLAG_RCU)
                return -ECHILD;
#endif

        /* The root inode is NOT validated during the lookup operation,
         * so do it here before the permission check. */
        if (inode == inode->i_sb->s_root->d_inode) {
                rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
                if (rc)
                        RETURN(rc);
        }

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
               PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);

        /* squash fsuid/fsgid if needed */
        sbi = ll_i2sbi(inode);
        squash = &sbi->ll_squash;
        if (unlikely(squash->rsi_uid != 0 &&
                     uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
                     !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
                squash_id = true;
        }
        if (squash_id) {
                CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
                       __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
                       squash->rsi_uid, squash->rsi_gid);

                /* update current process's credentials
                 * and FS capability */
                cred = prepare_creds();
                if (cred == NULL)
                        RETURN(-ENOMEM);

                cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
                cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
                for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
                        if ((1 << cap) & CFS_CAP_FS_MASK)
                                cap_lower(cred->cap_effective, cap);
                }
                old_cred = override_creds(cred);
        }

        ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
        rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
        /* restore current process's credentials and FS capability */
        if (squash_id) {
                revert_creds(old_cred);
                put_cred(cred);
        }

        RETURN(rc);
}
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
};

struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .flock          = ll_file_flock,
        .lock           = ll_file_flock
};

/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .flock          = ll_file_noflock,
        .lock           = ll_file_noflock
};
struct inode_operations ll_file_inode_operations = {
        .setattr        = ll_setattr,
        .getattr        = ll_getattr,
        .permission     = ll_inode_permission,
#ifdef HAVE_IOP_XATTR
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .removexattr    = ll_removexattr,
#endif
        .listxattr      = ll_listxattr,
        .fiemap         = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
        .get_acl        = ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
        .set_acl        = ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct lu_env *env;
        int rc;
        __u16 refcheck;

        if (obj == NULL)
                RETURN(0);
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));
        rc = cl_conf_set(env, lli->lli_clob, conf);
        if (rc < 0)
                GOTO(out, rc);

        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;
                struct cl_layout cl = { .cl_layout_gen = 0, };

                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));
                /* The lock can only be allowed to match after the layout has
                 * been applied to the inode; otherwise a stale layout would
                 * be seen. Applying the layout should happen before dropping
                 * the intent lock. */
                ldlm_lock_allow_match(lock);
                rc = cl_object_layout_get(env, obj, &cl);
                if (rc < 0)
                        GOTO(out, rc);
                CDEBUG(D_VFSTRACE,
                       DFID": layout version change: %u -> %u\n",
                       PFID(&lli->lli_fid), ll_layout_version_get(lli),
                       cl.cl_layout_gen);
                ll_layout_version_set(lli, cl.cl_layout_gen);
        }
out:
        cl_env_put(env, &refcheck);
        RETURN(rc);
}
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req;
        void *lvbdata;
        void *lmm;
        int lmmsize;
        int rc;
        ENTRY;

        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
               PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
               lock->l_lvb_data, lock->l_lvb_len);

        if (lock->l_lvb_data != NULL)
                RETURN(0);

        /* If the layout lock was granted right away, the layout is returned
         * within the DLM LVB of the reply; otherwise, if the lock was ever
         * blocked and then granted via a completion AST, we have to fetch
         * the layout here. Note that we cannot use the LVB buffer from the
         * completion AST because it is not large enough. */
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc < 0)
                RETURN(rc);

        rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
                         XATTR_NAME_LOV, lmmsize, &req);
        if (rc < 0) {
                if (rc == -ENODATA)
                        GOTO(out, rc = 0); /* empty layout */
                else
                        RETURN(rc);
        }

        lmmsize = rc;
        rc = 0;
        if (lmmsize == 0) /* empty layout */
                GOTO(out, rc);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
        if (lmm == NULL)
                GOTO(out, rc = -EFAULT);

        OBD_ALLOC_LARGE(lvbdata, lmmsize);
        if (lvbdata == NULL)
                GOTO(out, rc = -ENOMEM);

        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
        if (unlikely(lock->l_lvb_data == NULL)) {
                lock->l_lvb_type = LVB_T_LAYOUT;
                lock->l_lvb_data = lvbdata;
                lock->l_lvb_len = lmmsize;
                lvbdata = NULL;
        }
        unlock_res_and_lock(lock);

        if (lvbdata)
                OBD_FREE_LARGE(lvbdata, lmmsize);
out:
        ptlrpc_req_finished(req);
        return rc;
}
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in ll_layout_conf().
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
                              struct inode *inode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_lock *lock;
        struct cl_object_conf conf;
        int rc = 0;
        bool lvb_ready;
        bool wait_layout = false;
        ENTRY;

        LASSERT(lustre_handle_is_used(lockh));

        lock = ldlm_handle2lock(lockh);
        LASSERT(lock != NULL);
        LASSERT(ldlm_has_layout(lock));

        LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
                   PFID(&lli->lli_fid), inode);

        /* in case this is a cached lock, reinstate it with the new inode */
        md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

        lock_res_and_lock(lock);
        lvb_ready = ldlm_is_lvb_ready(lock);
        unlock_res_and_lock(lock);

        /* Checking lvb_ready is racy, but that is okay. The worst case is
         * that multiple processes configure the file at the same time. */
        if (lvb_ready)
                GOTO(out, rc = 0);

        rc = ll_layout_fetch(inode, lock);
        if (rc < 0)
                GOTO(out, rc);

        /* For the layout lock, the lmm is stored in the lock's LVB.
         * lvb_data is immutable while the lock is held, so it is safe to
         * access it without the resource lock.
         *
         * Set the layout on the file. This is unlikely to fail, since the
         * old layout has surely been eliminated. */
        memset(&conf, 0, sizeof conf);
        conf.coc_opc = OBJECT_CONF_SET;
        conf.coc_inode = inode;
        conf.coc_lock = lock;
        conf.u.coc_layout.lb_buf = lock->l_lvb_data;
        conf.u.coc_layout.lb_len = lock->l_lvb_len;
        rc = ll_layout_conf(inode, &conf);

        /* refresh layout failed, need to wait */
        wait_layout = rc == -EBUSY;
        EXIT;
out:
        LDLM_LOCK_PUT(lock);
        ldlm_lock_decref(lockh, mode);

        /* wait for IO to complete if it's still being used. */
        if (wait_layout) {
                CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
                       sbi->ll_fsname, PFID(&lli->lli_fid), inode);

                memset(&conf, 0, sizeof conf);
                conf.coc_opc = OBJECT_CONF_WAIT;
                conf.coc_inode = inode;
                rc = ll_layout_conf(inode, &conf);
                if (rc == 0)
                        rc = -EAGAIN;

                CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
                       sbi->ll_fsname, PFID(&lli->lli_fid), rc);
        }
        RETURN(rc);
}
/**
 * Issue layout intent RPC to MDS.
 * \param inode [in]    file inode
 * \param intent [in]   layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct lookup_intent it;
        struct ptlrpc_request *req;
        int rc;
        ENTRY;

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
                                     0, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_data = intent;
        op_data->op_data_size = sizeof(*intent);

        memset(&it, 0, sizeof(it));
        it.it_op = IT_LAYOUT;
        if (intent->li_opc == LAYOUT_INTENT_WRITE ||
            intent->li_opc == LAYOUT_INTENT_TRUNC)
                it.it_flags = FMODE_WRITE;

        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
                          sbi->ll_fsname, PFID(&lli->lli_fid), inode);

        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_ast, 0);
        if (it.it_request != NULL)
                ptlrpc_req_finished(it.it_request);
        it.it_request = NULL;

        ll_finish_md_op_data(op_data);

        /* set lock data in case this is a new lock */
        if (rc == 0)
                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

        ll_intent_drop_lock(&it);

        RETURN(rc);
}
/**
 * Check whether a LAYOUT lock exists on the client side, and enqueue one
 * if it is not cached.
 *
 * This function does not keep the layout lock held, so the layout may be
 * revoked at any time after it returns. Any operation that depends on the
 * layout should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after the
 * IO has finished call it again to verify that the layout has not changed
 * in the meantime.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct lustre_handle lockh;
        struct layout_intent intent = {
                .li_opc = LAYOUT_INTENT_ACCESS,
        };
        enum ldlm_mode mode;
        int rc;
        ENTRY;

        *gen = ll_layout_version_get(lli);
        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
                RETURN(0);

        /* sanity checks */
        LASSERT(fid_is_sane(ll_inode2fid(inode)));
        LASSERT(S_ISREG(inode->i_mode));

        /* take layout lock mutex to enqueue layout lock exclusively. */
        mutex_lock(&lli->lli_layout_mutex);

        while (1) {
                /* the layout lock is usually cached on the local side, so try
                 * to match it before grabbing the layout lock mutex. */
                mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                                       LCK_CR | LCK_CW | LCK_PR |
                                       LCK_PW);
                if (mode != 0) { /* hit cached lock */
                        rc = ll_layout_lock_set(&lockh, mode, inode);
                        if (rc == -EAGAIN)
                                continue;
                        break;
                }

                rc = ll_layout_intent(inode, &intent);
                if (rc != 0)
                        break;
        }

        if (rc == 0)
                *gen = ll_layout_version_get(lli);
        mutex_unlock(&lli->lli_layout_mutex);
        RETURN(rc);
}
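/*
 * Illustrative sketch (not built as part of this file) of the caller protocol
 * described in the comment above ll_layout_refresh(): save the layout
 * generation before starting IO and call ll_layout_refresh() again afterwards
 * to detect a layout change. The helper do_one_io() is hypothetical and
 * stands in for the real IO path (lov_io_init() and friends).
 */
#if 0
static int layout_guarded_io(struct inode *inode)
{
        __u32 gen_before, gen_after;
        int rc;

        do {
                rc = ll_layout_refresh(inode, &gen_before);
                if (rc != 0)
                        return rc;

                rc = do_one_io(inode);          /* hypothetical IO helper */
                if (rc != 0)
                        return rc;

                /* the layout may have been revoked while IO was in flight */
                rc = ll_layout_refresh(inode, &gen_after);
                if (rc != 0)
                        return rc;
        } while (gen_before != gen_after);      /* redo the IO on change */

        return 0;
}
#endif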
/**
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 *
 * \param[in] inode file inode.
 * \param[in] ext   write range with start offset of the file in bytes where
 *                  an IO is about to write, and exclusive end offset in bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
                           struct lu_extent *ext)
{
        struct layout_intent intent = {
                .li_opc = opc,
                .li_extent.e_start = ext->e_start,
                .li_extent.e_end = ext->e_end,
        };
        int rc;

        rc = ll_layout_intent(inode, &intent);

        RETURN(rc);
}
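/*
 * Illustrative sketch (not built as part of this file): how a caller would
 * describe the byte range an IO is about to write. The variables 'pos' and
 * 'count' and the helper name announce_write() are hypothetical; e_end is
 * exclusive, as documented above.
 */
#if 0
static int announce_write(struct inode *inode, loff_t pos, size_t count)
{
        struct lu_extent ext = {
                .e_start = pos,
                .e_end   = pos + count,
        };

        return ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
}
#endif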
/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
        struct hsm_user_request *hur;
        int len, rc;
        ENTRY;

        len = sizeof(struct hsm_user_request) +
              sizeof(struct hsm_user_item);
        OBD_ALLOC(hur, len);
        if (hur == NULL)
                RETURN(-ENOMEM);

        hur->hur_request.hr_action = HUA_RESTORE;
        hur->hur_request.hr_archive_id = 0;
        hur->hur_request.hr_flags = 0;
        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
               sizeof(hur->hur_user_item[0].hui_fid));
        hur->hur_user_item[0].hui_extent.offset = offset;
        hur->hur_user_item[0].hui_extent.length = length;
        hur->hur_request.hr_itemcount = 1;
        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,