/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */

#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#include <linux/uidgid.h>

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"

        struct inode *sp_inode;

        __u64 pa_data_version;

ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,

static struct ll_file_data *ll_file_data_get(void)

        struct ll_file_data *fd;

        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);

        fd->fd_write_failed = false;
        pcc_file_init(&fd->fd_pcc_file);

static void ll_file_data_put(struct ll_file_data *fd)

        OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);

/* Packs all the attributes into @op_data for the CLOSE RPC. */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
                             struct obd_client_handle *och)

        ll_prep_md_op_data(op_data, inode, NULL, NULL,
                           0, 0, LUSTRE_OPC_ANY, NULL);

        op_data->op_attr.ia_mode = inode->i_mode;
        op_data->op_attr.ia_atime = inode->i_atime;
        op_data->op_attr.ia_mtime = inode->i_mtime;
        op_data->op_attr.ia_ctime = inode->i_ctime;
        op_data->op_attr.ia_size = i_size_read(inode);
        op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
                                      ATTR_MTIME | ATTR_MTIME_SET |
                                      ATTR_CTIME);
        op_data->op_xvalid |= OP_XVALID_CTIME_SET;
        op_data->op_attr_blocks = inode->i_blocks;
        op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
        if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
                op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
        op_data->op_open_handle = och->och_open_handle;

        if (och->och_flags & FMODE_WRITE &&
            ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
                /* For HSM: if inode data has been modified, pack it so that
                 * the MDT can set the data-dirty flag in the archive. */
                op_data->op_bias |= MDS_DATA_MODIFIED;

/*
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode
 * to swap with.
 */
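/*
 * Illustration (editor's sketch, not part of the original file): a caller
 * releasing a file to HSM would pair the bias with a data version, roughly:
 *
 *	__u64 dv = 0;
 *	ll_data_version(inode, &dv, LL_DV_WR_FLUSH);
 *	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, &dv);
 *
 * while MDS_CLOSE_LAYOUT_SWAP passes the victim inode itself as @data.
 */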
static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
                                     enum mds_op_bias bias, void *data)

        struct obd_export *md_exp = ll_i2mdexp(inode);
        const struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;

        if (class_exp2obd(md_exp) == NULL) {
                CERROR("%s: invalid MDC connection handle closing "DFID"\n",
                       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));

        OBD_ALLOC_PTR(op_data);
        /* We leak the open handle and request here on error, but there is not
         * much to be done in the OOM case since the application won't retry
         * the close on error either. */
        if (op_data == NULL)
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        switch (bias) {
        case MDS_CLOSE_LAYOUT_MERGE:
                /* merge blocks from the victim inode */
                op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;

        case MDS_CLOSE_LAYOUT_SPLIT:
        case MDS_CLOSE_LAYOUT_SWAP: {
                struct split_param *sp = data;

                LASSERT(data != NULL);
                op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
                        op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
                        op_data->op_mirror_id = sp->sp_mirror_id;
                } else {
                        op_data->op_fid2 = *ll_inode2fid(data);

        case MDS_CLOSE_RESYNC_DONE: {
                struct ll_ioc_lease *ioc = data;

                LASSERT(data != NULL);
                op_data->op_attr_blocks +=
                        ioc->lil_count * op_data->op_attr_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_data = &ioc->lil_ids[0];
                op_data->op_data_size =
                        ioc->lil_count * sizeof(ioc->lil_ids[0]);

        case MDS_PCC_ATTACH: {
                struct pcc_param *param = data;

                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
                op_data->op_archive_id = param->pa_archive_id;
                op_data->op_data_version = param->pa_data_version;
                op_data->op_lease_handle = och->och_lease_handle;

        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;

                LASSERT(data == NULL);
        }

        if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
                op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
        if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
                op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc != 0 && rc != -EINTR)
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

        if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;

                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

                if (bias & MDS_PCC_ATTACH) {
                        struct pcc_param *param = data;

                        param->pa_layout_gen = body->mbo_layout_gen;

        ll_finish_md_op_data(op_data);

        md_clear_open_replay_data(md_exp, och);
        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;

        ptlrpc_req_finished(req); /* This is the close request */

int ll_md_real_close(struct inode *inode, fmode_t fmode)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct obd_client_handle **och_p;
        struct obd_client_handle *och;

        if (fmode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (fmode & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
        } else {
                LASSERT(fmode & FMODE_READ);
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        mutex_lock(&lli->lli_och_mutex);
        if (*och_usecount > 0) {
                /* There are still users of this handle, so skip
                 * freeing it. */
                mutex_unlock(&lli->lli_och_mutex);

        mutex_unlock(&lli->lli_och_mutex);

        /* There might be a race and this handle may already
         * be closed. */
        rc = ll_close_inode_openhandle(inode, och, 0, NULL);

static int ll_md_close(struct inode *inode, struct file *file)

        union ldlm_policy_data policy = {
                .l_inodebits = { MDS_INODELOCK_OPEN },
        };
        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lustre_handle lockh;
        enum ldlm_mode lockmode;

        /* clear group lock, if present */
        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
                ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

        if (fd->fd_lease_och != NULL) {
                /* Usually the lease is not released when the application
                 * crashes; we need to release it here. */
                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
                       PFID(&lli->lli_fid), rc, lease_broken);

                fd->fd_lease_och = NULL;

        if (fd->fd_och != NULL) {
                rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);

        /* Let's see if we have a good enough OPEN lock on the file and
         * whether we can skip talking to the MDS */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_omode & FMODE_WRITE) {
                LASSERT(lli->lli_open_fd_write_count);
                lli->lli_open_fd_write_count--;
        } else if (fd->fd_omode & FMODE_EXEC) {
                LASSERT(lli->lli_open_fd_exec_count);
                lli->lli_open_fd_exec_count--;
        } else {
                LASSERT(lli->lli_open_fd_read_count);
                lli->lli_open_fd_read_count--;
        }
        mutex_unlock(&lli->lli_och_mutex);

        /* LU-4398: do not cache write open lock if the file has exec bit */
        if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
            !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
                           LDLM_IBITS, &policy, lockmode, &lockh))
                rc = ll_md_real_close(inode, fd->fd_omode);

        LUSTRE_FPRIVATE(file) = NULL;
        ll_file_data_put(fd);

/* While this returns an error code, the caller (fput()) ignores it, so we
 * need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)

        struct ll_file_data *fd;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
        ktime_t kstart = ktime_get();

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
               PFID(ll_inode2fid(inode)), inode);

        fd = LUSTRE_FPRIVATE(file);

        /* The last ref on @file, maybe not the owner pid of statahead,
         * because parent and child processes can share the same file handle. */
        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = NULL;
                ll_file_data_put(fd);

        pcc_file_release(inode, file);

        if (!S_ISDIR(inode->i_mode)) {
                if (lli->lli_clob != NULL)
                        lov_read_and_clear_async_rc(lli->lli_clob);
                lli->lli_async_rc = 0;

        rc = ll_md_close(inode, file);

        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
                libcfs_debug_dumplog();

        if (!rc && inode->i_sb->s_root != file_dentry(file))
                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
                                   ktime_us_delta(ktime_get(), kstart));
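/*
 * Editor's note: the helper below is the read_cache_page() filler used by
 * ll_dom_finish_open(). It copies one page worth of inline Data-on-MDT
 * (DOM) reply data into the page cache and zero-fills the tail of a
 * partial page before marking it up to date.
 */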
static inline int ll_dom_readpage(void *data, struct page *page)

        struct niobuf_local *lnb = data;

        kaddr = ll_kmap_atomic(page, KM_USER0);
        memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
        if (lnb->lnb_len < PAGE_SIZE)
                memset(kaddr + lnb->lnb_len, 0,
                       PAGE_SIZE - lnb->lnb_len);
        flush_dcache_page(page);
        SetPageUptodate(page);
        ll_kunmap_atomic(kaddr, KM_USER0);

void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
                        struct lookup_intent *it)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct address_space *mapping = inode->i_mapping;

        struct niobuf_remote *rnb;
        struct mdt_body *body;

        unsigned long index, start;
        struct niobuf_local lnb;

        if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE,
                                       RCL_SERVER))

        rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
        if (rnb == NULL || rnb->rnb_len == 0)

        /* LU-11595: the server may return the whole file, which is always OK,
         * or it may return just the file tail, whose offset must be aligned
         * with the client PAGE_SIZE to be usable on this client; if the
         * server's PAGE_SIZE is smaller, the offset may be unaligned, and that
         * data is just ignored.
         */
        if (rnb->rnb_offset & ~PAGE_MASK)

        /* The server returns the whole file or just the file tail if it fits
         * in the reply buffer; in both cases the total size should equal the
         * file size.
         */
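/*
 * Worked example (illustrative numbers): for a 10000-byte file and a client
 * PAGE_SIZE of 4096, a tail-only reply could be rnb_offset = 8192 and
 * rnb_len = 1808, so rnb_offset is page-aligned and
 * rnb_offset + rnb_len == mbo_dom_size == 10000.
 */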
        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
                CERROR("%s: server returns off/len %llu/%u but size %llu\n",
                       ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
                       rnb->rnb_len, body->mbo_dom_size);

        CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
               rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);

        data = (char *)rnb + sizeof(*rnb);

        lnb.lnb_file_offset = rnb->rnb_offset;
        start = lnb.lnb_file_offset >> PAGE_SHIFT;

        LASSERT((lnb.lnb_file_offset & ~PAGE_MASK) == 0);
        lnb.lnb_page_offset = 0;
        do {
                lnb.lnb_data = data + (index << PAGE_SHIFT);
                lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
                if (lnb.lnb_len > PAGE_SIZE)
                        lnb.lnb_len = PAGE_SIZE;

                vmpage = read_cache_page(mapping, index + start,
                                         ll_dom_readpage, &lnb);
                if (IS_ERR(vmpage)) {
                        CWARN("%s: cannot fill page %lu for "DFID
                              " with data: rc = %li\n",
                              ll_i2sbi(inode)->ll_fsname, index + start,
                              PFID(lu_object_fid(&obj->co_lu)),
                              PTR_ERR(vmpage));

                index++;
        } while (rnb->rnb_len > (index << PAGE_SHIFT));

static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
                               struct lookup_intent *itp)

        struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
        struct dentry *parent = de->d_parent;

        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;

        LASSERT(parent != NULL);
        LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

        /* If the server supports open-by-fid, or the file name is invalid,
         * don't pack the name in the open request. */
        if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
            !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {

                len = de->d_name.len;
                name = kmalloc(len + 1, GFP_NOFS);

                spin_lock(&de->d_lock);
                if (len != de->d_name.len) {
                        spin_unlock(&de->d_lock);

                memcpy(name, de->d_name.name, len);

                spin_unlock(&de->d_lock);

                if (!lu_name_is_valid_2(name, len)) {

        op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
                                     name, len, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data)) {
                RETURN(PTR_ERR(op_data));

        op_data->op_data = lmm;
        op_data->op_data_size = lmmsize;

        rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
                            &ll_md_blocking_ast, 0);
        ll_finish_md_op_data(op_data);

        /* Reason for keeping our own exit path: don't flood the log
         * with -ESTALE error messages.
         */
        if (!it_disposition(itp, DISP_OPEN_OPEN) ||
            it_open_error(DISP_OPEN_OPEN, itp))

        ll_release_openhandle(de, itp);

        if (it_disposition(itp, DISP_LOOKUP_NEG))
                GOTO(out, rc = -ENOENT);

        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);

        rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

        if (!rc && itp->it_lock_mode) {
                struct lustre_handle handle = {.cookie = itp->it_lock_handle};
                struct ldlm_lock *lock;
                bool has_dom_bit = false;

                /* If we got a lock back and it has a LOOKUP bit set,
                 * make sure the dentry is marked as valid so we can find it.
                 * We don't need to care about actual hashing since other
                 * parts of the kernel will deal with that later.
                 */
                lock = ldlm_handle2lock(&handle);
                        has_dom_bit = ldlm_has_dom(lock);
                        if (lock->l_policy_data.l_inodebits.bits &
                            MDS_INODELOCK_LOOKUP)
                                d_lustre_revalidate(de);

                ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);

                ll_dom_finish_open(de->d_inode, req, itp);

        ptlrpc_req_finished(req);
        ll_intent_drop_lock(itp);

        /* We did open by fid, but by the time we got to the server the object
         * disappeared. If this is a create, we cannot really tell userspace
         * that the file it was trying to create does not exist. Instead,
         * let's return -ESTALE, and the VFS will retry the create with
         * LOOKUP_REVAL, which we are going to catch in ll_revalidate_dentry()
         * and use lookup then.
         */
        if (rc == -ENOENT && itp->it_op & IT_CREAT)

static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
                       struct obd_client_handle *och)

        struct mdt_body *body;

        body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
        och->och_open_handle = body->mbo_open_handle;
        och->och_fid = body->mbo_fid1;
        och->och_lease_handle.cookie = it->it_lock_handle;
        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
        och->och_flags = it->it_flags;

        return md_set_open_replay_data(md_exp, och, it);

static int ll_local_open(struct file *file, struct lookup_intent *it,
                         struct ll_file_data *fd, struct obd_client_handle *och)

        struct inode *inode = file_inode(file);

        LASSERT(!LUSTRE_FPRIVATE(file));

        rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

        LUSTRE_FPRIVATE(file) = fd;
        ll_readahead_init(inode, &fd->fd_ras);
        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

        /* initialize ll_cl_context */
        rwlock_init(&fd->fd_lock);
        INIT_LIST_HEAD(&fd->fd_lccs);

/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
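/*
 * Illustration (editor's sketch): a tool that wants to pick its own striping
 * opens with O_LOV_DELAY_CREATE and sets the layout before the first write,
 * e.g. via the LL_IOC_LOV_SETSTRIPE ioctl handled by ll_lov_setstripe()
 * below; only then are the OST objects created.
 */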
int ll_file_open(struct inode *inode, struct file *file)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct obd_client_handle **och_p = NULL;
        __u64 *och_usecount = NULL;
        struct ll_file_data *fd;
        ktime_t kstart = ktime_get();

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
               PFID(ll_inode2fid(inode)), inode, file->f_flags);

        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */

        fd = ll_file_data_get();
        if (fd == NULL)
                GOTO(out_nofiledata, rc = -ENOMEM);

        if (S_ISDIR(inode->i_mode))
                ll_authorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = fd;

        if (!it || !it->it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but the O_ACCMODE mask was stripped from
                 * it. */
                if ((oit.it_flags + 1) & O_ACCMODE)

                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* The kernel only calls f_op->open in dentry_open();
                 * filp_open() calls dentry_open() after open_namei() has
                 * checked permissions. Only nfsd_open() calls dentry_open()
                 * directly without checking permissions, which is why the
                 * code below is safe.
                 */
                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                /* bug20584: if "it_flags" contains O_CREAT, the file will be
                 * created if necessary, so "IT_CREAT" should be set to stay
                 * consistent with it. */
                if (oit.it_flags & O_CREAT)
                        oit.it_op |= IT_CREAT;

        /* Let's see if we have the file open on the MDS already. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
        } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        mutex_lock(&lli->lli_och_mutex);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's an extra open request that we do not
                         * need; let's close it somehow. This will decref the
                         * request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                                mutex_unlock(&lli->lli_och_mutex);
                                GOTO(out_openerr, rc);

                        ll_release_openhandle(file_dentry(file), it);

                rc = ll_local_open(file, it, fd, NULL);
                        mutex_unlock(&lli->lli_och_mutex);
                        GOTO(out_openerr, rc);
        } else {
                LASSERT(*och_usecount == 0);
                if (!it->it_disposition) {
                        struct dentry *dentry = file_dentry(file);
                        struct ll_dentry_data *ldd;

                        /* We cannot just request the lock handle now: the new
                         * ELC code means that one of the other OPEN locks for
                         * this file could be cancelled, and since the blocking
                         * AST handler would attempt to grab och_mutex as well,
                         * that would result in a deadlock. */
                        mutex_unlock(&lli->lli_och_mutex);
                        /*
                         * Normally called under two situations:
                         * 1. NFS export.
                         * 2. A race/condition on MDS resulting in no open
                         *    handle to be returned from LOOKUP|OPEN request,
                         *    for example if the target entry was a symlink.
                         *
                         * Only fetch MDS_OPEN_LOCK if this is in NFS path,
                         * marked by a bit set in ll_iget_for_nfs. Clear the
                         * bit so that it's not confusing later callers.
                         *
                         * NB: when ldd is NULL, it must have come via normal
                         * lookup path only, since ll_iget_for_nfs always calls
                         */
                        ldd = ll_d2d(dentry);
                        if (ldd && ldd->lld_nfs_dentry) {
                                ldd->lld_nfs_dentry = 0;
                                if (!filename_is_volatile(dentry->d_name.name,
                                                          dentry->d_name.len,
                                                          NULL))
                                        it->it_flags |= MDS_OPEN_LOCK;

                        /*
                         * Always specify MDS_OPEN_BY_FID because we don't want
                         * to get a file with a different fid.
                         */
                        it->it_flags |= MDS_OPEN_BY_FID;
                        rc = ll_intent_file_open(dentry, NULL, 0, it);
                                GOTO(out_openerr, rc);

                OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
                        GOTO(out_och_free, rc = -ENOMEM);

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here */
                /* XXX (green): Shouldn't we bail out on any error here, not
                 * just an open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                        GOTO(out_och_free, rc);

                LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
                         "inode %p: disposition %x, status %d\n", inode,
                         it_disposition(it, ~0), it->it_status);

                rc = ll_local_open(file, it, fd, *och_p);
                        GOTO(out_och_free, rc);

        rc = pcc_file_open(inode, file);
                GOTO(out_och_free, rc);

        mutex_unlock(&lli->lli_och_mutex);

        /* lockless for direct IO so that it can do IO in parallel */
        if (file->f_flags & O_DIRECT)
                fd->fd_flags |= LL_FILE_LOCKLESS_IO;

        /* Must do this outside the lli_och_mutex lock to prevent a deadlock
         * where a different kind of OPEN lock for this same inode gets
         * cancelled by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                GOTO(out_och_free, rc);
        cl_lov_delay_create_clear(&file->f_flags);
        GOTO(out_och_free, rc);

        if (och_p && *och_p) {
                OBD_FREE(*och_p, sizeof(struct obd_client_handle));
                *och_p = NULL; /* OBD_FREE writes some magic there */

        mutex_unlock(&lli->lli_och_mutex);

        if (lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);

        ll_file_data_put(fd);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
                           ktime_us_delta(ktime_get(), kstart));

        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
                ptlrpc_req_finished(it->it_request);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);

static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
                                    struct ldlm_lock_desc *desc,
                                    void *data, int flag)

        struct lustre_handle lockh;

        case LDLM_CB_BLOCKING:
                ldlm_lock2handle(lock, &lockh);
                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);

        case LDLM_CB_CANCELING:

/*
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
                                struct lustre_handle *old_open_handle)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;

        /* Get the open handle of the file */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_lease_och != NULL)
                GOTO(out_unlock, rc = -EBUSY);

        if (fd->fd_och == NULL) {
                if (file->f_mode & FMODE_WRITE) {
                        LASSERT(lli->lli_mds_write_och != NULL);
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                } else {
                        LASSERT(lli->lli_mds_read_och != NULL);
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;
                }

                if (*och_usecount > 1)
                        GOTO(out_unlock, rc = -EBUSY);

        *old_open_handle = fd->fd_och->och_open_handle;

        mutex_unlock(&lli->lli_och_mutex);

/*
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;
        struct obd_client_handle *old_och = NULL;

        mutex_lock(&lli->lli_och_mutex);
        if (file->f_mode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        /* The file may have been opened by another process (broken lease), so
         * *och_p is not NULL. In this case we should simply increase the
         * usecount.
         */
        if (*och_p != NULL) {
                old_och = fd->fd_och;
        } else {
                *och_p = fd->fd_och;

        mutex_unlock(&lli->lli_och_mutex);

        if (old_och != NULL)
                rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);

/*
 * Acquire a lease and open the file.
 */
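/*
 * Typical use (editor's sketch based on callers in this file): take a write
 * lease, do the work, then close it, checking whether the lease was broken
 * meanwhile:
 *
 *	och = ll_lease_open(inode, file, FMODE_WRITE, 0);
 *	if (IS_ERR(och))
 *		return PTR_ERR(och);
 *	... do work under the lease ...
 *	rc = ll_lease_close(och, inode, &lease_broken);
 *
 * where @lease_broken reports whether the lease lock was cancelled.
 */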
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
              __u64 open_flags)

        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_open_handle = { 0 };
        struct obd_client_handle *och = NULL;

        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                RETURN(ERR_PTR(-EPERM));

        rc = ll_lease_och_acquire(inode, file, &old_open_handle);
                RETURN(ERR_PTR(rc));

                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this open handle is from the same owner */
        op_data->op_open_handle = old_open_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: To not put the lease lock into the LRU list,
         * otherwise it can be cancelled, which may mislead applications into
         * thinking the lease is broken;
         * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
         * open in ll_md_blocking_ast(). Otherwise, as
         * ll_md_blocking_lease_ast() doesn't deal with the open handle, a
         * normal open handle would be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        rc = ll_och_fill(sbi->ll_md_exp, &it, och);
                GOTO(out_release_it, rc);

        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* lease already acquired; handle the lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* an open lock must be returned for a lease */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                       PFID(ll_inode2fid(inode)), it.it_lock_mode,
                       it.it_lock_bits);
                GOTO(out_close, rc = -EPROTO);

        ll_intent_release(&it);

        /* Cancel the open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                                            it.it_lock_mode);
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;

        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */

        ll_intent_release(&it);

        RETURN(ERR_PTR(rc));

/*
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
                                          struct inode *inode2)

        if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

        if (inode_permission(inode1, MAY_WRITE) ||
            inode_permission(inode2, MAY_WRITE))

        if (inode1->i_sb != inode2->i_sb)

static int ll_swap_layouts_close(struct obd_client_handle *och,
                                 struct inode *inode, struct inode *inode2)

        const struct lu_fid *fid1 = ll_inode2fid(inode);
        const struct lu_fid *fid2;

        CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
               ll_i2sbi(inode)->ll_fsname, PFID(fid1));

        rc = ll_check_swap_layouts_validity(inode, inode2);
                GOTO(out_free_och, rc);

        /* We now know that inode2 is a Lustre inode */
        fid2 = ll_inode2fid(inode2);

        rc = lu_fid_cmp(fid1, fid2);
                GOTO(out_free_och, rc = -EINVAL);

        /* Close the file and {swap,merge} layouts between inode & inode2.
         * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
         * because we still need it to pack l_remote_handle to MDT. */
        rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
                                       inode2);

        och = NULL; /* freed in ll_close_inode_openhandle() */

/*
 * Release the lease and close the file.
 * It will check whether the lease was ever broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
                                 struct inode *inode,
                                 bool *lease_broken, enum mds_op_bias bias,
                                 void *data)

        struct ldlm_lock *lock;
        bool cancelled = true;

        lock = ldlm_handle2lock(&och->och_lease_handle);
        if (lock != NULL) {
                lock_res_and_lock(lock);
                cancelled = ldlm_is_cancel(lock);
                unlock_res_and_lock(lock);
                LDLM_LOCK_PUT(lock);
        }

        CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
               PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

        if (lease_broken != NULL)
                *lease_broken = cancelled;

        if (!cancelled && !bias)
                ldlm_cli_cancel(&och->och_lease_handle, 0);

        if (cancelled) { /* no need to execute the intent */
                bias = 0;
                data = NULL;
        }

        rc = ll_close_inode_openhandle(inode, och, bias, data);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken)
{
        return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
}

/*
 * After a lease is taken, send the RPC MDS_REINT_RESYNC to the MDT.
 */
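/*
 * Editor's summary: the caller must already hold a lease; dirty data is
 * flushed first (ll_data_version() with LL_DV_WR_FLUSH below) so that cached
 * writes are not rejected by the OSTs once the layout version increases
 * during resync.
 */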
static int ll_lease_file_resync(struct obd_client_handle *och,
                                struct inode *inode, unsigned long arg)

        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ll_ioc_lease_id ioc;
        __u64 data_version_unused;

        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
                           sizeof(ioc)))

        /* Before starting file resync, it's necessary to clean up the page
         * cache in client memory, otherwise once the layout version is
         * increased, writing back cached data will be denied by the OSTs. */
        rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);

        op_data->op_lease_handle = och->och_lease_handle;
        op_data->op_mirror_id = ioc.lil_mirror_id;
        rc = md_file_resync(sbi->ll_md_exp, op_data);

        ll_finish_md_op_data(op_data);

int ll_merge_attr(const struct lu_env *env, struct inode *inode)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct cl_attr *attr = vvp_env_thread_attr(env);

        ll_inode_size_lock(inode);

        /* Merge the timestamps most recently obtained from the MDS with
         * the timestamps obtained from the OSTs.
         *
         * Do not overwrite atime of the inode, because it may be refreshed
         * by the file_accessed() function. If the read was served by cached
         * data, there is no RPC to be sent, so atime may not be transferred
         * to the OSTs at all. The MDT only updates atime at close time if
         * it's at least 'mdd.*.atime_diff' older.
         * All in all, atime in Lustre does not strictly comply with POSIX.
         * Solving this problem would require sending an RPC to the MDT on
         * each read, which would hurt performance.
         */
        if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
            inode->i_atime.tv_sec < lli->lli_atime)
                inode->i_atime.tv_sec = lli->lli_atime;

        inode->i_mtime.tv_sec = lli->lli_mtime;
        inode->i_ctime.tv_sec = lli->lli_ctime;

        mtime = inode->i_mtime.tv_sec;
        atime = inode->i_atime.tv_sec;
        ctime = inode->i_ctime.tv_sec;

        cl_object_attr_lock(obj);
        if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))

        rc = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);

        if (rc != 0)
                GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

        if (atime < attr->cat_atime)
                atime = attr->cat_atime;

        if (ctime < attr->cat_ctime)
                ctime = attr->cat_ctime;

        if (mtime < attr->cat_mtime)
                mtime = attr->cat_mtime;

        CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
               PFID(&lli->lli_fid), attr->cat_size);

        i_size_write(inode, attr->cat_size);
        inode->i_blocks = attr->cat_blocks;

        inode->i_mtime.tv_sec = mtime;
        inode->i_atime.tv_sec = atime;
        inode->i_ctime.tv_sec = ctime;

        ll_inode_size_unlock(inode);

/*
 * Set a designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a designated
 * mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)

        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        /* Clear the layout version for generic (non-resync) I/O, in case it
         * carries a stale layout version due to an I/O restart. */
        io->ci_layout_version = 0;

        /* FLR: disable non-delay for designated mirror I/O because obviously
         * only one mirror is available. */
        if (fd->fd_designated_mirror > 0) {
                io->ci_designated_mirror = fd->fd_designated_mirror;
                io->ci_layout_version = fd->fd_layout_version;
        }

        CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
               file->f_path.dentry->d_name.name, io->ci_designated_mirror);

static bool file_is_noatime(const struct file *file)

        const struct vfsmount *mnt = file->f_path.mnt;
        const struct inode *inode = file_inode((struct file *)file);

        /* Adapted from file_accessed() and touch_atime(). */
        if (file->f_flags & O_NOATIME)

        if (inode->i_flags & S_NOATIME)

        if (IS_NOATIME(inode))

        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

        if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))

void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
                struct vvp_io_args *args)

        struct inode *inode = file_inode(file);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
        io->ci_lock_no_expand = fd->ll_lock_no_expand;

        if (iot == CIT_WRITE) {
                io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
                io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
                                         file->f_flags & O_DIRECT ||
#ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
                io->u.ci_wr.wr_sync |= !!(args &&
                                          args->via_io_subtype == IO_NORMAL &&
                                          args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
#endif

        io->ci_obj = ll_i2info(inode)->lli_clob;
        io->ci_lockreq = CILR_MAYBE;
        if (ll_file_nolock(file)) {
                io->ci_lockreq = CILR_NEVER;
                io->ci_no_srvlock = 1;
        } else if (file->f_flags & O_APPEND) {
                io->ci_lockreq = CILR_MANDATORY;
        }

        io->ci_noatime = file_is_noatime(file);
        io->ci_async_readahead = false;

        /* FLR: only use non-delay I/O for read, as there is only one
         * available mirror for write. */
        io->ci_ndelay = !(iot == CIT_WRITE);

        ll_io_set_mirror(io, file);

static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
                        ssize_t count)

        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        enum obd_heat_type sample_type;
        enum obd_heat_type iobyte_type;
        __u64 now = ktime_get_real_seconds();

        if (!ll_sbi_has_file_heat(sbi) ||
            lli->lli_heat_flags & LU_HEAT_FLAG_OFF)

        if (iot == CIT_READ) {
                sample_type = OBD_HEAT_READSAMPLE;
                iobyte_type = OBD_HEAT_READBYTE;
        } else if (iot == CIT_WRITE) {
                sample_type = OBD_HEAT_WRITESAMPLE;
                iobyte_type = OBD_HEAT_WRITEBYTE;
        }

        spin_lock(&lli->lli_heat_lock);
        obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
                     sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
        obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
                     sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
        spin_unlock(&lli->lli_heat_lock);
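/*
 * Editor's note: each call above feeds two exponentially decayed counters
 * per inode, one counting I/O samples and one counting bytes, using
 * ll_heat_decay_weight and ll_heat_period_second from the superblock; the
 * decay arithmetic itself lives in obd_heat_add().
 */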
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)

        struct vvp_io *vio = vvp_env_io(env);
        struct inode *inode = file_inode(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct range_lock range;

        unsigned retried = 0;
        bool restarted = false;
        unsigned ignore_lockless = 0;

        CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", *ppos, count);

        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot, args);
        io->ci_ignore_lockless = ignore_lockless;
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
                bool range_locked = false;

                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
                        range_lock_init(&range, *ppos, *ppos + count - 1);

                vio->vui_fd = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                case IO_NORMAL:
                        vio->vui_iter = args->u.normal.via_iter;
                        vio->vui_iocb = args->u.normal.via_iocb;
                        /* Direct IO reads must also take the range lock, or
                         * multiple reads will try to work on the same pages.
                         * See LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                       RL_PARA(&range));
                                rc = range_lock(&lli->lli_write_tree, &range);

                                range_locked = true;

                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;
                default:
                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

                ll_cl_add(file, env, io, LCC_RW);
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);

                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                        range_unlock(&lli->lli_write_tree, &range);
        } else {
                /* cl_io_rw_init() handled IO */

        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count -= io->ci_nob;
                *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

                /* prepare IO restart */
                if (count > 0 && args->via_io_subtype == IO_NORMAL)
                        args->u.normal.via_iter = vio->vui_iter;

        cl_io_fini(env, io);

        CDEBUG(D_VFSTRACE,
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        if ((rc == 0 || rc == -ENODATA || rc == -ENOLCK) &&
            count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
                       file_dentry(file)->d_name.name,
                       iot == CIT_READ ? "read" : "write",
                       *ppos, count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;
                ignore_lockless = io->ci_ignore_lockless;

        if (iot == CIT_READ) {
                if (result > 0)
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                if (result > 0) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                        rc = io->ci_result;
                        if (rc < 0)
                                fd->fd_write_failed = true;
                        else
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        fd->fd_write_failed = true;
                }
        }

        CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

        ll_heat_add(inode, iot, result);

        RETURN(result > 0 ? result : rc);

/**
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of
 * readahead, most of the pages being read are already in the memory cache
 * and we can read those pages directly, because if the pages exist, the
 * corresponding DLM lock must exist, so the page content must be valid.
 *
 * In the fast read implementation, llite speculatively finds and reads
 * pages in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the
 *     data and CLIO won't be intervened;
 *   - If the page was brought into memory by readahead, it will be exported
 *     and readahead parameters will be updated;
 *   - Otherwise the page is not in memory, and we can't do fast read.
 *     Therefore, it will go back and invoke normal read, i.e., a cl_io will
 *     be created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node, but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param env - lu_env
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes read, or an error code if an error occurred.
 */
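/*
 * Resulting read path (editor's summary of the code below):
 *
 *	ll_file_read_iter()
 *	    -> ll_do_fast_read()             page cache only;
 *	        -> generic_file_read_iter()  -ENODATA if a page is missing
 *	    -> ll_file_io_generic()          full cl_io path for the rest
 */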
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)

        if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))

        /* NB: we can't do direct IO for fast read because it would need a
         * lock to make the IO engine happy. */
        if (iocb->ki_filp->f_flags & O_DIRECT)

        result = generic_file_read_iter(iocb, iter);

        /* If the first page is not in cache, generic_file_read_iter() will
         * return -ENODATA. See the corresponding code in ll_readpage(). */
        if (result == -ENODATA)
                result = 0;

        if (result > 0) {
                ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
                ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
                                   LPROC_LL_READ_BYTES, result);
        }

/*
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)

        struct vvp_io_args *args;
        struct file *file = iocb->ki_filp;

        ktime_t kstart = ktime_get();

        if (!iov_iter_count(to))

        /*
         * Currently when a PCC read fails, we do not fall back to the
         * normal read path, but just return the error. The reason is that
         * for RW-PCC the file data may be modified in the PCC and be
         * inconsistent with the data on the OSTs (or the file data may have
         * been removed from the Lustre file system); at this point, falling
         * back to the normal read path may read the wrong data.
         * TODO: for RO-PCC (readonly PCC), fall back to the normal read
         * path: read data from the data copy on the OSTs.
         */
        result = pcc_file_read_iter(iocb, to, &cached);

        ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));

        result = ll_do_fast_read(iocb, to);
        if (result < 0 || iov_iter_count(to) == 0)

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = to;
        args->u.normal.via_iocb = iocb;

        rc2 = ll_file_io_generic(env, args, file, CIT_READ,
                                 &iocb->ki_pos, iov_iter_count(to));

        else if (result == 0)

        cl_env_put(env, &refcheck);

        if (result > 0) {
                ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
                                  LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
                                  READ);
                ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
                                   ktime_us_delta(ktime_get(), kstart));
        }

/**
 * Similar trick to ll_do_fast_read; this improves write speed for tiny
 * writes.
 *
 * If a page is already in the page cache and dirty (and some other things -
 * see ll_tiny_write_begin for the instantiation of these rules), then we
 * can write to it without doing a full I/O, because Lustre already knows
 * about it and will write it out. This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
 * common and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
 */
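/*
 * Boundary example (illustrative, assuming 4 KiB pages): a 100-byte write
 * at offset 4000 has (4000 & (PAGE_SIZE - 1)) + 100 == 4100 > 4096, so it
 * crosses a page boundary and is not a tiny write; the same write at offset
 * 0 of an already-dirty page qualifies.
 */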
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)

        ssize_t count = iov_iter_count(iter);
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        bool lock_inode = !IS_NOSEC(inode);

        /* Restrict writes to a single page and < PAGE_SIZE. See the comment
         * at the top of this function for why.
         */
        if (count >= PAGE_SIZE ||
            (iocb->ki_pos & (PAGE_SIZE - 1)) + count > PAGE_SIZE)

        if (unlikely(lock_inode))
                inode_lock(inode);
        result = __generic_file_write_iter(iocb, iter);

        if (unlikely(lock_inode))
                inode_unlock(inode);

        /* If the page is not already dirty, ll_tiny_write_begin returns
         * -ENODATA. We continue on to normal write.
         */
        if (result == -ENODATA)
                result = 0;

        if (result > 0) {
                ll_heat_add(inode, CIT_WRITE, result);
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
                                   result);
                ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
        }

        CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)

        struct vvp_io_args *args;

        ssize_t rc_tiny = 0, rc_normal;
        struct file *file = iocb->ki_filp;

        ktime_t kstart = ktime_get();

        if (!iov_iter_count(from))
                GOTO(out, rc_normal = 0);

        /*
         * When a PCC write fails, we usually do not fall back to the normal
         * write path, but just return the error. There is one special case,
         * though: when the returned error code is -ENOSPC because the PCC
         * HSM backend ran out of space. In that case, it falls back to the
         * normal I/O path and retries the I/O. As the file is in HSM released
         * state, it will restore the file data to the OSTs first and redo the
         * write again. The restore process will revoke the layout lock and
         * detach the file from the PCC cache automatically.
         */
        result = pcc_file_write_iter(iocb, from, &cached);
        if (cached && result != -ENOSPC && result != -EDQUOT)
                GOTO(out, rc_normal = result);

        /* NB: we can't do direct IO for tiny writes because they use the page
         * cache, we can't do sync writes because tiny writes can't flush
         * pages, and we can't do append writes because we can't guarantee the
         * required DLM locks are held to protect file size.
         */
        if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
            !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
                rc_tiny = ll_do_tiny_write(iocb, from);

        /* In case of error, go on and try the normal write; only stop if the
         * tiny write completed the I/O.
         */
        if (iov_iter_count(from) == 0)
                GOTO(out, rc_normal = rc_tiny);

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = from;
        args->u.normal.via_iocb = iocb;

        rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
                                       &iocb->ki_pos, iov_iter_count(from));

        /* On success, combine bytes written. */
        if (rc_tiny >= 0 && rc_normal > 0)
                rc_normal += rc_tiny;
        /* On error, only return the error from the normal write if the tiny
         * write did not write any bytes. Otherwise return the bytes written
         * by the tiny write.
         */
        else if (rc_tiny > 0)
                rc_normal = rc_tiny;

        cl_env_put(env, &refcheck);

        if (rc_normal > 0) {
                ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
                                  LUSTRE_FPRIVATE(file), iocb->ki_pos,
                                  rc_normal, WRITE);
                ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
                                   ktime_us_delta(ktime_get(), kstart));
        }

#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
                                 unsigned long *nr_segs, size_t *count)

        for (seg = 0; seg < *nr_segs; seg++) {
                const struct iovec *iv = &iov[seg];

                /*
                 * If any segment has a negative length, or the cumulative
                 * length ever wraps negative then return -EINVAL.
                 */
                cnt += iv->iov_len;
                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
                        return -EINVAL;
                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
                        continue;

                cnt -= iv->iov_len;	/* This segment is no good */
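/*
 * Editor's note: the (ssize_t)(cnt|iv->iov_len) < 0 test above rejects, in a
 * single comparison, both a segment with the sign bit set and a cumulative
 * count that has wrapped past SSIZE_MAX.
 */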
1910 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1911 unsigned long nr_segs, loff_t pos)
1918 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1925 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1926 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1927 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1928 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1929 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1931 result = ll_file_read_iter(iocb, &to);
1936 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1939 struct iovec iov = { .iov_base = buf, .iov_len = count };
1948 init_sync_kiocb(&kiocb, file);
1949 kiocb.ki_pos = *ppos;
1950 #ifdef HAVE_KIOCB_KI_LEFT
1951 kiocb.ki_left = count;
1952 #elif defined(HAVE_KI_NBYTES)
1953 kiocb.i_nbytes = count;
1956 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1957 *ppos = kiocb.ki_pos;
1963 * Write to a file (through the page cache).
1966 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1967 unsigned long nr_segs, loff_t pos)
1969 struct iov_iter from;
1974 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1981 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1982 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1983 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1984 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1985 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1987 result = ll_file_write_iter(iocb, &from);
1992 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1993 size_t count, loff_t *ppos)
1995 struct iovec iov = { .iov_base = (void __user *)buf,
2005 init_sync_kiocb(&kiocb, file);
2006 kiocb.ki_pos = *ppos;
2007 #ifdef HAVE_KIOCB_KI_LEFT
2008 kiocb.ki_left = count;
2009 #elif defined(HAVE_KI_NBYTES)
2010 kiocb.ki_nbytes = count;
2013 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2014 *ppos = kiocb.ki_pos;
2018 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2021 * Send file content (through pagecache) somewhere with helper
2023 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2024 struct pipe_inode_info *pipe, size_t count,
2028 struct vvp_io_args *args;
2035 result = pcc_file_splice_read(in_file, ppos, pipe,
2036 count, flags, &cached);
2040 ll_ras_enter(in_file, *ppos, count);
2042 env = cl_env_get(&refcheck);
2044 RETURN(PTR_ERR(env));
2046 args = ll_env_args(env, IO_SPLICE);
2047 args->u.splice.via_pipe = pipe;
2048 args->u.splice.via_flags = flags;
2050 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2051 cl_env_put(env, &refcheck);
2054 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2055 LUSTRE_FPRIVATE(in_file), *ppos, result,
2060 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2061 __u64 flags, struct lov_user_md *lum, int lum_size)
2063 struct lookup_intent oit = {
2065 .it_flags = flags | MDS_OPEN_BY_FID,
2070 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2071 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2072 /* this code will only exist for big-endian systems */
2073 lustre_swab_lov_user_md(lum, 0);
2076 ll_inode_size_lock(inode);
2077 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2079 GOTO(out_unlock, rc);
2081 ll_release_openhandle(dentry, &oit);
2084 ll_inode_size_unlock(inode);
2085 ll_intent_release(&oit);
2090 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2091 struct lov_mds_md **lmmp, int *lmm_size,
2092 struct ptlrpc_request **request)
2094 struct ll_sb_info *sbi = ll_i2sbi(inode);
2095 struct mdt_body *body;
2096 struct lov_mds_md *lmm = NULL;
2097 struct ptlrpc_request *req = NULL;
2098 struct md_op_data *op_data;
2101 rc = ll_get_default_mdsize(sbi, &lmmsize);
2105 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2106 strlen(filename), lmmsize,
2107 LUSTRE_OPC_ANY, NULL);
2108 if (IS_ERR(op_data))
2109 RETURN(PTR_ERR(op_data));
2111 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2112 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2113 ll_finish_md_op_data(op_data);
2115 CDEBUG(D_INFO, "md_getattr_name failed "
2116 "on %s: rc %d\n", filename, rc);
2120 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2121 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2123 lmmsize = body->mbo_eadatasize;
2125 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2127 GOTO(out, rc = -ENODATA);
2130 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2131 LASSERT(lmm != NULL);
2133 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2134 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2135 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2136 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2137 GOTO(out, rc = -EPROTO);
2140 * This is coming from the MDS, so is probably in
2141 * little endian. We convert it to host endian before
2142 * passing it to userspace.
2144 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2145 __swab32(LOV_MAGIC_MAGIC)) {
2146 int stripe_count = 0;
2148 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2149 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2150 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2151 if (le32_to_cpu(lmm->lmm_pattern) &
2152 LOV_PATTERN_F_RELEASED)
2156 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2158 /* if function called for directory - we should
2159 * avoid swab not existent lsm objects */
2160 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2161 lustre_swab_lov_user_md_objects(
2162 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2164 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2165 S_ISREG(body->mbo_mode))
2166 lustre_swab_lov_user_md_objects(
2167 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2173 *lmm_size = lmmsize;
2178 static int ll_lov_setea(struct inode *inode, struct file *file,
2181 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2182 struct lov_user_md *lump;
2183 int lum_size = sizeof(struct lov_user_md) +
2184 sizeof(struct lov_user_ost_data);
2188 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2191 OBD_ALLOC_LARGE(lump, lum_size);
2195 if (copy_from_user(lump, arg, lum_size))
2196 GOTO(out_lump, rc = -EFAULT);
2198 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2200 cl_lov_delay_create_clear(&file->f_flags);
2203 OBD_FREE_LARGE(lump, lum_size);
2207 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2214 env = cl_env_get(&refcheck);
2216 RETURN(PTR_ERR(env));
2218 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2219 cl_env_put(env, &refcheck);
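/*
 * Illustrative sketch only, not part of the original source: how a
 * userspace caller might reach ll_file_getstripe() via the
 * LL_IOC_LOV_GETSTRIPE ioctl handled in ll_file_ioctl() below. The
 * buffer sizing is an assumption; real tools size it from the
 * expected layout. Guarded out so it never builds.
 */
#if 0
	struct lov_user_md_v3 *lum;
	size_t lum_size = sizeof(*lum) + LOV_MAX_STRIPE_COUNT *
			  sizeof(struct lov_user_ost_data_v1);

	lum = calloc(1, lum_size);
	if (lum == NULL)
		return -ENOMEM;
	lum->lmm_magic = LOV_USER_MAGIC_V3;
	lum->lmm_stripe_count = LOV_MAX_STRIPE_COUNT;
	if (ioctl(fd, LL_IOC_LOV_GETSTRIPE, lum) < 0)
		rc = -errno;
	free(lum);
#endif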
2223 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2226 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2227 struct lov_user_md *klum;
2229 __u64 flags = FMODE_WRITE;
2232 rc = ll_copy_user_md(lum, &klum);
2237 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2242 rc = put_user(0, &lum->lmm_stripe_count);
2246 rc = ll_layout_refresh(inode, &gen);
2250 rc = ll_file_getstripe(inode, arg, lum_size);
2252 cl_lov_delay_create_clear(&file->f_flags);
2255 OBD_FREE_LARGE(klum, lum_size);
2261 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2263 struct ll_inode_info *lli = ll_i2info(inode);
2264 struct cl_object *obj = lli->lli_clob;
2265 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2266 struct ll_grouplock grouplock;
2271 CWARN("group id for group lock must not be 0\n");
2275 if (ll_file_nolock(file))
2276 RETURN(-EOPNOTSUPP);
2278 if (file->f_flags & O_NONBLOCK) {
2279 if (!mutex_trylock(&lli->lli_group_mutex))
2282 mutex_lock(&lli->lli_group_mutex);
2284 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2285 CWARN("group lock already exists with gid %lu\n",
2286 fd->fd_grouplock.lg_gid);
2287 GOTO(out, rc = -EINVAL);
2289 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2290 if (file->f_flags & O_NONBLOCK)
2291 GOTO(out, rc = -EAGAIN);
2292 mutex_unlock(&lli->lli_group_mutex);
2293 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2294 GOTO(retry, rc = 0);
2296 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2299 * XXX: the group lock needs to protect all OST objects, but PFL
2300 * can add new OST objects during the IO, so we instantiate
2301 * all OST objects before taking the group lock.
2306 struct cl_layout cl = {
2307 .cl_is_composite = false,
2309 struct lu_extent ext = {
2311 .e_end = OBD_OBJECT_EOF,
2314 env = cl_env_get(&refcheck);
2316 GOTO(out, rc = PTR_ERR(env));
2318 rc = cl_object_layout_get(env, obj, &cl);
2319 if (!rc && cl.cl_is_composite)
2320 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2323 cl_env_put(env, &refcheck);
2328 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2329 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2334 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2335 fd->fd_grouplock = grouplock;
2336 if (lli->lli_group_users == 0)
2337 lli->lli_group_gid = grouplock.lg_gid;
2338 lli->lli_group_users++;
2340 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2342 mutex_unlock(&lli->lli_group_mutex);
2347 static int ll_put_grouplock(struct inode *inode, struct file *file,
2350 struct ll_inode_info *lli = ll_i2info(inode);
2351 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2352 struct ll_grouplock grouplock;
2356 mutex_lock(&lli->lli_group_mutex);
2357 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2358 CWARN("no group lock held\n");
2359 GOTO(out, rc = -EINVAL);
2362 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2364 if (fd->fd_grouplock.lg_gid != arg) {
2365 CWARN("group lock %lu doesn't match current id %lu\n",
2366 arg, fd->fd_grouplock.lg_gid);
2367 GOTO(out, rc = -EINVAL);
2370 grouplock = fd->fd_grouplock;
2371 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2372 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2374 cl_put_grouplock(&grouplock);
2376 lli->lli_group_users--;
2377 if (lli->lli_group_users == 0) {
2378 lli->lli_group_gid = 0;
2379 wake_up_var(&lli->lli_group_users);
2381 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2384 mutex_unlock(&lli->lli_group_mutex);
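/*
 * Illustrative sketch only, not part of the original source: the
 * expected userspace pairing of the two group-lock paths above via
 * ioctls. The gid is an arbitrary example; any non-zero id shared by
 * the cooperating processes works. Guarded out so it never builds.
 */
#if 0
	unsigned long gid = 1234;	/* must be non-zero */

	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) < 0)
		return -errno;
	/* ... perform I/O covered by the group lock ... */
	if (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) < 0)
		return -errno;
#endif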
2390 * Close inode open handle
2392 * \param dentry [in] dentry which contains the inode
2393 * \param it [in,out] intent which contains open info and result
2396 * \retval <0 failure
2398 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2400 struct inode *inode = dentry->d_inode;
2401 struct obd_client_handle *och;
2407 /* Root ? Do nothing. */
2408 if (dentry->d_inode->i_sb->s_root == dentry)
2411 /* No open handle to close? Move away */
2412 if (!it_disposition(it, DISP_OPEN_OPEN))
2415 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2417 OBD_ALLOC(och, sizeof(*och));
2419 GOTO(out, rc = -ENOMEM);
2421 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2425 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2427 /* this one is in place of ll_file_open */
2428 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2429 ptlrpc_req_finished(it->it_request);
2430 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2436 * Get the size of the inode for which the FIEMAP mapping is requested.
2437 * Make the FIEMAP get_info call and return the result.
2438 * \param fiemap kernel buffer to hold extents
2439 * \param num_bytes kernel buffer size
2441 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2447 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2450 /* Checks for fiemap flags */
2451 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2452 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2456 /* Check for FIEMAP_FLAG_SYNC */
2457 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2458 rc = filemap_fdatawrite(inode->i_mapping);
2463 env = cl_env_get(&refcheck);
2465 RETURN(PTR_ERR(env));
2467 if (i_size_read(inode) == 0) {
2468 rc = ll_glimpse_size(inode);
2473 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2474 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2475 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2477 /* If the file size is 0, then there are no objects to map */
2478 if (fmkey.lfik_oa.o_size == 0) {
2479 fiemap->fm_mapped_extents = 0;
2483 fmkey.lfik_fiemap = *fiemap;
2485 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2486 &fmkey, fiemap, &num_bytes);
2488 cl_env_put(env, &refcheck);
2492 int ll_fid2path(struct inode *inode, void __user *arg)
2494 struct obd_export *exp = ll_i2mdexp(inode);
2495 const struct getinfo_fid2path __user *gfin = arg;
2497 struct getinfo_fid2path *gfout;
2503 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2504 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2507 /* Only need to get the buflen */
2508 if (get_user(pathlen, &gfin->gf_pathlen))
2511 if (pathlen > PATH_MAX)
2514 outsize = sizeof(*gfout) + pathlen;
2515 OBD_ALLOC(gfout, outsize);
2519 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2520 GOTO(gf_free, rc = -EFAULT);
2521 /* append the root FID after gfout to let the MDT know the root FID so
2522 * that it can look up the correct path; this is mainly for filesets.
2523 * Old servers without fileset mount support will ignore this. */
2524 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2526 /* Call mdc_iocontrol */
2527 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2531 if (copy_to_user(arg, gfout, outsize))
2535 OBD_FREE(gfout, outsize);
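/*
 * Illustrative sketch only, not part of the original source: driving
 * the FID2PATH translation above from userspace. The 4096-byte path
 * buffer is an assumption (PATH_MAX is the enforced upper bound).
 * Guarded out so it never builds.
 */
#if 0
	struct getinfo_fid2path *gf;
	size_t outsize = sizeof(*gf) + 4096;

	gf = calloc(1, outsize);
	if (gf == NULL)
		return -ENOMEM;
	gf->gf_fid = fid;	/* FID to resolve */
	gf->gf_pathlen = 4096;
	gf->gf_recno = -1;	/* latest name */
	gf->gf_linkno = 0;	/* first hard link */
	if (ioctl(fd, OBD_IOC_FID2PATH, gf) == 0)
		printf("%s\n", gf->gf_u.gf_path);
	free(gf);
#endif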
2540 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2542 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2550 ioc->idv_version = 0;
2551 ioc->idv_layout_version = UINT_MAX;
2553 /* If no file object has been initialized, we consider its version to be 0. */
2557 env = cl_env_get(&refcheck);
2559 RETURN(PTR_ERR(env));
2561 io = vvp_env_thread_io(env);
2563 io->u.ci_data_version.dv_data_version = 0;
2564 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2565 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2568 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2569 result = cl_io_loop(env, io);
2571 result = io->ci_result;
2573 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2574 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2576 cl_io_fini(env, io);
2578 if (unlikely(io->ci_need_restart))
2581 cl_env_put(env, &refcheck);
2587 * Read the data_version for an inode.
2589 * This value is computed using the stripe object versions on the OSTs.
2590 * The version is computed using server-side locking.
2592 * @param flags whether to sync on the OST side;
2594 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2595 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2597 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2599 struct ioc_data_version ioc = { .idv_flags = flags };
2602 rc = ll_ioc_data_version(inode, &ioc);
2604 *data_version = ioc.idv_version;
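/*
 * Illustrative sketch only, not part of the original source: sampling
 * the data version before and after an operation to detect concurrent
 * modification, as the layout-swap path below does. Guarded out so it
 * never builds.
 */
#if 0
	__u64 dv_before, dv_after;
	int rc;

	rc = ll_data_version(inode, &dv_before, LL_DV_RD_FLUSH);
	if (rc)
		return rc;
	/* ... work that must not race with writers ... */
	rc = ll_data_version(inode, &dv_after, 0);
	if (rc == 0 && dv_before != dv_after)
		rc = -EAGAIN;	/* the file changed under us */
#endif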
2610 * Trigger a HSM release request for the provided inode.
2612 int ll_hsm_release(struct inode *inode)
2615 struct obd_client_handle *och = NULL;
2616 __u64 data_version = 0;
2621 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2622 ll_i2sbi(inode)->ll_fsname,
2623 PFID(&ll_i2info(inode)->lli_fid));
2625 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2627 GOTO(out, rc = PTR_ERR(och));
2629 /* Grab latest data_version and [am]time values */
2630 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2634 env = cl_env_get(&refcheck);
2636 GOTO(out, rc = PTR_ERR(env));
2638 rc = ll_merge_attr(env, inode);
2639 cl_env_put(env, &refcheck);
2641 /* If an error happens, we have the wrong size for the file.
2647 /* Release the file.
2648 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2649 * we still need it to pack l_remote_handle to MDT. */
2650 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2656 if (och != NULL && !IS_ERR(och)) /* close the file */
2657 ll_lease_close(och, inode, NULL);
2662 struct ll_swap_stack {
2665 struct inode *inode1;
2666 struct inode *inode2;
2671 static int ll_swap_layouts(struct file *file1, struct file *file2,
2672 struct lustre_swap_layouts *lsl)
2674 struct mdc_swap_layouts msl;
2675 struct md_op_data *op_data;
2678 struct ll_swap_stack *llss = NULL;
2681 OBD_ALLOC_PTR(llss);
2685 llss->inode1 = file_inode(file1);
2686 llss->inode2 = file_inode(file2);
2688 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2692 /* we use two bools because they are easier to swap than two bits */
2693 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2694 llss->check_dv1 = true;
2696 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2697 llss->check_dv2 = true;
2699 /* we cannot use lsl->sl_dvX directly because we may swap them */
2700 llss->dv1 = lsl->sl_dv1;
2701 llss->dv2 = lsl->sl_dv2;
2703 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2704 if (rc == 0) /* same file, done! */
2707 if (rc < 0) { /* sequentialize it */
2708 swap(llss->inode1, llss->inode2);
2710 swap(llss->dv1, llss->dv2);
2711 swap(llss->check_dv1, llss->check_dv2);
2715 if (gid != 0) { /* application asks to flush dirty cache */
2716 rc = ll_get_grouplock(llss->inode1, file1, gid);
2720 rc = ll_get_grouplock(llss->inode2, file2, gid);
2722 ll_put_grouplock(llss->inode1, file1, gid);
2727 /* final check: before swapping the layouts, verify that the
2728 * data version has not changed (if requested) */
2729 if (llss->check_dv1) {
2730 rc = ll_data_version(llss->inode1, &dv, 0);
2733 if (dv != llss->dv1)
2734 GOTO(putgl, rc = -EAGAIN);
2737 if (llss->check_dv2) {
2738 rc = ll_data_version(llss->inode2, &dv, 0);
2741 if (dv != llss->dv2)
2742 GOTO(putgl, rc = -EAGAIN);
2745 /* struct md_op_data is used to send the swap args to the mdt;
2746 * only the flags are missing, so we pass struct mdc_swap_layouts
2747 * through md_op_data->op_data */
2748 /* flags from user space have to be converted before they are sent to
2749 * the server; no flags are sent today, they are only used on the client */
2752 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2753 0, LUSTRE_OPC_ANY, &msl);
2754 if (IS_ERR(op_data))
2755 GOTO(free, rc = PTR_ERR(op_data));
2757 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2758 sizeof(*op_data), op_data, NULL);
2759 ll_finish_md_op_data(op_data);
2766 ll_put_grouplock(llss->inode2, file2, gid);
2767 ll_put_grouplock(llss->inode1, file1, gid);
2777 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2779 struct obd_export *exp = ll_i2mdexp(inode);
2780 struct md_op_data *op_data;
2784 /* Detect out-of-range masks */
2785 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2788 /* Non-root users are forbidden to set or clear flags which are
2789 * NOT defined in HSM_USER_MASK. */
2790 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2791 !cfs_capable(CFS_CAP_SYS_ADMIN))
2794 if (!exp_connect_archive_id_array(exp)) {
2795 /* Detect out-of-range archive id */
2796 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2797 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2801 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2802 LUSTRE_OPC_ANY, hss);
2803 if (IS_ERR(op_data))
2804 RETURN(PTR_ERR(op_data));
2806 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2809 ll_finish_md_op_data(op_data);
2814 static int ll_hsm_import(struct inode *inode, struct file *file,
2815 struct hsm_user_import *hui)
2817 struct hsm_state_set *hss = NULL;
2818 struct iattr *attr = NULL;
2822 if (!S_ISREG(inode->i_mode))
2828 GOTO(out, rc = -ENOMEM);
2830 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2831 hss->hss_archive_id = hui->hui_archive_id;
2832 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2833 rc = ll_hsm_state_set(inode, hss);
2837 OBD_ALLOC_PTR(attr);
2839 GOTO(out, rc = -ENOMEM);
2841 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2842 attr->ia_mode |= S_IFREG;
2843 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2844 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2845 attr->ia_size = hui->hui_size;
2846 attr->ia_mtime.tv_sec = hui->hui_mtime;
2847 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2848 attr->ia_atime.tv_sec = hui->hui_atime;
2849 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2851 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2852 ATTR_UID | ATTR_GID |
2853 ATTR_MTIME | ATTR_MTIME_SET |
2854 ATTR_ATIME | ATTR_ATIME_SET;
2858 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2862 inode_unlock(inode);
2874 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2876 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2877 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2880 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2882 struct inode *inode = file_inode(file);
2884 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2885 ATTR_MTIME | ATTR_MTIME_SET |
2888 .tv_sec = lfu->lfu_atime_sec,
2889 .tv_nsec = lfu->lfu_atime_nsec,
2892 .tv_sec = lfu->lfu_mtime_sec,
2893 .tv_nsec = lfu->lfu_mtime_nsec,
2896 .tv_sec = lfu->lfu_ctime_sec,
2897 .tv_nsec = lfu->lfu_ctime_nsec,
2903 if (!capable(CAP_SYS_ADMIN))
2906 if (!S_ISREG(inode->i_mode))
2910 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2912 inode_unlock(inode);
2917 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2920 case MODE_READ_USER:
2922 case MODE_WRITE_USER:
2929 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2931 /* Used to allow the upper layers of the client to request an LDLM lock
2932 * without doing an actual read or write.
2934 * Used for ladvise lockahead to manually request specific locks.
2936 * \param[in] file file this ladvise lock request is on
2937 * \param[in] ladvise ladvise struct describing this lock request
2939 * \retval 0 success, no detailed result available (sync requests
2940 * and requests sent to the server [not handled locally]
2941 * cannot return detailed results)
2942 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2943 * see definitions for details.
2944 * \retval negative negative errno on error
2946 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2948 struct lu_env *env = NULL;
2949 struct cl_io *io = NULL;
2950 struct cl_lock *lock = NULL;
2951 struct cl_lock_descr *descr = NULL;
2952 struct dentry *dentry = file->f_path.dentry;
2953 struct inode *inode = dentry->d_inode;
2954 enum cl_lock_mode cl_mode;
2955 off_t start = ladvise->lla_start;
2956 off_t end = ladvise->lla_end;
2962 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2963 "start=%llu, end=%llu\n", dentry->d_name.len,
2964 dentry->d_name.name, dentry->d_inode,
2965 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2968 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2970 GOTO(out, result = cl_mode);
2972 /* Get IO environment */
2973 result = cl_io_get(inode, &env, &io, &refcheck);
2977 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2980 * nothing to do for this io. This currently happens when
2981 * stripe sub-object's are not yet created.
2983 result = io->ci_result;
2984 } else if (result == 0) {
2985 lock = vvp_env_lock(env);
2986 descr = &lock->cll_descr;
2988 descr->cld_obj = io->ci_obj;
2989 /* Convert byte offsets to pages */
2990 descr->cld_start = cl_index(io->ci_obj, start);
2991 descr->cld_end = cl_index(io->ci_obj, end);
2992 descr->cld_mode = cl_mode;
2993 /* CEF_MUST is used because we do not want to convert a
2994 * lockahead request to a lockless lock */
2995 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2998 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2999 descr->cld_enq_flags |= CEF_SPECULATIVE;
3001 result = cl_lock_request(env, io, lock);
3003 /* On success, we need to release the lock */
3005 cl_lock_release(env, lock);
3007 cl_io_fini(env, io);
3008 cl_env_put(env, &refcheck);
3010 /* -ECANCELED indicates a matching lock with a different extent
3011 * was already present, and -EEXIST indicates a matching lock
3012 * on exactly the same extent was already present.
3013 * We convert them to positive values for userspace to make
3014 * recognizing true errors easier.
3015 * Note we can only return these detailed results on async requests,
3016 * as sync requests look the same as i/o requests for locking. */
3017 if (result == -ECANCELED)
3018 result = LLA_RESULT_DIFFERENT;
3019 else if (result == -EEXIST)
3020 result = LLA_RESULT_SAME;
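/*
 * Illustrative sketch only, not part of the original source: a
 * lockahead request built the way this function consumes it. The
 * range and mode are arbitrary examples; LF_ASYNC requests the
 * detailed LLA_RESULT_* return values. Guarded out so it never
 * builds.
 */
#if 0
	struct llapi_lu_ladvise advice = {
		.lla_advice           = LU_LADVISE_LOCKAHEAD,
		.lla_lockahead_mode   = MODE_WRITE_USER,
		.lla_peradvice_flags  = LF_ASYNC,
		.lla_start            = 0,
		.lla_end              = (1 << 20) - 1,
	};
	int rc;

	rc = ll_file_lock_ahead(file, &advice);
	if (rc == LLA_RESULT_SAME || rc == LLA_RESULT_DIFFERENT)
		;	/* a matching lock already existed */
#endif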
3025 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
3027 static int ll_ladvise_sanity(struct inode *inode,
3028 struct llapi_lu_ladvise *ladvise)
3030 struct ll_sb_info *sbi = ll_i2sbi(inode);
3031 enum lu_ladvise_type advice = ladvise->lla_advice;
3032 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
3033 * be in the first 32 bits of enum ladvise_flags */
3034 __u32 flags = ladvise->lla_peradvice_flags;
3035 /* 3 lines at 80 characters per line, should be plenty */
3038 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3040 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
3041 "last supported advice is %s (value '%d'): rc = %d\n",
3042 sbi->ll_fsname, advice,
3043 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3047 /* Per-advice checks */
3049 case LU_LADVISE_LOCKNOEXPAND:
3050 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3052 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3053 "rc = %d\n", sbi->ll_fsname, flags,
3054 ladvise_names[advice], rc);
3058 case LU_LADVISE_LOCKAHEAD:
3059 /* Currently only READ and WRITE modes can be requested */
3060 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3061 ladvise->lla_lockahead_mode == 0) {
3063 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3064 "rc = %d\n", sbi->ll_fsname,
3065 ladvise->lla_lockahead_mode,
3066 ladvise_names[advice], rc);
3070 case LU_LADVISE_WILLREAD:
3071 case LU_LADVISE_DONTNEED:
3073 /* Note fall-through above - these checks apply to all advice types
3074 * except LOCKNOEXPAND */
3075 if (flags & ~LF_DEFAULT_MASK) {
3077 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3078 "rc = %d\n", sbi->ll_fsname, flags,
3079 ladvise_names[advice], rc);
3082 if (ladvise->lla_start >= ladvise->lla_end) {
3084 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3085 "for %s: rc = %d\n", sbi->ll_fsname,
3086 ladvise->lla_start, ladvise->lla_end,
3087 ladvise_names[advice], rc);
3099 * Give file access advice
3101 * The ladvise interface is similar to the Linux fadvise() system call, except
3102 * it forwards the advice directly from the Lustre client to the server. The
3103 * server-side code will apply the appropriate read-ahead and caching
3104 * techniques for the corresponding files.
3106 * A typical workload for ladvise is e.g. a bunch of different clients
3107 * doing small random reads of a file, so prefetching pages into OSS cache
3108 * with big linear reads before the random IO is a net benefit. Fetching
3109 * all that data into each client cache with fadvise() may not be, due to
3110 * much more data being sent to the client.
3112 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3113 struct llapi_lu_ladvise *ladvise)
3117 struct cl_ladvise_io *lio;
3122 env = cl_env_get(&refcheck);
3124 RETURN(PTR_ERR(env));
3126 io = vvp_env_thread_io(env);
3127 io->ci_obj = ll_i2info(inode)->lli_clob;
3129 /* initialize parameters for ladvise */
3130 lio = &io->u.ci_ladvise;
3131 lio->li_start = ladvise->lla_start;
3132 lio->li_end = ladvise->lla_end;
3133 lio->li_fid = ll_inode2fid(inode);
3134 lio->li_advice = ladvise->lla_advice;
3135 lio->li_flags = flags;
3137 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3138 rc = cl_io_loop(env, io);
3142 cl_io_fini(env, io);
3143 cl_env_put(env, &refcheck);
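/*
 * Illustrative sketch only, not part of the original source: issuing
 * a WILLREAD advice for the first megabyte of a file through the
 * helpers above. The range is an arbitrary example. Guarded out so it
 * never builds.
 */
#if 0
	struct llapi_lu_ladvise ladvise = {
		.lla_advice = LU_LADVISE_WILLREAD,
		.lla_start  = 0,
		.lla_end    = (1 << 20) - 1,
	};
	int rc;

	rc = ll_ladvise_sanity(inode, &ladvise);
	if (rc == 0)
		rc = ll_ladvise(inode, file, 0, &ladvise);
#endif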
3147 static int ll_lock_noexpand(struct file *file, int flags)
3149 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3151 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3156 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3159 struct fsxattr fsxattr;
3161 if (copy_from_user(&fsxattr,
3162 (const struct fsxattr __user *)arg,
3166 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3167 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3168 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3169 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3170 if (copy_to_user((struct fsxattr __user *)arg,
3171 &fsxattr, sizeof(fsxattr)))
3177 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3180 * Project Quota ID state is only allowed to change from within the init
3181 * namespace. Enforce that restriction only if we are trying to change
3182 * the quota ID state. Everything else is allowed in user namespaces.
3184 if (current_user_ns() == &init_user_ns)
3187 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3190 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3191 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3194 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3201 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3205 struct md_op_data *op_data;
3206 struct ptlrpc_request *req = NULL;
3208 struct fsxattr fsxattr;
3209 struct cl_object *obj;
3213 if (copy_from_user(&fsxattr,
3214 (const struct fsxattr __user *)arg,
3218 rc = ll_ioctl_check_project(inode, &fsxattr);
3222 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3223 LUSTRE_OPC_ANY, NULL);
3224 if (IS_ERR(op_data))
3225 RETURN(PTR_ERR(op_data));
3227 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3228 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3229 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3230 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3231 op_data->op_projid = fsxattr.fsx_projid;
3232 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3233 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3235 ptlrpc_req_finished(req);
3237 GOTO(out_fsxattr, rc);
3238 ll_update_inode_flags(inode, op_data->op_attr_flags);
3239 obj = ll_i2info(inode)->lli_clob;
3241 GOTO(out_fsxattr, rc);
3243 OBD_ALLOC_PTR(attr);
3245 GOTO(out_fsxattr, rc = -ENOMEM);
3247 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3248 fsxattr.fsx_xflags);
3251 ll_finish_md_op_data(op_data);
3255 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3258 struct inode *inode = file_inode(file);
3259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3260 struct ll_inode_info *lli = ll_i2info(inode);
3261 struct obd_client_handle *och = NULL;
3262 struct split_param sp;
3263 struct pcc_param param;
3264 bool lease_broken = false;
3266 enum mds_op_bias bias = 0;
3267 struct file *layout_file = NULL;
3269 size_t data_size = 0;
3270 bool attached = false;
3275 mutex_lock(&lli->lli_och_mutex);
3276 if (fd->fd_lease_och != NULL) {
3277 och = fd->fd_lease_och;
3278 fd->fd_lease_och = NULL;
3280 mutex_unlock(&lli->lli_och_mutex);
3285 fmode = och->och_flags;
3287 switch (ioc->lil_flags) {
3288 case LL_LEASE_RESYNC_DONE:
3289 if (ioc->lil_count > IOC_IDS_MAX)
3290 GOTO(out_lease_close, rc = -EINVAL);
3292 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3293 OBD_ALLOC(data, data_size);
3295 GOTO(out_lease_close, rc = -ENOMEM);
3297 if (copy_from_user(data, (void __user *)arg, data_size))
3298 GOTO(out_lease_close, rc = -EFAULT);
3300 bias = MDS_CLOSE_RESYNC_DONE;
3302 case LL_LEASE_LAYOUT_MERGE: {
3305 if (ioc->lil_count != 1)
3306 GOTO(out_lease_close, rc = -EINVAL);
3308 arg += sizeof(*ioc);
3309 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3310 GOTO(out_lease_close, rc = -EFAULT);
3312 layout_file = fget(fd);
3314 GOTO(out_lease_close, rc = -EBADF);
3316 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3317 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3318 GOTO(out_lease_close, rc = -EPERM);
3320 data = file_inode(layout_file);
3321 bias = MDS_CLOSE_LAYOUT_MERGE;
3324 case LL_LEASE_LAYOUT_SPLIT: {
3328 if (ioc->lil_count != 2)
3329 GOTO(out_lease_close, rc = -EINVAL);
3331 arg += sizeof(*ioc);
3332 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3333 GOTO(out_lease_close, rc = -EFAULT);
3335 arg += sizeof(__u32);
3336 if (copy_from_user(&mirror_id, (void __user *)arg,
3338 GOTO(out_lease_close, rc = -EFAULT);
3340 layout_file = fget(fdv);
3342 GOTO(out_lease_close, rc = -EBADF);
3344 sp.sp_inode = file_inode(layout_file);
3345 sp.sp_mirror_id = (__u16)mirror_id;
3347 bias = MDS_CLOSE_LAYOUT_SPLIT;
3350 case LL_LEASE_PCC_ATTACH:
3351 if (ioc->lil_count != 1)
3354 arg += sizeof(*ioc);
3355 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3357 GOTO(out_lease_close, rc2 = -EFAULT);
3359 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3361 GOTO(out_lease_close, rc2);
3364 /* Grab latest data version */
3365 rc2 = ll_data_version(inode, &param.pa_data_version,
3368 GOTO(out_lease_close, rc2);
3371 bias = MDS_PCC_ATTACH;
3374 /* without close intent */
3379 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3383 rc = ll_lease_och_release(inode, file);
3392 switch (ioc->lil_flags) {
3393 case LL_LEASE_RESYNC_DONE:
3395 OBD_FREE(data, data_size);
3397 case LL_LEASE_LAYOUT_MERGE:
3398 case LL_LEASE_LAYOUT_SPLIT:
3402 case LL_LEASE_PCC_ATTACH:
3405 rc = pcc_readwrite_attach_fini(file, inode,
3406 param.pa_layout_gen,
3413 rc = ll_lease_type_from_fmode(fmode);
3417 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3420 struct inode *inode = file_inode(file);
3421 struct ll_inode_info *lli = ll_i2info(inode);
3422 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3423 struct obd_client_handle *och = NULL;
3424 __u64 open_flags = 0;
3430 switch (ioc->lil_mode) {
3431 case LL_LEASE_WRLCK:
3432 if (!(file->f_mode & FMODE_WRITE))
3434 fmode = FMODE_WRITE;
3436 case LL_LEASE_RDLCK:
3437 if (!(file->f_mode & FMODE_READ))
3441 case LL_LEASE_UNLCK:
3442 RETURN(ll_file_unlock_lease(file, ioc, arg));
3447 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3449 /* apply for lease */
3450 if (ioc->lil_flags & LL_LEASE_RESYNC)
3451 open_flags = MDS_OPEN_RESYNC;
3452 och = ll_lease_open(inode, file, fmode, open_flags);
3454 RETURN(PTR_ERR(och));
3456 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3457 rc = ll_lease_file_resync(och, inode, arg);
3459 ll_lease_close(och, inode, NULL);
3462 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3464 ll_lease_close(och, inode, NULL);
3470 mutex_lock(&lli->lli_och_mutex);
3471 if (fd->fd_lease_och == NULL) {
3472 fd->fd_lease_och = och;
3475 mutex_unlock(&lli->lli_och_mutex);
3477 /* impossible now, as only exclusive leases are supported for now */
3478 ll_lease_close(och, inode, &lease_broken);
3484 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3486 struct ll_inode_info *lli = ll_i2info(inode);
3487 struct ll_sb_info *sbi = ll_i2sbi(inode);
3488 __u64 now = ktime_get_real_seconds();
3491 spin_lock(&lli->lli_heat_lock);
3492 heat->lh_flags = lli->lli_heat_flags;
3493 for (i = 0; i < heat->lh_count; i++)
3494 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3495 now, sbi->ll_heat_decay_weight,
3496 sbi->ll_heat_period_second);
3497 spin_unlock(&lli->lli_heat_lock);
3500 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3502 struct ll_inode_info *lli = ll_i2info(inode);
3505 spin_lock(&lli->lli_heat_lock);
3506 if (flags & LU_HEAT_FLAG_CLEAR)
3507 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3509 if (flags & LU_HEAT_FLAG_OFF)
3510 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3512 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3514 spin_unlock(&lli->lli_heat_lock);
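/*
 * Illustrative sketch only, not part of the original source: sampling
 * all heat counters of an inode, mirroring what the LL_IOC_HEAT_GET
 * case in ll_file_ioctl() below does. Guarded out so it never builds.
 */
#if 0
	struct lu_heat *heat;
	size_t size = offsetof(typeof(*heat), lh_heat[OBD_HEAT_COUNT]);

	OBD_ALLOC(heat, size);
	if (heat == NULL)
		return -ENOMEM;
	heat->lh_count = OBD_HEAT_COUNT;
	ll_heat_get(inode, heat);
	/* heat->lh_heat[0..OBD_HEAT_COUNT - 1] now hold the values */
	OBD_FREE(heat, size);
#endif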
3520 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3522 struct inode *inode = file_inode(file);
3523 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3527 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3528 PFID(ll_inode2fid(inode)), inode, cmd);
3529 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3531 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3532 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3536 case LL_IOC_GETFLAGS:
3537 /* Get the current value of the file flags */
3538 return put_user(fd->fd_flags, (int __user *)arg);
3539 case LL_IOC_SETFLAGS:
3540 case LL_IOC_CLRFLAGS:
3541 /* Set or clear specific file flags */
3542 /* XXX This probably needs checks to ensure the flags are
3543 * not abused, and to handle any flag side effects.
3545 if (get_user(flags, (int __user *) arg))
3548 if (cmd == LL_IOC_SETFLAGS) {
3549 if ((flags & LL_FILE_IGNORE_LOCK) &&
3550 !(file->f_flags & O_DIRECT)) {
3551 CERROR("%s: unable to disable locking on "
3552 "non-O_DIRECT file\n", current->comm);
3556 fd->fd_flags |= flags;
3558 fd->fd_flags &= ~flags;
3561 case LL_IOC_LOV_SETSTRIPE:
3562 case LL_IOC_LOV_SETSTRIPE_NEW:
3563 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3564 case LL_IOC_LOV_SETEA:
3565 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3566 case LL_IOC_LOV_SWAP_LAYOUTS: {
3568 struct lustre_swap_layouts lsl;
3570 if (copy_from_user(&lsl, (char __user *)arg,
3571 sizeof(struct lustre_swap_layouts)))
3574 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3577 file2 = fget(lsl.sl_fd);
3581 /* O_WRONLY or O_RDWR */
3582 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3583 GOTO(out, rc = -EPERM);
3585 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3586 struct inode *inode2;
3587 struct ll_inode_info *lli;
3588 struct obd_client_handle *och = NULL;
3590 lli = ll_i2info(inode);
3591 mutex_lock(&lli->lli_och_mutex);
3592 if (fd->fd_lease_och != NULL) {
3593 och = fd->fd_lease_och;
3594 fd->fd_lease_och = NULL;
3596 mutex_unlock(&lli->lli_och_mutex);
3598 GOTO(out, rc = -ENOLCK);
3599 inode2 = file_inode(file2);
3600 rc = ll_swap_layouts_close(och, inode, inode2);
3602 rc = ll_swap_layouts(file, file2, &lsl);
3608 case LL_IOC_LOV_GETSTRIPE:
3609 case LL_IOC_LOV_GETSTRIPE_NEW:
3610 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3611 case FS_IOC_GETFLAGS:
3612 case FS_IOC_SETFLAGS:
3613 RETURN(ll_iocontrol(inode, file, cmd, arg));
3614 case FSFILT_IOC_GETVERSION:
3615 case FS_IOC_GETVERSION:
3616 RETURN(put_user(inode->i_generation, (int __user *)arg));
3617 /* We need to special case any other ioctls we want to handle,
3618 * to send them to the MDS/OST as appropriate and to properly
3619 * network encode the arg field. */
3620 case FS_IOC_SETVERSION:
3623 case LL_IOC_GROUP_LOCK:
3624 RETURN(ll_get_grouplock(inode, file, arg));
3625 case LL_IOC_GROUP_UNLOCK:
3626 RETURN(ll_put_grouplock(inode, file, arg));
3627 case IOC_OBD_STATFS:
3628 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3630 case LL_IOC_FLUSHCTX:
3631 RETURN(ll_flush_ctx(inode));
3632 case LL_IOC_PATH2FID: {
3633 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3634 sizeof(struct lu_fid)))
3639 case LL_IOC_GETPARENT:
3640 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3642 case OBD_IOC_FID2PATH:
3643 RETURN(ll_fid2path(inode, (void __user *)arg));
3644 case LL_IOC_DATA_VERSION: {
3645 struct ioc_data_version idv;
3648 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3651 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3652 rc = ll_ioc_data_version(inode, &idv);
3655 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3661 case LL_IOC_GET_MDTIDX: {
3664 mdtidx = ll_get_mdt_idx(inode);
3668 if (put_user((int)mdtidx, (int __user *)arg))
3673 case OBD_IOC_GETDTNAME:
3674 case OBD_IOC_GETMDNAME:
3675 RETURN(ll_get_obd_name(inode, cmd, arg));
3676 case LL_IOC_HSM_STATE_GET: {
3677 struct md_op_data *op_data;
3678 struct hsm_user_state *hus;
3685 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3686 LUSTRE_OPC_ANY, hus);
3687 if (IS_ERR(op_data)) {
3689 RETURN(PTR_ERR(op_data));
3692 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3695 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3698 ll_finish_md_op_data(op_data);
3702 case LL_IOC_HSM_STATE_SET: {
3703 struct hsm_state_set *hss;
3710 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3715 rc = ll_hsm_state_set(inode, hss);
3720 case LL_IOC_HSM_ACTION: {
3721 struct md_op_data *op_data;
3722 struct hsm_current_action *hca;
3729 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3730 LUSTRE_OPC_ANY, hca);
3731 if (IS_ERR(op_data)) {
3733 RETURN(PTR_ERR(op_data));
3736 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3739 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3742 ll_finish_md_op_data(op_data);
3746 case LL_IOC_SET_LEASE_OLD: {
3747 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3749 RETURN(ll_file_set_lease(file, &ioc, 0));
3751 case LL_IOC_SET_LEASE: {
3752 struct ll_ioc_lease ioc;
3754 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3757 RETURN(ll_file_set_lease(file, &ioc, arg));
3759 case LL_IOC_GET_LEASE: {
3760 struct ll_inode_info *lli = ll_i2info(inode);
3761 struct ldlm_lock *lock = NULL;
3764 mutex_lock(&lli->lli_och_mutex);
3765 if (fd->fd_lease_och != NULL) {
3766 struct obd_client_handle *och = fd->fd_lease_och;
3768 lock = ldlm_handle2lock(&och->och_lease_handle);
3770 lock_res_and_lock(lock);
3771 if (!ldlm_is_cancel(lock))
3772 fmode = och->och_flags;
3774 unlock_res_and_lock(lock);
3775 LDLM_LOCK_PUT(lock);
3778 mutex_unlock(&lli->lli_och_mutex);
3780 RETURN(ll_lease_type_from_fmode(fmode));
3782 case LL_IOC_HSM_IMPORT: {
3783 struct hsm_user_import *hui;
3789 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3794 rc = ll_hsm_import(inode, file, hui);
3799 case LL_IOC_FUTIMES_3: {
3800 struct ll_futimes_3 lfu;
3802 if (copy_from_user(&lfu,
3803 (const struct ll_futimes_3 __user *)arg,
3807 RETURN(ll_file_futimes_3(file, &lfu));
3809 case LL_IOC_LADVISE: {
3810 struct llapi_ladvise_hdr *k_ladvise_hdr;
3811 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3814 int alloc_size = sizeof(*k_ladvise_hdr);
3817 u_ladvise_hdr = (void __user *)arg;
3818 OBD_ALLOC_PTR(k_ladvise_hdr);
3819 if (k_ladvise_hdr == NULL)
3822 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3823 GOTO(out_ladvise, rc = -EFAULT);
3825 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3826 k_ladvise_hdr->lah_count < 1)
3827 GOTO(out_ladvise, rc = -EINVAL);
3829 num_advise = k_ladvise_hdr->lah_count;
3830 if (num_advise >= LAH_COUNT_MAX)
3831 GOTO(out_ladvise, rc = -EFBIG);
3833 OBD_FREE_PTR(k_ladvise_hdr);
3834 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3835 lah_advise[num_advise]);
3836 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3837 if (k_ladvise_hdr == NULL)
3841 * TODO: submit multiple advice entries to one server in a single RPC
3843 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3844 GOTO(out_ladvise, rc = -EFAULT);
3846 for (i = 0; i < num_advise; i++) {
3847 struct llapi_lu_ladvise *k_ladvise =
3848 &k_ladvise_hdr->lah_advise[i];
3849 struct llapi_lu_ladvise __user *u_ladvise =
3850 &u_ladvise_hdr->lah_advise[i];
3852 rc = ll_ladvise_sanity(inode, k_ladvise);
3854 GOTO(out_ladvise, rc);
3856 switch (k_ladvise->lla_advice) {
3857 case LU_LADVISE_LOCKNOEXPAND:
3858 rc = ll_lock_noexpand(file,
3859 k_ladvise->lla_peradvice_flags);
3860 GOTO(out_ladvise, rc);
3861 case LU_LADVISE_LOCKAHEAD:
3863 rc = ll_file_lock_ahead(file, k_ladvise);
3866 GOTO(out_ladvise, rc);
3869 &u_ladvise->lla_lockahead_result))
3870 GOTO(out_ladvise, rc = -EFAULT);
3873 rc = ll_ladvise(inode, file,
3874 k_ladvise_hdr->lah_flags,
3877 GOTO(out_ladvise, rc);
3884 OBD_FREE(k_ladvise_hdr, alloc_size);
3887 case LL_IOC_FLR_SET_MIRROR: {
3888 /* mirror I/O must be direct to avoid polluting page cache
3890 if (!(file->f_flags & O_DIRECT))
3893 fd->fd_designated_mirror = (__u32)arg;
3896 case LL_IOC_FSGETXATTR:
3897 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3898 case LL_IOC_FSSETXATTR:
3899 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3901 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3902 case LL_IOC_HEAT_GET: {
3903 struct lu_heat uheat;
3904 struct lu_heat *heat;
3907 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3910 if (uheat.lh_count > OBD_HEAT_COUNT)
3911 uheat.lh_count = OBD_HEAT_COUNT;
3913 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3914 OBD_ALLOC(heat, size);
3918 heat->lh_count = uheat.lh_count;
3919 ll_heat_get(inode, heat);
3920 rc = copy_to_user((char __user *)arg, heat, size);
3921 OBD_FREE(heat, size);
3922 RETURN(rc ? -EFAULT : 0);
3924 case LL_IOC_HEAT_SET: {
3927 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3930 rc = ll_heat_set(inode, flags);
3933 case LL_IOC_PCC_DETACH: {
3934 struct lu_pcc_detach *detach;
3936 OBD_ALLOC_PTR(detach);
3940 if (copy_from_user(detach,
3941 (const struct lu_pcc_detach __user *)arg,
3943 GOTO(out_detach_free, rc = -EFAULT);
3945 if (!S_ISREG(inode->i_mode))
3946 GOTO(out_detach_free, rc = -EINVAL);
3948 if (!inode_owner_or_capable(inode))
3949 GOTO(out_detach_free, rc = -EPERM);
3951 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3953 OBD_FREE_PTR(detach);
3956 case LL_IOC_PCC_STATE: {
3957 struct lu_pcc_state __user *ustate =
3958 (struct lu_pcc_state __user *)arg;
3959 struct lu_pcc_state *state;
3961 OBD_ALLOC_PTR(state);
3965 if (copy_from_user(state, ustate, sizeof(*state)))
3966 GOTO(out_state, rc = -EFAULT);
3968 rc = pcc_ioctl_state(file, inode, state);
3970 GOTO(out_state, rc);
3972 if (copy_to_user(ustate, state, sizeof(*state)))
3973 GOTO(out_state, rc = -EFAULT);
3976 OBD_FREE_PTR(state);
3980 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3981 (void __user *)arg));
3985 #ifndef HAVE_FILE_LLSEEK_SIZE
3986 static inline loff_t
3987 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3989 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3991 if (offset > maxsize)
3994 if (offset != file->f_pos) {
3995 file->f_pos = offset;
3996 file->f_version = 0;
4002 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
4003 loff_t maxsize, loff_t eof)
4005 struct inode *inode = file_inode(file);
4013 * Here we special-case the lseek(fd, 0, SEEK_CUR)
4014 * position-querying operation. Avoid rewriting the "same"
4015 * f_pos value back to the file because a concurrent read(),
4016 * write() or lseek() might have altered it
4021 * f_lock protects against read/modify/write race with other
4022 * SEEK_CURs. Note that parallel writes and reads behave
4026 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4027 inode_unlock(inode);
4031 * In the generic case the entire file is data, so as long as
4032 * offset isn't at the end of the file then the offset is data.
4039 * There is a virtual hole at the end of the file, so as long as
4040 * offset isn't i_size or larger, return i_size.
4048 return llseek_execute(file, offset, maxsize);
4052 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4054 struct inode *inode = file_inode(file);
4055 loff_t retval, eof = 0;
4056 ktime_t kstart = ktime_get();
4059 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4060 (origin == SEEK_CUR) ? file->f_pos : 0);
4061 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4062 PFID(ll_inode2fid(inode)), inode, retval, retval,
4065 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4066 retval = ll_glimpse_size(inode);
4069 eof = i_size_read(inode);
4072 retval = ll_generic_file_llseek_size(file, offset, origin,
4073 ll_file_maxbytes(inode), eof);
4075 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4076 ktime_us_delta(ktime_get(), kstart));
4080 static int ll_flush(struct file *file, fl_owner_t id)
4082 struct inode *inode = file_inode(file);
4083 struct ll_inode_info *lli = ll_i2info(inode);
4084 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4087 LASSERT(!S_ISDIR(inode->i_mode));
4089 /* catch async errors that were recorded back when async writeback
4090 * failed for pages in this mapping. */
4091 rc = lli->lli_async_rc;
4092 lli->lli_async_rc = 0;
4093 if (lli->lli_clob != NULL) {
4094 err = lov_read_and_clear_async_rc(lli->lli_clob);
4099 /* The application has already been told about the write failure;
4100 * do not report it again. */
4101 if (fd->fd_write_failed)
4103 return rc ? -EIO : 0;
4107 * Called to make sure a portion of the file has been written out.
4108 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
4110 * Return how many pages have been written.
4112 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4113 enum cl_fsync_mode mode, int ignore_layout)
4117 struct cl_fsync_io *fio;
4122 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4123 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4126 env = cl_env_get(&refcheck);
4128 RETURN(PTR_ERR(env));
4130 io = vvp_env_thread_io(env);
4131 io->ci_obj = ll_i2info(inode)->lli_clob;
4132 io->ci_ignore_layout = ignore_layout;
4134 /* initialize parameters for sync */
4135 fio = &io->u.ci_fsync;
4136 fio->fi_start = start;
4138 fio->fi_fid = ll_inode2fid(inode);
4139 fio->fi_mode = mode;
4140 fio->fi_nr_written = 0;
4142 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4143 result = cl_io_loop(env, io);
4145 result = io->ci_result;
4147 result = fio->fi_nr_written;
4148 cl_io_fini(env, io);
4149 cl_env_put(env, &refcheck);
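/*
 * Illustrative sketch only, not part of the original source: flushing
 * a whole file and forcing OST_SYNC with the helper above, as the
 * fsync path does. OBD_OBJECT_EOF covers the full file range.
 * Guarded out so it never builds.
 */
#if 0
	int nr_written;

	nr_written = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
					CL_FSYNC_ALL, 0);
	if (nr_written < 0)
		rc = nr_written;	/* an error, not a page count */
#endif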
4155 * When dentry is provided (the 'else' case), file_dentry() may be
4156 * null and dentry must be used directly rather than pulled from
4157 * file_dentry() as is done otherwise.
4160 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4162 struct dentry *dentry = file_dentry(file);
4163 struct inode *inode = dentry->d_inode;
4164 struct ll_inode_info *lli = ll_i2info(inode);
4165 struct ptlrpc_request *req;
4166 ktime_t kstart = ktime_get();
4171 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4173 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4175 /* fsync's caller has already called _fdata{sync,write}, we want
4176 * that IO to finish before calling the osc and mdc sync methods */
4177 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4180 /* catch async errors that were recorded back when async writeback
4181 * failed for pages in this mapping. */
4182 if (!S_ISDIR(inode->i_mode)) {
4183 err = lli->lli_async_rc;
4184 lli->lli_async_rc = 0;
4187 if (lli->lli_clob != NULL) {
4188 err = lov_read_and_clear_async_rc(lli->lli_clob);
4194 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4198 ptlrpc_req_finished(req);
4200 if (S_ISREG(inode->i_mode)) {
4201 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4204 /* Sync metadata on MDT first, and then sync the cached data
4207 err = pcc_fsync(file, start, end, datasync, &cached);
4209 err = cl_sync_file_range(inode, start, end,
4211 if (rc == 0 && err < 0)
4214 fd->fd_write_failed = true;
4216 fd->fd_write_failed = false;
4219 inode_unlock(inode);
4222 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4223 ktime_us_delta(ktime_get(), kstart));
4228 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4230 struct inode *inode = file_inode(file);
4231 struct ll_sb_info *sbi = ll_i2sbi(inode);
4232 struct ldlm_enqueue_info einfo = {
4233 .ei_type = LDLM_FLOCK,
4234 .ei_cb_cp = ldlm_flock_completion_ast,
4235 .ei_cbdata = file_lock,
4237 struct md_op_data *op_data;
4238 struct lustre_handle lockh = { 0 };
4239 union ldlm_policy_data flock = { { 0 } };
4240 int fl_type = file_lock->fl_type;
4241 ktime_t kstart = ktime_get();
4247 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4248 PFID(ll_inode2fid(inode)), file_lock);
4250 if (file_lock->fl_flags & FL_FLOCK) {
4251 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4252 /* flocks are whole-file locks */
4253 flock.l_flock.end = OFFSET_MAX;
4254 /* For flocks, the owner is determined by the local file descriptor */
4255 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4256 } else if (file_lock->fl_flags & FL_POSIX) {
4257 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4258 flock.l_flock.start = file_lock->fl_start;
4259 flock.l_flock.end = file_lock->fl_end;
4263 flock.l_flock.pid = file_lock->fl_pid;
4265 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4266 /* Somewhat ugly workaround for svc lockd.
4267 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4268 * that the fl_owner is the same (which it always is between lockd
4269 * processes on the local node) and then compares the pid.
4270 * As such we assign the pid to the owner field to make it all work;
4271 * a conflict with normal locks is unlikely since the pid space and
4272 * the pointer space for current->files do not intersect */
4273 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4274 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4279 einfo.ei_mode = LCK_PR;
4282 /* An unlock request may or may not have any relation to
4283 * existing locks so we may not be able to pass a lock handle
4284 * via a normal ldlm_lock_cancel() request. The request may even
4285 * unlock a byte range in the middle of an existing lock. In
4286 * order to process an unlock request we need all of the same
4287 * information that is given with a normal read or write record
4288 * lock request. To avoid creating another ldlm unlock (cancel)
4289 * message we'll treat a LCK_NL flock request as an unlock. */
4290 einfo.ei_mode = LCK_NL;
4293 einfo.ei_mode = LCK_PW;
4296 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4311 flags = LDLM_FL_BLOCK_NOWAIT;
4317 flags = LDLM_FL_TEST_LOCK;
4320 CERROR("unknown fcntl lock command: %d\n", cmd);
4324 /* Save the old mode so that if the mode in the lock changes we
4325 * can decrement the appropriate reader or writer refcount. */
4326 file_lock->fl_type = einfo.ei_mode;
4328 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4329 LUSTRE_OPC_ANY, NULL);
4330 if (IS_ERR(op_data))
4331 RETURN(PTR_ERR(op_data));
4333 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4334 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4335 flock.l_flock.pid, flags, einfo.ei_mode,
4336 flock.l_flock.start, flock.l_flock.end);
4338 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4341 /* Restore the file lock type if not TEST lock. */
4342 if (!(flags & LDLM_FL_TEST_LOCK))
4343 file_lock->fl_type = fl_type;
4345 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4346 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4347 !(flags & LDLM_FL_TEST_LOCK))
4348 rc2 = locks_lock_file_wait(file, file_lock);
4350 if ((file_lock->fl_flags & FL_FLOCK) &&
4351 (rc == 0 || file_lock->fl_type == F_UNLCK))
4352 rc2 = flock_lock_file_wait(file, file_lock);
4353 if ((file_lock->fl_flags & FL_POSIX) &&
4354 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4355 !(flags & LDLM_FL_TEST_LOCK))
4356 rc2 = posix_lock_file_wait(file, file_lock);
4357 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4359 if (rc2 && file_lock->fl_type != F_UNLCK) {
4360 einfo.ei_mode = LCK_NL;
4361 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4366 ll_finish_md_op_data(op_data);
4369 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4370 ktime_us_delta(ktime_get(), kstart));
4374 int ll_get_fid_by_name(struct inode *parent, const char *name,
4375 int namelen, struct lu_fid *fid,
4376 struct inode **inode)
4378 struct md_op_data *op_data = NULL;
4379 struct mdt_body *body;
4380 struct ptlrpc_request *req;
4384 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4385 LUSTRE_OPC_ANY, NULL);
4386 if (IS_ERR(op_data))
4387 RETURN(PTR_ERR(op_data));
4389 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4390 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4391 ll_finish_md_op_data(op_data);
4395 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4397 GOTO(out_req, rc = -EFAULT);
4399 *fid = body->mbo_fid1;
4402 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4404 ptlrpc_req_finished(req);
4408 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4411 struct dentry *dchild = NULL;
4412 struct inode *child_inode = NULL;
4413 struct md_op_data *op_data;
4414 struct ptlrpc_request *request = NULL;
4415 struct obd_client_handle *och = NULL;
4417 struct mdt_body *body;
4418 __u64 data_version = 0;
4419 size_t namelen = strlen(name);
4420 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4424 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4425 PFID(ll_inode2fid(parent)), name,
4426 lum->lum_stripe_offset, lum->lum_stripe_count);
4428 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4429 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4430 lustre_swab_lmv_user_md(lum);
4432 /* Get child FID first */
4433 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4436 dchild = d_lookup(file_dentry(file), &qstr);
4438 if (dchild->d_inode)
4439 child_inode = igrab(dchild->d_inode);
4444 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4453 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4454 OBD_CONNECT2_DIR_MIGRATE)) {
4455 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4456 ll_dir_striped(child_inode)) {
4457 CERROR("%s: MDT doesn't support stripe directory "
4458 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4459 GOTO(out_iput, rc = -EOPNOTSUPP);
4464 * lfs migrate command needs to be blocked on the client
4465 * by checking the migrate FID against the FID of the
4468 if (child_inode == parent->i_sb->s_root->d_inode)
4469 GOTO(out_iput, rc = -EINVAL);
4471 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4472 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4473 if (IS_ERR(op_data))
4474 GOTO(out_iput, rc = PTR_ERR(op_data));
4476 inode_lock(child_inode);
4477 op_data->op_fid3 = *ll_inode2fid(child_inode);
4478 if (!fid_is_sane(&op_data->op_fid3)) {
4479 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4480 ll_i2sbi(parent)->ll_fsname, name,
4481 PFID(&op_data->op_fid3));
4482 GOTO(out_unlock, rc = -EINVAL);
4485 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4486 op_data->op_data = lum;
4487 op_data->op_data_size = lumlen;
4490 if (S_ISREG(child_inode->i_mode)) {
4491 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4495 GOTO(out_unlock, rc);
4498 rc = ll_data_version(child_inode, &data_version,
4501 GOTO(out_close, rc);
4503 op_data->op_open_handle = och->och_open_handle;
4504 op_data->op_data_version = data_version;
4505 op_data->op_lease_handle = och->och_lease_handle;
4506 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4508 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4509 och->och_mod->mod_open_req->rq_replay = 0;
4510 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4513 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4514 name, namelen, &request);
4516 LASSERT(request != NULL);
4517 ll_update_times(request, parent);
4520 if (rc == 0 || rc == -EAGAIN) {
4521 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4522 LASSERT(body != NULL);
4524 /* If the server does release the layout lock, then we clean up
4525 * the client och here; otherwise release it in out_close: */
4526 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4527 obd_mod_put(och->och_mod);
4528 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4530 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4536 if (request != NULL) {
4537 ptlrpc_req_finished(request);
4541 /* Try again if the lease has been cancelled. */
4542 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4547 ll_lease_close(och, child_inode, NULL);
4549 clear_nlink(child_inode);
4551 inode_unlock(child_inode);
4552 ll_finish_md_op_data(op_data);
4559 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4561 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4565 * In order to avoid a flood of warning messages, only print one message
4566 * per file. The overall message rate on the client is also limited
4567 * by CDEBUG_LIMIT.
4569 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4570 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4571 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4572 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4578 * Test if some locks matching bits and l_req_mode are acquired
4579 * - the bits can be spread across different locks
4580 * - if found, clear the common lock bits in *bits
4581 * - the bits that are not found are kept in *bits
4583 * \param bits [in,out] searched lock bits
4584 * \param l_req_mode [in] searched lock mode
4585 * \retval boolean, true iff all bits are found
4587 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4589 struct lustre_handle lockh;
4590 union ldlm_policy_data policy;
4591 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4592 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4601 fid = &ll_i2info(inode)->lli_fid;
4602 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4603 ldlm_lockname[mode]);
4605 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4606 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4607 policy.l_inodebits.bits = *bits & (1 << i);
4608 if (policy.l_inodebits.bits == 0)
4611 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4612 &policy, mode, &lockh)) {
4613 struct ldlm_lock *lock;
4615 lock = ldlm_handle2lock(&lockh);
4618 ~(lock->l_policy_data.l_inodebits.bits);
4619 LDLM_LOCK_PUT(lock);
4621 *bits &= ~policy.l_inodebits.bits;
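/*
 * Illustrative sketch only, not part of the original source: checking
 * whether LOOKUP and UPDATE ibits are already covered by cached locks
 * before issuing a getattr, in the spirit of the revalidate paths
 * below. Guarded out so it never builds.
 */
#if 0
	__u64 ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;

	if (ll_have_md_lock(inode, &ibits, LCK_MINMODE))
		return 0;	/* all requested bits were found */
	/* on return, ibits holds only the bits that were NOT found */
#endif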
4628 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4629 struct lustre_handle *lockh, __u64 flags,
4630 enum ldlm_mode mode)
4632 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4637 fid = &ll_i2info(inode)->lli_fid;
4638 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4640 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4641 fid, LDLM_IBITS, &policy, mode, lockh);
4646 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4648 /* Already unlinked. Just update nlink and return success */
4649 if (rc == -ENOENT) {
4651 /* If it is a striped directory and there is a bad stripe,
4652 * let's revalidate the dentry again, instead of returning
4654 if (ll_dir_striped(inode))
4657 /* This path cannot be hit for regular files, except in
4658 * the case of obscure races, so there is no need to validate
4660 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4662 } else if (rc != 0) {
4663 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4664 "%s: revalidate FID "DFID" error: rc = %d\n",
4665 ll_i2sbi(inode)->ll_fsname,
4666 PFID(ll_inode2fid(inode)), rc);
4672 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4674 struct inode *inode = dentry->d_inode;
4675 struct obd_export *exp = ll_i2mdexp(inode);
4676 struct lookup_intent oit = {
4679 struct ptlrpc_request *req = NULL;
4680 struct md_op_data *op_data;
4684 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4685 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4687 /* Call getattr by fid, so do not provide name at all. */
4688 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4689 LUSTRE_OPC_ANY, NULL);
4690 if (IS_ERR(op_data))
4691 RETURN(PTR_ERR(op_data));
4693 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4694 ll_finish_md_op_data(op_data);
4696 rc = ll_inode_revalidate_fini(inode, rc);
4700 rc = ll_revalidate_it_finish(req, &oit, dentry);
4702 ll_intent_release(&oit);
4706 /* Unlinked? Unhash the dentry, so it is not picked up later by
4707 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4708 * here, in order to preserve get_cwd functionality on 2.6.
4710 if (!dentry->d_inode->i_nlink) {
4711 spin_lock(&inode->i_lock);
4712 d_lustre_invalidate(dentry, 0);
4713 spin_unlock(&inode->i_lock);
4716 ll_lookup_finish_locks(&oit, dentry);
4718 ptlrpc_req_finished(req);
4723 static int ll_merge_md_attr(struct inode *inode)
4725 struct ll_inode_info *lli = ll_i2info(inode);
4726 struct cl_attr attr = { 0 };
4729 LASSERT(lli->lli_lsm_md != NULL);
4731 if (!lmv_dir_striped(lli->lli_lsm_md))
4734 down_read(&lli->lli_lsm_sem);
4735 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4736 &attr, ll_md_blocking_ast);
4737 up_read(&lli->lli_lsm_sem);
4741 set_nlink(inode, attr.cat_nlink);
4742 inode->i_blocks = attr.cat_blocks;
4743 i_size_write(inode, attr.cat_size);
4745 ll_i2info(inode)->lli_atime = attr.cat_atime;
4746 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4747 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4752 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4754 struct inode *inode = de->d_inode;
4755 struct ll_sb_info *sbi = ll_i2sbi(inode);
4756 struct ll_inode_info *lli = ll_i2info(inode);
4757 ktime_t kstart = ktime_get();
4760 rc = ll_inode_revalidate(de, IT_GETATTR);
4764 if (S_ISREG(inode->i_mode)) {
4767 rc = pcc_inode_getattr(inode, &cached);
4768 if (cached && rc < 0)
4771 /* In case of restore, the MDT has the right size and has
4772 * already sent it back without granting the layout lock,
4773 * inode is up-to-date so glimpse is useless.
4774 * Also to glimpse we need the layout, in case of a running
4775 * restore the MDT holds the layout lock so the glimpse will
4776 * block up to the end of restore (getattr will block)
4778 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4779 rc = ll_glimpse_size(inode);
4784 /* If the object isn't a regular file then don't validate its size. */
4785 if (ll_dir_striped(inode)) {
4786 rc = ll_merge_md_attr(inode);
4791 inode->i_atime.tv_sec = lli->lli_atime;
4792 inode->i_mtime.tv_sec = lli->lli_mtime;
4793 inode->i_ctime.tv_sec = lli->lli_ctime;
4796 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4798 if (ll_need_32bit_api(sbi)) {
4799 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4800 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4801 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4803 stat->ino = inode->i_ino;
4804 stat->dev = inode->i_sb->s_dev;
4805 stat->rdev = inode->i_rdev;
4808 stat->mode = inode->i_mode;
4809 stat->uid = inode->i_uid;
4810 stat->gid = inode->i_gid;
4811 stat->atime = inode->i_atime;
4812 stat->mtime = inode->i_mtime;
4813 stat->ctime = inode->i_ctime;
4814 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4816 stat->nlink = inode->i_nlink;
4817 stat->size = i_size_read(inode);
4818 stat->blocks = inode->i_blocks;
4820 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4821 ktime_us_delta(ktime_get(), kstart));
4826 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4827 int ll_getattr(const struct path *path, struct kstat *stat,
4828 u32 request_mask, unsigned int flags)
4830 struct dentry *de = path->dentry;
4832 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4835 return ll_getattr_dentry(de, stat);
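/*
 * Both ll_getattr() variants above (the enhanced struct-path form and the
 * older vfsmount/dentry form) funnel into ll_getattr_dentry().
 */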
4838 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4839 __u64 start, __u64 len)
4843 struct fiemap *fiemap;
4844 unsigned int extent_count = fieinfo->fi_extents_max;
4846 num_bytes = sizeof(*fiemap) + (extent_count *
4847 sizeof(struct fiemap_extent));
4848 OBD_ALLOC_LARGE(fiemap, num_bytes);
4853 fiemap->fm_flags = fieinfo->fi_flags;
4854 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4855 fiemap->fm_start = start;
4856 fiemap->fm_length = len;
4857 if (extent_count > 0 &&
4858 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4859 sizeof(struct fiemap_extent)) != 0)
4860 GOTO(out, rc = -EFAULT);
4862 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4864 fieinfo->fi_flags = fiemap->fm_flags;
4865 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4866 if (extent_count > 0 &&
4867 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4868 fiemap->fm_mapped_extents *
4869 sizeof(struct fiemap_extent)) != 0)
4870 GOTO(out, rc = -EFAULT);
4872 OBD_FREE_LARGE(fiemap, num_bytes);
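/*
 * Sizing sketch (illustrative; sizes per the Linux fiemap UAPI, where the
 * struct fiemap header is 32 bytes and each struct fiemap_extent is 56
 * bytes): a request for 1024 extents allocates
 *
 *	num_bytes = sizeof(struct fiemap) +
 *		    1024 * sizeof(struct fiemap_extent);
 *
 * i.e. 32 + 1024 * 56 = 57376 bytes, hence OBD_ALLOC_LARGE() above.
 */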
4876 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4878 struct ll_inode_info *lli = ll_i2info(inode);
4879 struct posix_acl *acl = NULL;
4882 spin_lock(&lli->lli_lock);
4883 /* VFS' acl_permission_check->check_acl will release the refcount */
4884 acl = posix_acl_dup(lli->lli_posix_acl);
4885 spin_unlock(&lli->lli_lock);
4890 #ifdef HAVE_IOP_SET_ACL
4891 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
4892 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4894 struct ll_sb_info *sbi = ll_i2sbi(inode);
4895 struct ptlrpc_request *req = NULL;
4896 const char *name = NULL;
4898 size_t value_size = 0;
4903 case ACL_TYPE_ACCESS:
4904 name = XATTR_NAME_POSIX_ACL_ACCESS;
4906 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4909 case ACL_TYPE_DEFAULT:
4910 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4911 if (!S_ISDIR(inode->i_mode))
4912 rc = acl ? -EACCES : 0;
4923 value_size = posix_acl_xattr_size(acl->a_count);
4924 value = kmalloc(value_size, GFP_NOFS);
4926 GOTO(out, rc = -ENOMEM);
4928 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4930 GOTO(out_value, rc);
4933 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4934 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4935 name, value, value_size, 0, 0, &req);
4937 ptlrpc_req_finished(req);
4942 forget_cached_acl(inode, type);
4944 set_cached_acl(inode, type, acl);
4947 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4948 #endif /* HAVE_IOP_SET_ACL */
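/*
 * Sizing sketch for the xattr value used by ll_set_acl() above (assuming
 * the generic posix_acl_xattr layout): a header followed by one entry per
 * ACL entry, e.g. for a four-entry access ACL:
 *
 *	size_t sz = posix_acl_xattr_size(4);
 *
 * which equals sizeof(struct posix_acl_xattr_header) +
 * 4 * sizeof(struct posix_acl_xattr_entry).
 */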
4950 int ll_inode_permission(struct inode *inode, int mask)
4953 struct ll_sb_info *sbi;
4954 struct root_squash_info *squash;
4955 struct cred *cred = NULL;
4956 const struct cred *old_cred = NULL;
4958 bool squash_id = false;
4959 ktime_t kstart = ktime_get();
4962 if (mask & MAY_NOT_BLOCK)
4965 /* as the root inode is NOT validated in lookup operations,
4966 * we need to do it here before the permission check. */
4968 if (inode == inode->i_sb->s_root->d_inode) {
4969 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4974 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4975 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4977 /* squash fsuid/fsgid if needed */
4978 sbi = ll_i2sbi(inode);
4979 squash = &sbi->ll_squash;
4980 if (unlikely(squash->rsi_uid != 0 &&
4981 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4982 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4986 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4987 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4988 squash->rsi_uid, squash->rsi_gid);
4990 /* update the current process's credentials
4991 * and FS capabilities */
4992 cred = prepare_creds();
4996 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4997 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4998 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4999 if ((1 << cap) & CFS_CAP_FS_MASK)
5000 cap_lower(cred->cap_effective, cap);
5002 old_cred = override_creds(cred);
5005 rc = generic_permission(inode, mask);
5006 /* restore the current process's credentials and FS capabilities */
5008 revert_creds(old_cred);
5013 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5014 ktime_us_delta(ktime_get(), kstart));
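/*
 * The root-squash path above follows the kernel's credential override
 * pattern; a condensed sketch (error handling elided):
 *
 *	struct cred *cred = prepare_creds();
 *	const struct cred *old_cred;
 *
 *	cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
 *	cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
 *	old_cred = override_creds(cred);
 *	rc = generic_permission(inode, mask);
 *	revert_creds(old_cred);
 *	put_cred(cred);
 */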
5019 /* -o localflock - only provides locally consistent flock locks */
5020 struct file_operations ll_file_operations = {
5021 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5022 # ifdef HAVE_SYNC_READ_WRITE
5023 .read = new_sync_read,
5024 .write = new_sync_write,
5026 .read_iter = ll_file_read_iter,
5027 .write_iter = ll_file_write_iter,
5028 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5029 .read = ll_file_read,
5030 .aio_read = ll_file_aio_read,
5031 .write = ll_file_write,
5032 .aio_write = ll_file_aio_write,
5033 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5034 .unlocked_ioctl = ll_file_ioctl,
5035 .open = ll_file_open,
5036 .release = ll_file_release,
5037 .mmap = ll_file_mmap,
5038 .llseek = ll_file_seek,
5039 .splice_read = ll_file_splice_read,
5044 struct file_operations ll_file_operations_flock = {
5045 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5046 # ifdef HAVE_SYNC_READ_WRITE
5047 .read = new_sync_read,
5048 .write = new_sync_write,
5049 # endif /* HAVE_SYNC_READ_WRITE */
5050 .read_iter = ll_file_read_iter,
5051 .write_iter = ll_file_write_iter,
5052 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5053 .read = ll_file_read,
5054 .aio_read = ll_file_aio_read,
5055 .write = ll_file_write,
5056 .aio_write = ll_file_aio_write,
5057 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5058 .unlocked_ioctl = ll_file_ioctl,
5059 .open = ll_file_open,
5060 .release = ll_file_release,
5061 .mmap = ll_file_mmap,
5062 .llseek = ll_file_seek,
5063 .splice_read = ll_file_splice_read,
5066 .flock = ll_file_flock,
5067 .lock = ll_file_flock
5070 /* These are for -o noflock - to return ENOSYS on flock calls */
5071 struct file_operations ll_file_operations_noflock = {
5072 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5073 # ifdef HAVE_SYNC_READ_WRITE
5074 .read = new_sync_read,
5075 .write = new_sync_write,
5076 # endif /* HAVE_SYNC_READ_WRITE */
5077 .read_iter = ll_file_read_iter,
5078 .write_iter = ll_file_write_iter,
5079 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5080 .read = ll_file_read,
5081 .aio_read = ll_file_aio_read,
5082 .write = ll_file_write,
5083 .aio_write = ll_file_aio_write,
5084 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5085 .unlocked_ioctl = ll_file_ioctl,
5086 .open = ll_file_open,
5087 .release = ll_file_release,
5088 .mmap = ll_file_mmap,
5089 .llseek = ll_file_seek,
5090 .splice_read = ll_file_splice_read,
5093 .flock = ll_file_noflock,
5094 .lock = ll_file_noflock
5097 struct inode_operations ll_file_inode_operations = {
5098 .setattr = ll_setattr,
5099 .getattr = ll_getattr,
5100 .permission = ll_inode_permission,
5101 #ifdef HAVE_IOP_XATTR
5102 .setxattr = ll_setxattr,
5103 .getxattr = ll_getxattr,
5104 .removexattr = ll_removexattr,
5106 .listxattr = ll_listxattr,
5107 .fiemap = ll_fiemap,
5108 #ifdef HAVE_IOP_GET_ACL
5109 .get_acl = ll_get_acl,
5111 #ifdef HAVE_IOP_SET_ACL
5112 .set_acl = ll_set_acl,
5116 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5118 struct ll_inode_info *lli = ll_i2info(inode);
5119 struct cl_object *obj = lli->lli_clob;
5128 env = cl_env_get(&refcheck);
5130 RETURN(PTR_ERR(env));
5132 rc = cl_conf_set(env, lli->lli_clob, conf);
5136 if (conf->coc_opc == OBJECT_CONF_SET) {
5137 struct ldlm_lock *lock = conf->coc_lock;
5138 struct cl_layout cl = {
5142 LASSERT(lock != NULL);
5143 LASSERT(ldlm_has_layout(lock));
5145 /* the lock can only be allowed to match after the layout is
5146 * applied to the inode, otherwise a false layout would be
5147 * seen. Applying the layout should happen before dropping
5148 * the intent lock. */
5149 ldlm_lock_allow_match(lock);
5151 rc = cl_object_layout_get(env, obj, &cl);
5156 DFID": layout version change: %u -> %u\n",
5157 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5159 ll_layout_version_set(lli, cl.cl_layout_gen);
5163 cl_env_put(env, &refcheck);
5168 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5169 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5172 struct ll_sb_info *sbi = ll_i2sbi(inode);
5173 struct ptlrpc_request *req;
5180 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5181 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5182 lock->l_lvb_data, lock->l_lvb_len);
5184 if (lock->l_lvb_data != NULL)
5187 /* if the layout lock was granted right away, the layout is returned
5188 * within the DLM_LVB of the dlm reply; otherwise, if the lock was ever
5189 * blocked and then granted via a completion ast, we have to fetch the
5190 * layout here. Please note that we can't use the LVB buffer in the
5191 * completion AST because it is not large enough */
5192 rc = ll_get_default_mdsize(sbi, &lmmsize);
5196 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5197 XATTR_NAME_LOV, lmmsize, &req);
5200 GOTO(out, rc = 0); /* empty layout */
5207 if (lmmsize == 0) /* empty layout */
5210 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5212 GOTO(out, rc = -EFAULT);
5214 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5215 if (lvbdata == NULL)
5216 GOTO(out, rc = -ENOMEM);
5218 memcpy(lvbdata, lmm, lmmsize);
5219 lock_res_and_lock(lock);
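/* another thread may have attached an LVB concurrently; install our
 * copy only if none is present, otherwise free it below */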
5220 if (unlikely(lock->l_lvb_data == NULL)) {
5221 lock->l_lvb_type = LVB_T_LAYOUT;
5222 lock->l_lvb_data = lvbdata;
5223 lock->l_lvb_len = lmmsize;
5226 unlock_res_and_lock(lock);
5229 OBD_FREE_LARGE(lvbdata, lmmsize);
5234 ptlrpc_req_finished(req);
5239 * Apply the layout to the inode. The layout lock is held and will be released in this function.
5242 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5243 struct inode *inode)
5245 struct ll_inode_info *lli = ll_i2info(inode);
5246 struct ll_sb_info *sbi = ll_i2sbi(inode);
5247 struct ldlm_lock *lock;
5248 struct cl_object_conf conf;
5251 bool wait_layout = false;
5254 LASSERT(lustre_handle_is_used(lockh));
5256 lock = ldlm_handle2lock(lockh);
5257 LASSERT(lock != NULL);
5258 LASSERT(ldlm_has_layout(lock));
5260 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5261 PFID(&lli->lli_fid), inode);
5263 /* in case this is a cached lock, reinstate it with the new inode */
5264 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5266 lock_res_and_lock(lock);
5267 lvb_ready = ldlm_is_lvb_ready(lock);
5268 unlock_res_and_lock(lock);
5270 /* checking lvb_ready is racy but this is okay. The worst case is
5271 * that multiple processes may configure the file at the same time. */
5275 rc = ll_layout_fetch(inode, lock);
5279 /* for layout lock, lmm is stored in lock's lvb.
5280 * lvb_data is immutable if the lock is held so it's safe to access it
5283 * set the layout on the file. It is unlikely this will fail, as the
5284 * old layout was surely eliminated */
5285 memset(&conf, 0, sizeof conf);
5286 conf.coc_opc = OBJECT_CONF_SET;
5287 conf.coc_inode = inode;
5288 conf.coc_lock = lock;
5289 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5290 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5291 rc = ll_layout_conf(inode, &conf);
5293 /* refresh layout failed, need to wait */
5294 wait_layout = rc == -EBUSY;
5297 LDLM_LOCK_PUT(lock);
5298 ldlm_lock_decref(lockh, mode);
5300 /* wait for IO to complete if it's still being used. */
5302 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5303 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5305 memset(&conf, 0, sizeof conf);
5306 conf.coc_opc = OBJECT_CONF_WAIT;
5307 conf.coc_inode = inode;
5308 rc = ll_layout_conf(inode, &conf);
5312 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5313 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5319 * Issue layout intent RPC to MDS.
5320 * \param inode [in] file inode
5321 * \param intent [in] layout intent
5323 * \retval 0 on success
5324 * \retval < 0 error code
5326 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5328 struct ll_inode_info *lli = ll_i2info(inode);
5329 struct ll_sb_info *sbi = ll_i2sbi(inode);
5330 struct md_op_data *op_data;
5331 struct lookup_intent it;
5332 struct ptlrpc_request *req;
5336 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5337 0, 0, LUSTRE_OPC_ANY, NULL);
5338 if (IS_ERR(op_data))
5339 RETURN(PTR_ERR(op_data));
5341 op_data->op_data = intent;
5342 op_data->op_data_size = sizeof(*intent);
5344 memset(&it, 0, sizeof(it));
5345 it.it_op = IT_LAYOUT;
5346 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5347 intent->li_opc == LAYOUT_INTENT_TRUNC)
5348 it.it_flags = FMODE_WRITE;
5350 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5351 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5353 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5354 &ll_md_blocking_ast, 0);
5355 if (it.it_request != NULL)
5356 ptlrpc_req_finished(it.it_request);
5357 it.it_request = NULL;
5359 ll_finish_md_op_data(op_data);
5361 /* set lock data in case this is a new lock */
5363 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5365 ll_intent_drop_lock(&it);
5371 * This function checks if there exists a LAYOUT lock on the client side,
5372 * or enqueues it if it doesn't have one in cache.
5374 * This function does not hold the layout lock, so the lock may be revoked at
5375 * any time after it returns; operations that depend on the layout should then be redone.
5378 * This function should be called before lov_io_init() to get an up-to-date
5379 * layout version; the caller should save the version number, and after IO
5380 * is finished, call this function again to verify that the layout
5381 * was not changed during the IO.
5383 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5385 struct ll_inode_info *lli = ll_i2info(inode);
5386 struct ll_sb_info *sbi = ll_i2sbi(inode);
5387 struct lustre_handle lockh;
5388 struct layout_intent intent = {
5389 .li_opc = LAYOUT_INTENT_ACCESS,
5391 enum ldlm_mode mode;
5395 *gen = ll_layout_version_get(lli);
5396 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5400 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5401 LASSERT(S_ISREG(inode->i_mode));
5403 /* take layout lock mutex to enqueue layout lock exclusively. */
5404 mutex_lock(&lli->lli_layout_mutex);
5407 /* the layout lock is mostly cached on the local side, so try to
5408 * match it first before enqueueing a new one. */
5409 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5410 LCK_CR | LCK_CW | LCK_PR |
5412 if (mode != 0) { /* hit cached lock */
5413 rc = ll_layout_lock_set(&lockh, mode, inode);
5419 rc = ll_layout_intent(inode, &intent);
5425 *gen = ll_layout_version_get(lli);
5426 mutex_unlock(&lli->lli_layout_mutex);
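/*
 * Usage sketch per the contract documented above (illustrative): fetch
 * the layout generation before starting IO, then call again afterwards
 * to detect a layout change:
 *
 *	__u32 gen_before, gen_after;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	... submit IO ...
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (gen_after != gen_before)
 *		... redo the IO setup against the new layout ...
 */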
5432 * Issue layout intent RPC indicating where in a file an IO is about to write.
5434 * \param[in] inode file inode.
5435 * \param[in] ext write range, with the start offset in bytes of the file
5436 * where an IO is about to write, and the exclusive end offset in
5439 * \retval 0 on success
5440 * \retval < 0 error code
5442 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5443 struct lu_extent *ext)
5445 struct layout_intent intent = {
5447 .li_extent.e_start = ext->e_start,
5448 .li_extent.e_end = ext->e_end,
5453 rc = ll_layout_intent(inode, &intent);
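/*
 * Example call (illustrative, with hypothetical offsets): declare an
 * upcoming write to the first megabyte so the MDT can instantiate the
 * matching layout components before the IO starts:
 *
 *	struct lu_extent ext = { .e_start = 0, .e_end = 1 << 20 };
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */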
5459 * This function sends a restore request to the MDT
5461 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5463 struct hsm_user_request *hur;
5467 len = sizeof(struct hsm_user_request) +
5468 sizeof(struct hsm_user_item);
5469 OBD_ALLOC(hur, len);
5473 hur->hur_request.hr_action = HUA_RESTORE;
5474 hur->hur_request.hr_archive_id = 0;
5475 hur->hur_request.hr_flags = 0;
5476 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5477 sizeof(hur->hur_user_item[0].hui_fid));
5478 hur->hur_user_item[0].hui_extent.offset = offset;
5479 hur->hur_user_item[0].hui_extent.length = length;
5480 hur->hur_request.hr_itemcount = 1;
5481 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,