4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #include <linux/uidgid.h>
47 #include <uapi/linux/lustre/lustre_ioctl.h>
48 #include <lustre_swab.h>
50 #include "cl_object.h"
51 #include "llite_internal.h"
52 #include "vvp_internal.h"
55 struct inode *sp_inode;
60 __u64 pa_data_version;
66 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
68 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate and initialize per-open-file private data from the
 * ll_file_data slab. GFP_NOFS prevents reclaim from re-entering the
 * filesystem while we hold fs state.
 * NOTE(review): source view is fragmentary — the allocation-failure
 * check and return statement are elided here; confirm in full file.
 */
71 static struct ll_file_data *ll_file_data_get(void)
73 	struct ll_file_data *fd;
75 	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
79 	fd->fd_write_failed = false;
80 	pcc_file_init(&fd->fd_pcc_file);	/* init PCC (persistent client cache) state */
/* Release per-open-file data back to the slab; pairs with ll_file_data_get(). */
85 static void ll_file_data_put(struct ll_file_data *fd)
88 	OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
92 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Fill @op_data with the inode's current attributes (mode, times, size,
 * blocks, flags) and the open handle from @och for the CLOSE RPC to the MDT.
 */
94 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
95 			     struct obd_client_handle *och)
99 	ll_prep_md_op_data(op_data, inode, NULL, NULL,
100 			   0, 0, LUSTRE_OPC_ANY, NULL);
102 	op_data->op_attr.ia_mode = inode->i_mode;
103 	op_data->op_attr.ia_atime = inode->i_atime;
104 	op_data->op_attr.ia_mtime = inode->i_mtime;
105 	op_data->op_attr.ia_ctime = inode->i_ctime;
106 	op_data->op_attr.ia_size = i_size_read(inode);
107 	op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
108 				      ATTR_MTIME | ATTR_MTIME_SET |
110 	op_data->op_xvalid |= OP_XVALID_CTIME_SET;
111 	op_data->op_attr_blocks = inode->i_blocks;
112 	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
113 	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
114 		op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
115 	op_data->op_open_handle = och->och_open_handle;
117 	if (och->och_flags & FMODE_WRITE &&
118 	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
119 		/* For HSM: if inode data has been modified, pack it so that
120 		 * MDT can set data dirty flag in the archive. */
121 		op_data->op_bias |= MDS_DATA_MODIFIED;
127 * Perform a close, possibly with a bias.
128 * The meaning of "data" depends on the value of "bias".
130 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
131 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the MDS_CLOSE RPC for @och, optionally carrying a close intent
 * given by @bias; @data's meaning depends on @bias (data version for
 * HSM release, victim inode for layout merge/swap, ioc lease for
 * resync-done, pcc_param for PCC attach).
 * NOTE(review): fragmentary view — switch/brace structure and some
 * error paths are elided; confirm against the full file.
 */
134 static int ll_close_inode_openhandle(struct inode *inode,
135 				     struct obd_client_handle *och,
136 				     enum mds_op_bias bias, void *data)
138 	struct obd_export *md_exp = ll_i2mdexp(inode);
139 	const struct ll_inode_info *lli = ll_i2info(inode);
140 	struct md_op_data *op_data;
141 	struct ptlrpc_request *req = NULL;
145 	if (class_exp2obd(md_exp) == NULL) {
146 		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
147 		       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
151 	OBD_ALLOC_PTR(op_data);
152 	/* We leak openhandle and request here on error, but not much to be
153 	 * done in OOM case since app won't retry close on error either. */
155 		GOTO(out, rc = -ENOMEM);
157 	ll_prepare_close(inode, op_data, och);
159 	case MDS_CLOSE_LAYOUT_MERGE:
160 		/* merge blocks from the victim inode */
161 		op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
162 		op_data->op_attr.ia_valid |= ATTR_SIZE;
163 		op_data->op_xvalid |= OP_XVALID_BLOCKS;
165 	case MDS_CLOSE_LAYOUT_SPLIT:
166 	case MDS_CLOSE_LAYOUT_SWAP: {
167 		struct split_param *sp = data;
169 		LASSERT(data != NULL);
170 		op_data->op_bias |= bias;
171 		op_data->op_data_version = 0;
172 		op_data->op_lease_handle = och->och_lease_handle;
173 		if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
174 			op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
175 			op_data->op_mirror_id = sp->sp_mirror_id;
177 			op_data->op_fid2 = *ll_inode2fid(data);
182 	case MDS_CLOSE_RESYNC_DONE: {
183 		struct ll_ioc_lease *ioc = data;
185 		LASSERT(data != NULL);
186 		op_data->op_attr_blocks +=
187 			ioc->lil_count * op_data->op_attr_blocks;
188 		op_data->op_attr.ia_valid |= ATTR_SIZE;
189 		op_data->op_xvalid |= OP_XVALID_BLOCKS;
190 		op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
192 		op_data->op_lease_handle = och->och_lease_handle;
193 		op_data->op_data = &ioc->lil_ids[0];
194 		op_data->op_data_size =
195 			ioc->lil_count * sizeof(ioc->lil_ids[0]);
199 	case MDS_PCC_ATTACH: {
200 		struct pcc_param *param = data;
202 		LASSERT(data != NULL);
203 		op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
204 		op_data->op_archive_id = param->pa_archive_id;
205 		op_data->op_data_version = param->pa_data_version;
206 		op_data->op_lease_handle = och->och_lease_handle;
210 	case MDS_HSM_RELEASE:
211 		LASSERT(data != NULL);
212 		op_data->op_bias |= MDS_HSM_RELEASE;
213 		op_data->op_data_version = *(__u64 *)data;
214 		op_data->op_lease_handle = och->och_lease_handle;
215 		op_data->op_attr.ia_valid |= ATTR_SIZE;
216 		op_data->op_xvalid |= OP_XVALID_BLOCKS;
220 		LASSERT(data == NULL);
224 	if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
225 		op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
226 	if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
227 		op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
229 	rc = md_close(md_exp, op_data, och->och_mod, &req);
230 	if (rc != 0 && rc != -EINTR)
231 		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
232 		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
234 	if (rc == 0 && op_data->op_bias & bias) {
235 		struct mdt_body *body;
237 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
238 		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
241 		if (bias & MDS_PCC_ATTACH) {
242 			struct pcc_param *param = data;
244 			param->pa_layout_gen = body->mbo_layout_gen;
248 	ll_finish_md_op_data(op_data);
252 	md_clear_open_replay_data(md_exp, och);
253 	och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
256 	ptlrpc_req_finished(req); /* This is close request */
/*
 * Actually close the MDS open handle matching @fmode (write/exec/read)
 * once no other users of that handle remain; selects the per-mode
 * handle slot and usecount under lli_och_mutex.
 */
260 int ll_md_real_close(struct inode *inode, fmode_t fmode)
262 	struct ll_inode_info *lli = ll_i2info(inode);
263 	struct obd_client_handle **och_p;
264 	struct obd_client_handle *och;
269 	if (fmode & FMODE_WRITE) {
270 		och_p = &lli->lli_mds_write_och;
271 		och_usecount = &lli->lli_open_fd_write_count;
272 	} else if (fmode & FMODE_EXEC) {
273 		och_p = &lli->lli_mds_exec_och;
274 		och_usecount = &lli->lli_open_fd_exec_count;
276 		LASSERT(fmode & FMODE_READ);
277 		och_p = &lli->lli_mds_read_och;
278 		och_usecount = &lli->lli_open_fd_read_count;
281 	mutex_lock(&lli->lli_och_mutex);
282 	if (*och_usecount > 0) {
283 		/* There are still users of this handle, so skip
285 		mutex_unlock(&lli->lli_och_mutex);
291 	mutex_unlock(&lli->lli_och_mutex);
294 	/* There might be a race and this handle may already
296 	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if held, close a
 * cached och, decrement the per-mode open count, and only talk to the
 * MDS (ll_md_real_close) when we don't hold a matching cached OPEN lock.
 * Frees the ll_file_data at the end.
 */
302 static int ll_md_close(struct inode *inode, struct file *file)
304 	union ldlm_policy_data policy = {
305 		.l_inodebits = { MDS_INODELOCK_OPEN },
307 	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
308 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
309 	struct ll_inode_info *lli = ll_i2info(inode);
310 	struct lustre_handle lockh;
311 	enum ldlm_mode lockmode;
315 	/* clear group lock, if present */
316 	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
317 		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
319 	if (fd->fd_lease_och != NULL) {
322 		/* Usually the lease is not released when the
323 		 * application crashed, we need to release here. */
324 		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
325 		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
326 		       PFID(&lli->lli_fid), rc, lease_broken);
328 		fd->fd_lease_och = NULL;
331 	if (fd->fd_och != NULL) {
332 		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
337 	/* Let's see if we have good enough OPEN lock on the file and if
338 	   we can skip talking to MDS */
339 	mutex_lock(&lli->lli_och_mutex);
340 	if (fd->fd_omode & FMODE_WRITE) {
342 		LASSERT(lli->lli_open_fd_write_count);
343 		lli->lli_open_fd_write_count--;
344 	} else if (fd->fd_omode & FMODE_EXEC) {
346 		LASSERT(lli->lli_open_fd_exec_count);
347 		lli->lli_open_fd_exec_count--;
350 		LASSERT(lli->lli_open_fd_read_count);
351 		lli->lli_open_fd_read_count--;
353 	mutex_unlock(&lli->lli_och_mutex);
355 	/* LU-4398: do not cache write open lock if the file has exec bit */
356 	if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
357 	    !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
358 			   LDLM_IBITS, &policy, lockmode, &lockh))
359 		rc = ll_md_real_close(inode, fd->fd_omode);
362 	LUSTRE_FPRIVATE(file) = NULL;
363 	ll_file_data_put(fd);
368 /* While this returns an error code, fput() the caller does not, so we need
369 * to make every effort to clean up all of our state here. Also, applications
370 * rarely check close errors and even if an error is returned they will not
371 * re-try the close call.
/*
 * VFS ->release() hook: deauthorize statahead, short-circuit for the
 * root dentry, release PCC state, clear async write errors, then do the
 * real MDS close via ll_md_close() and account the operation latency.
 */
373 int ll_file_release(struct inode *inode, struct file *file)
375 	struct ll_file_data *fd;
376 	struct ll_sb_info *sbi = ll_i2sbi(inode);
377 	struct ll_inode_info *lli = ll_i2info(inode);
378 	ktime_t kstart = ktime_get();
383 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
384 	       PFID(ll_inode2fid(inode)), inode);
386 	fd = LUSTRE_FPRIVATE(file);
389 	/* The last ref on @file, maybe not the owner pid of statahead,
390 	 * because parent and child process can share the same file handle. */
391 	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
392 		ll_deauthorize_statahead(inode, fd);
394 	if (inode->i_sb->s_root == file_dentry(file)) {
395 		LUSTRE_FPRIVATE(file) = NULL;
396 		ll_file_data_put(fd);
400 	pcc_file_release(inode, file);
402 	if (!S_ISDIR(inode->i_mode)) {
403 		if (lli->lli_clob != NULL)
404 			lov_read_and_clear_async_rc(lli->lli_clob);
405 		lli->lli_async_rc = 0;
408 	rc = ll_md_close(inode, file);
410 	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
411 		libcfs_debug_dumplog();
414 	if (!rc && inode->i_sb->s_root != file_dentry(file))
415 		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
416 				   ktime_us_delta(ktime_get(), kstart));
/*
 * read_cache_page() filler: copy inline DoM (Data-on-MDT) data from the
 * niobuf_local into @page, zero-fill the tail past lnb_len, and mark
 * the page up to date.
 */
420 static inline int ll_dom_readpage(void *data, struct page *page)
422 	struct niobuf_local *lnb = data;
425 	kaddr = ll_kmap_atomic(page, KM_USER0);
426 	memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
427 	if (lnb->lnb_len < PAGE_SIZE)
428 		memset(kaddr + lnb->lnb_len, 0,
429 		       PAGE_SIZE - lnb->lnb_len);
430 	flush_dcache_page(page);
431 	SetPageUptodate(page);
432 	ll_kunmap_atomic(kaddr, KM_USER0);
/*
 * Populate the page cache with file data returned inline in the open
 * reply for a Data-on-MDT file: validate the inline niobuf (alignment,
 * total size vs mbo_dom_size), then feed each page-sized slice to
 * read_cache_page() via ll_dom_readpage().
 */
438 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
439 			struct lookup_intent *it)
441 	struct ll_inode_info *lli = ll_i2info(inode);
442 	struct cl_object *obj = lli->lli_clob;
443 	struct address_space *mapping = inode->i_mapping;
445 	struct niobuf_remote *rnb;
446 	struct mdt_body *body;
448 	unsigned long index, start;
449 	struct niobuf_local lnb;
456 	if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE,
460 	rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
461 	if (rnb == NULL || rnb->rnb_len == 0)
464 	/* LU-11595: Server may return whole file and that is OK always or
465 	 * it may return just file tail and its offset must be aligned with
466 	 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
467 	 * smaller than offset may be not aligned and that data is just ignored.
469 	if (rnb->rnb_offset & ~PAGE_MASK)
472 	/* Server returns whole file or just file tail if it fills in reply
473 	 * buffer, in both cases total size should be equal to the file size.
475 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
476 	if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
477 		CERROR("%s: server returns off/len %llu/%u but size %llu\n",
478 		       ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
479 		       rnb->rnb_len, body->mbo_dom_size);
483 	CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
484 	       rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
486 	data = (char *)rnb + sizeof(*rnb);
488 	lnb.lnb_file_offset = rnb->rnb_offset;
489 	start = lnb.lnb_file_offset >> PAGE_SHIFT;
491 	LASSERT((lnb.lnb_file_offset & ~PAGE_MASK) == 0);
492 	lnb.lnb_page_offset = 0;
494 		lnb.lnb_data = data + (index << PAGE_SHIFT);
495 		lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
496 		if (lnb.lnb_len > PAGE_SIZE)
497 			lnb.lnb_len = PAGE_SIZE;
499 		vmpage = read_cache_page(mapping, index + start,
500 					 ll_dom_readpage, &lnb);
501 		if (IS_ERR(vmpage)) {
502 			CWARN("%s: cannot fill page %lu for "DFID
503 			      " with data: rc = %li\n",
504 			      ll_i2sbi(inode)->ll_fsname, index + start,
505 			      PFID(lu_object_fid(&obj->co_lu)),
511 	} while (rnb->rnb_len > (index << PAGE_SHIFT));
/*
 * Send an intent-lock OPEN request to the MDS for @de. Packs the file
 * name only when open-by-FID is unavailable or fault-injected; on
 * success updates the inode from the reply, revalidates the dentry when
 * a LOOKUP-bit lock was granted, and pulls inline DoM data if present.
 * Returns -ESTALE for a vanished target under IT_CREAT so the VFS
 * retries via lookup (see comment near the end).
 */
515 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
516 			       struct lookup_intent *itp)
518 	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
519 	struct dentry *parent = de->d_parent;
522 	struct md_op_data *op_data;
523 	struct ptlrpc_request *req = NULL;
527 	LASSERT(parent != NULL);
528 	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
530 	/* if server supports open-by-fid, or file name is invalid, don't pack
531 	 * name in open request */
532 	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
533 	    !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
535 		len = de->d_name.len;
536 		name = kmalloc(len + 1, GFP_NOFS);
541 		spin_lock(&de->d_lock);
542 		if (len != de->d_name.len) {
543 			spin_unlock(&de->d_lock);
547 		memcpy(name, de->d_name.name, len);
549 		spin_unlock(&de->d_lock);
551 		if (!lu_name_is_valid_2(name, len)) {
557 	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
558 				     name, len, 0, LUSTRE_OPC_ANY, NULL);
559 	if (IS_ERR(op_data)) {
561 		RETURN(PTR_ERR(op_data));
563 	op_data->op_data = lmm;
564 	op_data->op_data_size = lmmsize;
566 	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
567 			    &ll_md_blocking_ast, 0);
569 	ll_finish_md_op_data(op_data);
571 	/* reason for keep own exit path - don't flood log
572 	 * with messages with -ESTALE errors.
574 	if (!it_disposition(itp, DISP_OPEN_OPEN) ||
575 	    it_open_error(DISP_OPEN_OPEN, itp))
577 	ll_release_openhandle(de, itp);
581 	if (it_disposition(itp, DISP_LOOKUP_NEG))
582 		GOTO(out, rc = -ENOENT);
584 	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
585 		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
586 		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
590 	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
592 	if (!rc && itp->it_lock_mode) {
593 		struct lustre_handle handle = {.cookie = itp->it_lock_handle};
594 		struct ldlm_lock *lock;
595 		bool has_dom_bit = false;
597 		/* If we got a lock back and it has a LOOKUP bit set,
598 		 * make sure the dentry is marked as valid so we can find it.
599 		 * We don't need to care about actual hashing since other bits
600 		 * of kernel will deal with that later.
602 		lock = ldlm_handle2lock(&handle);
604 			has_dom_bit = ldlm_has_dom(lock);
605 			if (lock->l_policy_data.l_inodebits.bits &
606 			    MDS_INODELOCK_LOOKUP)
607 				d_lustre_revalidate(de);
611 		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
613 			ll_dom_finish_open(de->d_inode, req, itp);
617 	ptlrpc_req_finished(req);
618 	ll_intent_drop_lock(itp);
620 	/* We did open by fid, but by the time we got to the server,
621 	 * the object disappeared. If this is a create, we cannot really
622 	 * tell the userspace that the file it was trying to create
623 	 * does not exist. Instead let's return -ESTALE, and the VFS will
624 	 * retry the create with LOOKUP_REVAL that we are going to catch
625 	 * in ll_revalidate_dentry() and use lookup then.
627 	if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Initialize an obd_client_handle from the server reply carried in the
 * intent: open handle, fid, lease lock cookie, flags; then register the
 * open with the replay machinery.
 */
633 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
634 		       struct obd_client_handle *och)
636 	struct mdt_body *body;
638 	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
639 	och->och_open_handle = body->mbo_open_handle;
640 	och->och_fid = body->mbo_fid1;
641 	och->och_lease_handle.cookie = it->it_lock_handle;
642 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
643 	och->och_flags = it->it_flags;
645 	return md_set_open_replay_data(md_exp, och, it);
/*
 * Client-local part of open: optionally fill @och from the intent reply,
 * attach the ll_file_data to the struct file, and initialize readahead
 * and cl_context state for this open.
 */
648 static int ll_local_open(struct file *file, struct lookup_intent *it,
649 			 struct ll_file_data *fd, struct obd_client_handle *och)
651 	struct inode *inode = file_inode(file);
654 	LASSERT(!LUSTRE_FPRIVATE(file));
661 		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
666 	LUSTRE_FPRIVATE(file) = fd;
667 	ll_readahead_init(inode, &fd->fd_ras);
668 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
670 	/* ll_cl_context initialize */
671 	rwlock_init(&fd->fd_lock);
672 	INIT_LIST_HEAD(&fd->fd_lccs);
677 /* Open a file, and (for the very first open) create objects on the OSTs at
678 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
679 * creation or open until ll_lov_setstripe() ioctl is called.
681 * If we already have the stripe MD locally then we don't request it in
682 * md_open(), by passing a lmm_size = 0.
684 * It is up to the application to ensure no other processes open this file
685 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
686 * used. We might be able to avoid races of that sort by getting lli_open_sem
687 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
688 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() hook. Either reuse an already-cached MDS open handle for
 * this mode (write/exec/read) or build an IT_OPEN intent from f_flags
 * and send it via ll_intent_file_open(); then finish the local open,
 * open in PCC if configured, and tally stats.
 * NOTE(review): fragmentary view — several error labels and closing
 * braces are elided; confirm control flow against the full file.
 */
690 int ll_file_open(struct inode *inode, struct file *file)
692 	struct ll_inode_info *lli = ll_i2info(inode);
693 	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
694 					  .it_flags = file->f_flags };
695 	struct obd_client_handle **och_p = NULL;
696 	__u64 *och_usecount = NULL;
697 	struct ll_file_data *fd;
698 	ktime_t kstart = ktime_get();
702 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
703 	       PFID(ll_inode2fid(inode)), inode, file->f_flags);
705 	it = file->private_data; /* XXX: compat macro */
706 	file->private_data = NULL; /* prevent ll_local_open assertion */
708 	fd = ll_file_data_get();
710 		GOTO(out_nofiledata, rc = -ENOMEM);
713 	if (S_ISDIR(inode->i_mode))
714 		ll_authorize_statahead(inode, fd);
716 	if (inode->i_sb->s_root == file_dentry(file)) {
717 		LUSTRE_FPRIVATE(file) = fd;
721 	if (!it || !it->it_disposition) {
722 		/* Convert f_flags into access mode. We cannot use file->f_mode,
723 		 * because everything but O_ACCMODE mask was stripped from
725 		if ((oit.it_flags + 1) & O_ACCMODE)
727 		if (file->f_flags & O_TRUNC)
728 			oit.it_flags |= FMODE_WRITE;
730 		/* kernel only call f_op->open in dentry_open. filp_open calls
731 		 * dentry_open after call to open_namei that checks permissions.
732 		 * Only nfsd_open call dentry_open directly without checking
733 		 * permissions and because of that this code below is safe.
735 		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
736 			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
738 		/* We do not want O_EXCL here, presumably we opened the file
739 		 * already? XXX - NFS implications? */
740 		oit.it_flags &= ~O_EXCL;
742 		/* bug20584, if "it_flags" contains O_CREAT, the file will be
743 		 * created if necessary, then "IT_CREAT" should be set to keep
744 		 * consistent with it */
745 		if (oit.it_flags & O_CREAT)
746 			oit.it_op |= IT_CREAT;
752 	/* Let's see if we have file open on MDS already. */
753 	if (it->it_flags & FMODE_WRITE) {
754 		och_p = &lli->lli_mds_write_och;
755 		och_usecount = &lli->lli_open_fd_write_count;
756 	} else if (it->it_flags & FMODE_EXEC) {
757 		och_p = &lli->lli_mds_exec_och;
758 		och_usecount = &lli->lli_open_fd_exec_count;
760 		och_p = &lli->lli_mds_read_och;
761 		och_usecount = &lli->lli_open_fd_read_count;
764 	mutex_lock(&lli->lli_och_mutex);
765 	if (*och_p) { /* Open handle is present */
766 		if (it_disposition(it, DISP_OPEN_OPEN)) {
767 			/* Well, there's extra open request that we do not need,
768 			   let's close it somehow. This will decref request. */
769 			rc = it_open_error(DISP_OPEN_OPEN, it);
771 				mutex_unlock(&lli->lli_och_mutex);
772 				GOTO(out_openerr, rc);
775 			ll_release_openhandle(file_dentry(file), it);
779 		rc = ll_local_open(file, it, fd, NULL);
782 			mutex_unlock(&lli->lli_och_mutex);
783 			GOTO(out_openerr, rc);
786 		LASSERT(*och_usecount == 0);
787 		if (!it->it_disposition) {
788 			struct dentry *dentry = file_dentry(file);
789 			struct ll_dentry_data *ldd;
791 			/* We cannot just request lock handle now, new ELC code
792 			   means that one of other OPEN locks for this file
793 			   could be cancelled, and since blocking ast handler
794 			   would attempt to grab och_mutex as well, that would
795 			   result in a deadlock */
796 			mutex_unlock(&lli->lli_och_mutex);
798 			 * Normally called under two situations:
800 			 * 2. A race/condition on MDS resulting in no open
801 			 *    handle to be returned from LOOKUP|OPEN request,
802 			 *    for example if the target entry was a symlink.
804 			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
805 			 * marked by a bit set in ll_iget_for_nfs. Clear the
806 			 * bit so that it's not confusing later callers.
808 			 * NB; when ldd is NULL, it must have come via normal
809 			 * lookup path only, since ll_iget_for_nfs always calls
812 			ldd = ll_d2d(dentry);
813 			if (ldd && ldd->lld_nfs_dentry) {
814 				ldd->lld_nfs_dentry = 0;
815 				if (!filename_is_volatile(dentry->d_name.name,
818 					it->it_flags |= MDS_OPEN_LOCK;
822 			 * Always specify MDS_OPEN_BY_FID because we don't want
823 			 * to get file with different fid.
825 			it->it_flags |= MDS_OPEN_BY_FID;
826 			rc = ll_intent_file_open(dentry, NULL, 0, it);
828 				GOTO(out_openerr, rc);
832 		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
834 			GOTO(out_och_free, rc = -ENOMEM);
838 		/* md_intent_lock() didn't get a request ref if there was an
839 		 * open error, so don't do cleanup on the request here
841 		/* XXX (green): Should not we bail out on any error here, not
842 		 * just open error? */
843 		rc = it_open_error(DISP_OPEN_OPEN, it);
845 			GOTO(out_och_free, rc);
847 		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
848 			 "inode %p: disposition %x, status %d\n", inode,
849 			 it_disposition(it, ~0), it->it_status);
851 		rc = ll_local_open(file, it, fd, *och_p);
853 			GOTO(out_och_free, rc);
856 	rc = pcc_file_open(inode, file);
858 		GOTO(out_och_free, rc);
860 	mutex_unlock(&lli->lli_och_mutex);
862 	/* lockless for direct IO so that it can do IO in parallel */
863 	if (file->f_flags & O_DIRECT)
864 		fd->fd_flags |= LL_FILE_LOCKLESS_IO;
867 	/* Must do this outside lli_och_mutex lock to prevent deadlock where
868 	   different kind of OPEN lock for this same inode gets cancelled
869 	   by ldlm_cancel_lru */
870 	if (!S_ISREG(inode->i_mode))
871 		GOTO(out_och_free, rc);
872 	cl_lov_delay_create_clear(&file->f_flags);
873 	GOTO(out_och_free, rc);
877 		if (och_p && *och_p) {
878 			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
879 			*och_p = NULL; /* OBD_FREE writes some magic there */
882 		mutex_unlock(&lli->lli_och_mutex);
885 		if (lli->lli_opendir_key == fd)
886 			ll_deauthorize_statahead(inode, fd);
889 			ll_file_data_put(fd);
891 		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
892 				   ktime_us_delta(ktime_get(), kstart));
896 	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
897 		ptlrpc_req_finished(it->it_request);
898 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on LDLM_CB_BLOCKING asynchronously
 * cancel the lease lock (lease is considered broken); the
 * LDLM_CB_CANCELING branch is (partially) elided in this view.
 */
904 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
905 			struct ldlm_lock_desc *desc, void *data, int flag)
908 	struct lustre_handle lockh;
912 	case LDLM_CB_BLOCKING:
913 		ldlm_lock2handle(lock, &lockh);
914 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
916 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
920 	case LDLM_CB_CANCELING:
928 * When setting a lease on a file, we take ownership of the lli_mds_*_och
929 * and save it as fd->fd_och so as to force client to reopen the file even
930 * if it has an open lock in cache already.
/*
 * Take ownership of lli_mds_{write,read}_och as fd->fd_och so the
 * client must reopen even with a cached open lock; returns the old
 * open handle through @old_open_handle. -EBUSY if a lease already
 * exists on this fd or other users share the handle.
 */
932 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
933 				struct lustre_handle *old_open_handle)
935 	struct ll_inode_info *lli = ll_i2info(inode);
936 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
937 	struct obd_client_handle **och_p;
942 	/* Get the openhandle of the file */
943 	mutex_lock(&lli->lli_och_mutex);
944 	if (fd->fd_lease_och != NULL)
945 		GOTO(out_unlock, rc = -EBUSY);
947 	if (fd->fd_och == NULL) {
948 		if (file->f_mode & FMODE_WRITE) {
949 			LASSERT(lli->lli_mds_write_och != NULL);
950 			och_p = &lli->lli_mds_write_och;
951 			och_usecount = &lli->lli_open_fd_write_count;
953 			LASSERT(lli->lli_mds_read_och != NULL);
954 			och_p = &lli->lli_mds_read_och;
955 			och_usecount = &lli->lli_open_fd_read_count;
958 		if (*och_usecount > 1)
959 			GOTO(out_unlock, rc = -EBUSY);
966 	*old_open_handle = fd->fd_och->och_open_handle;
970 	mutex_unlock(&lli->lli_och_mutex);
975 * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * Give ownership of fd->fd_och back to lli_mds_{write,read}_och when a
 * lease is put back. If another process reopened meanwhile (*och_p set,
 * e.g. after a broken lease) the fd's handle is redundant and is closed.
 */
977 static int ll_lease_och_release(struct inode *inode, struct file *file)
979 	struct ll_inode_info *lli = ll_i2info(inode);
980 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
981 	struct obd_client_handle **och_p;
982 	struct obd_client_handle *old_och = NULL;
987 	mutex_lock(&lli->lli_och_mutex);
988 	if (file->f_mode & FMODE_WRITE) {
989 		och_p = &lli->lli_mds_write_och;
990 		och_usecount = &lli->lli_open_fd_write_count;
992 		och_p = &lli->lli_mds_read_och;
993 		och_usecount = &lli->lli_open_fd_read_count;
996 	/* The file may have been open by another process (broken lease) so
997 	 * *och_p is not NULL. In this case we should simply increase usecount
1000 	if (*och_p != NULL) {
1001 		old_och = fd->fd_och;
1004 		*och_p = fd->fd_och;
1008 	mutex_unlock(&lli->lli_och_mutex);
1010 	if (old_och != NULL)
1011 		rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1017 * Acquire a lease and open the file.
/*
 * Acquire a lease of mode @fmode (FMODE_READ or FMODE_WRITE only) and
 * open the file by FID with MDS_OPEN_LEASE. On success returns the new
 * och holding the lease lock; on failure cancels any open lock, closes
 * the handle and returns ERR_PTR(rc).
 */
1019 static struct obd_client_handle *
1020 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1023 	struct lookup_intent it = { .it_op = IT_OPEN };
1024 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1025 	struct md_op_data *op_data;
1026 	struct ptlrpc_request *req = NULL;
1027 	struct lustre_handle old_open_handle = { 0 };
1028 	struct obd_client_handle *och = NULL;
1033 	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1034 		RETURN(ERR_PTR(-EINVAL));
1037 	if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1038 		RETURN(ERR_PTR(-EPERM));
1040 	rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1042 		RETURN(ERR_PTR(rc));
1047 		RETURN(ERR_PTR(-ENOMEM));
1049 	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1050 				     LUSTRE_OPC_ANY, NULL);
1051 	if (IS_ERR(op_data))
1052 		GOTO(out, rc = PTR_ERR(op_data));
1054 	/* To tell the MDT this openhandle is from the same owner */
1055 	op_data->op_open_handle = old_open_handle;
1057 	it.it_flags = fmode | open_flags;
1058 	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1059 	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1060 			    &ll_md_blocking_lease_ast,
1061 	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1062 	 * it can be cancelled which may mislead applications that the lease is
1064 	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1065 	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1066 	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1067 			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1068 	ll_finish_md_op_data(op_data);
1069 	ptlrpc_req_finished(req);
1071 		GOTO(out_release_it, rc);
1073 	if (it_disposition(&it, DISP_LOOKUP_NEG))
1074 		GOTO(out_release_it, rc = -ENOENT);
1076 	rc = it_open_error(DISP_OPEN_OPEN, &it);
1078 		GOTO(out_release_it, rc);
1080 	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1081 	rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1083 		GOTO(out_release_it, rc);
1085 	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1086 		GOTO(out_close, rc = -EOPNOTSUPP);
1088 	/* already get lease, handle lease lock */
1089 	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1090 	if (it.it_lock_mode == 0 ||
1091 	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
1092 		/* open lock must return for lease */
1093 		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1094 		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
1096 		GOTO(out_close, rc = -EPROTO);
1099 	ll_intent_release(&it);
1103 	/* Cancel open lock */
1104 	if (it.it_lock_mode != 0) {
1105 		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1107 		it.it_lock_mode = 0;
1108 		och->och_lease_handle.cookie = 0ULL;
1110 	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1112 		CERROR("%s: error closing file "DFID": %d\n",
1113 		       sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1114 	och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1116 	ll_intent_release(&it);
1120 	RETURN(ERR_PTR(rc));
1124 * Check whether a layout swap can be done between two inodes.
1126 * \param[in] inode1 First inode to check
1127 * \param[in] inode2 Second inode to check
1129 * \retval 0 on success, layout swap can be performed between both inodes
1130 * \retval negative error code if requirements are not met
/*
 * Validate that a layout swap is permitted: both inodes must be regular
 * files, writable by the caller, and on the same filesystem.
 * NOTE(review): return statements for the failure branches are elided
 * in this view.
 */
1132 static int ll_check_swap_layouts_validity(struct inode *inode1,
1133 					  struct inode *inode2)
1135 	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1138 	if (inode_permission(inode1, MAY_WRITE) ||
1139 	    inode_permission(inode2, MAY_WRITE))
1142 	if (inode1->i_sb != inode2->i_sb)
/*
 * Close @inode with MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps layouts
 * between @inode and @inode2 atomically with the close. Equal FIDs are
 * rejected (-EINVAL); och ownership passes to the close on success.
 */
1148 static int ll_swap_layouts_close(struct obd_client_handle *och,
1149 				 struct inode *inode, struct inode *inode2)
1151 	const struct lu_fid *fid1 = ll_inode2fid(inode);
1152 	const struct lu_fid *fid2;
1156 	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1157 	       ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1159 	rc = ll_check_swap_layouts_validity(inode, inode2);
1161 		GOTO(out_free_och, rc);
1163 	/* We now know that inode2 is a lustre inode */
1164 	fid2 = ll_inode2fid(inode2);
1166 	rc = lu_fid_cmp(fid1, fid2);
1168 		GOTO(out_free_och, rc = -EINVAL);
1170 	/* Close the file and {swap,merge} layouts between inode & inode2.
1171 	 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1172 	 * because we still need it to pack l_remote_handle to MDT. */
1173 	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1176 	och = NULL; /* freed in ll_close_inode_openhandle() */
1186 * Release lease and close the file.
1187 * It will check if the lease has ever broken.
/*
 * Release a lease and close the file, optionally executing a close
 * intent (@bias). Checks whether the lease lock was already cancelled
 * (broken); if not broken and no intent requested, cancels it locally.
 * *lease_broken reports the broken state to the caller.
 */
1189 static int ll_lease_close_intent(struct obd_client_handle *och,
1190 				 struct inode *inode,
1191 				 bool *lease_broken, enum mds_op_bias bias,
1194 	struct ldlm_lock *lock;
1195 	bool cancelled = true;
1199 	lock = ldlm_handle2lock(&och->och_lease_handle);
1201 		lock_res_and_lock(lock);
1202 		cancelled = ldlm_is_cancel(lock);
1203 		unlock_res_and_lock(lock);
1204 		LDLM_LOCK_PUT(lock);
1207 	CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1208 	       PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1210 	if (lease_broken != NULL)
1211 		*lease_broken = cancelled;
1213 	if (!cancelled && !bias)
1214 		ldlm_cli_cancel(&och->och_lease_handle, 0);
1216 	if (cancelled) { /* no need to execute intent */
1221 	rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: ll_lease_close_intent() with no bias/intent data. */
1225 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1228 	return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1232 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/*
 * With a lease held, issue MDS_REINT_RESYNC to the MDT for mirror
 * resync. @arg is a userspace pointer to struct ll_ioc_lease_id.
 * Flushes dirty pages first (LL_DV_WR_FLUSH) so writeback is not
 * denied by the OSTs after the layout version increases.
 */
1234 static int ll_lease_file_resync(struct obd_client_handle *och,
1235 				struct inode *inode, unsigned long arg)
1237 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1238 	struct md_op_data *op_data;
1239 	struct ll_ioc_lease_id ioc;
1240 	__u64 data_version_unused;
1244 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1245 				     LUSTRE_OPC_ANY, NULL);
1246 	if (IS_ERR(op_data))
1247 		RETURN(PTR_ERR(op_data));
1249 	if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1253 	/* before starting file resync, it's necessary to clean up page cache
1254 	 * in client memory, otherwise once the layout version is increased,
1255 	 * writing back cached data will be denied the OSTs. */
1256 	rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1260 	op_data->op_lease_handle = och->och_lease_handle;
1261 	op_data->op_mirror_id = ioc.lil_mirror_id;
1262 	rc = md_file_resync(sbi->ll_md_exp, op_data);
1268 	ll_finish_md_op_data(op_data);
/*
 * Merge inode attributes cached from the MDS with size/blocks/timestamps
 * maintained by the OSTs (via the cl_object attr), under the inode size
 * lock.  Each timestamp takes the maximum of the MDS and OST values.
 */
1272 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1274 struct ll_inode_info *lli = ll_i2info(inode);
1275 struct cl_object *obj = lli->lli_clob;
1276 struct cl_attr *attr = vvp_env_thread_attr(env);
1284 ll_inode_size_lock(inode);
1286 /* Merge timestamps the most recently obtained from MDS with
1287 * timestamps obtained from OSTs.
1289 * Do not overwrite atime of inode because it may be refreshed
1290 * by file_accessed() function. If the read was served by cache
1291 * data, there is no RPC to be sent so that atime may not be
1292 * transferred to OSTs at all. MDT only updates atime at close time
1293 * if it's at least 'mdd.*.atime_diff' older.
1294 * All in all, the atime in Lustre does not strictly comply with
1295 * POSIX. Solving this problem needs to send an RPC to MDT for each
1296 * read, this will hurt performance.
1298 if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
1299 inode->i_atime.tv_sec < lli->lli_atime)
1300 inode->i_atime.tv_sec = lli->lli_atime;
1302 inode->i_mtime.tv_sec = lli->lli_mtime;
1303 inode->i_ctime.tv_sec = lli->lli_ctime;
/* Snapshot the MDS-derived times; they are compared against the OST
 * values below and the larger of each pair wins. */
1305 mtime = inode->i_mtime.tv_sec;
1306 atime = inode->i_atime.tv_sec;
1307 ctime = inode->i_ctime.tv_sec;
1309 cl_object_attr_lock(obj);
/* Fault-injection point for testing the MDC merge path. */
1310 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1313 rc = cl_object_attr_get(env, obj, attr);
1314 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects yet) is not an error for this purpose. */
1317 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1319 if (atime < attr->cat_atime)
1320 atime = attr->cat_atime;
1322 if (ctime < attr->cat_ctime)
1323 ctime = attr->cat_ctime;
1325 if (mtime < attr->cat_mtime)
1326 mtime = attr->cat_mtime;
1328 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1329 PFID(&lli->lli_fid), attr->cat_size);
1331 i_size_write(inode, attr->cat_size);
1332 inode->i_blocks = attr->cat_blocks;
1334 inode->i_mtime.tv_sec = mtime;
1335 inode->i_atime.tv_sec = atime;
1336 inode->i_ctime.tv_sec = ctime;
1339 ll_inode_size_unlock(inode);
1345 * Set designated mirror for I/O.
1347 * So far only read, write, and truncated can support to issue I/O to
1348 * designated mirror.
1350 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1352 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1354 /* clear layout version for generic(non-resync) I/O in case it carries
1355 * stale layout version due to I/O restart */
1356 io->ci_layout_version = 0;
1358 /* FLR: disable non-delay for designated mirror I/O because obviously
1359 * only one mirror is available */
1360 if (fd->fd_designated_mirror > 0) {
/* Resync I/O: pin both the mirror and the layout version it was
 * issued against. */
1362 io->ci_designated_mirror = fd->fd_designated_mirror;
1363 io->ci_layout_version = fd->fd_layout_version;
/* NOTE(review): "desiginated" below is a typo in the runtime debug
 * string; left as-is since log strings may be grepped for. */
1366 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1367 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether atime updates should be suppressed for this open file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: per-open
 * flag, per-inode flags, mount flags and superblock flags.
 */
1370 static bool file_is_noatime(const struct file *file)
1372 const struct vfsmount *mnt = file->f_path.mnt;
1373 const struct inode *inode = file_inode((struct file *)file);
1375 /* Adapted from file_accessed() and touch_atime().*/
1376 if (file->f_flags & O_NOATIME)
1379 if (inode->i_flags & S_NOATIME)
1382 if (IS_NOATIME(inode))
1385 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1388 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1391 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the open file's flags for a read or write:
 * sync/append/nonblock behaviour, lock policy, atime suppression and
 * FLR mirror selection.
 */
1397 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1398 struct vvp_io_args *args)
1400 struct inode *inode = file_inode(file);
1401 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1403 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1404 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1406 if (iot == CIT_WRITE) {
1407 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1408 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1409 file->f_flags & O_DIRECT ||
/* Newer kernels signal per-iocb sync via IOCB_DSYNC as well. */
1411 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1412 io->u.ci_wr.wr_sync |= !!(args &&
1413 args->via_io_subtype == IO_NORMAL &&
1414 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1418 io->ci_obj = ll_i2info(inode)->lli_clob;
1419 io->ci_lockreq = CILR_MAYBE;
/* Lock policy: nolock mounts never take DLM locks; O_APPEND must,
 * since the write offset depends on a stable file size. */
1420 if (ll_file_nolock(file)) {
1421 io->ci_lockreq = CILR_NEVER;
1422 io->ci_no_srvlock = 1;
1423 } else if (file->f_flags & O_APPEND) {
1424 io->ci_lockreq = CILR_MANDATORY;
1426 io->ci_noatime = file_is_noatime(file);
1427 io->ci_async_readahead = false;
1429 /* FLR: only use non-delay I/O for read as there is only one
1430 * available mirror for write. */
1431 io->ci_ndelay = !(iot == CIT_WRITE);
1433 ll_io_set_mirror(io, file);
/*
 * Account one read or write of 'count' bytes into the inode's file-heat
 * statistics (sample count + byte count), unless file heat is disabled
 * globally or per-inode.
 */
1436 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1439 struct ll_inode_info *lli = ll_i2info(inode);
1440 struct ll_sb_info *sbi = ll_i2sbi(inode);
1441 enum obd_heat_type sample_type;
1442 enum obd_heat_type iobyte_type;
1443 __u64 now = ktime_get_real_seconds();
1445 if (!ll_sbi_has_file_heat(sbi) ||
1446 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1449 if (iot == CIT_READ) {
1450 sample_type = OBD_HEAT_READSAMPLE;
1451 iobyte_type = OBD_HEAT_READBYTE;
1452 } else if (iot == CIT_WRITE) {
1453 sample_type = OBD_HEAT_WRITESAMPLE;
1454 iobyte_type = OBD_HEAT_WRITEBYTE;
/* One decayed sample per call, plus the byte count of this I/O. */
1459 spin_lock(&lli->lli_heat_lock);
1460 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1461 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1462 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1463 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1464 spin_unlock(&lli->lli_heat_lock);
/*
 * Common engine for buffered/direct read, write and splice: builds a
 * cl_io, takes the range lock where needed, runs the CLIO loop, and
 * restarts the I/O on layout change (io->ci_need_restart) or FLR mirror
 * retry.  Returns bytes transferred, or a negative error if nothing was.
 */
1468 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1469 struct file *file, enum cl_io_type iot,
1470 loff_t *ppos, size_t count)
1472 struct vvp_io *vio = vvp_env_io(env);
1473 struct inode *inode = file_inode(file);
1474 struct ll_inode_info *lli = ll_i2info(inode);
1475 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1476 struct range_lock range;
1480 unsigned retried = 0;
1481 unsigned ignore_lockless = 0;
1485 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1486 file_dentry(file)->d_name.name,
1487 iot == CIT_READ ? "read" : "write", *ppos, count);
/* Restart point: state carried across restarts is re-applied here. */
1490 io = vvp_env_thread_io(env);
1491 ll_io_init(io, file, iot, args);
1492 io->ci_ignore_lockless = ignore_lockless;
1493 io->ci_ndelay_tried = retried;
1495 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1496 bool range_locked = false;
/* O_APPEND writes may land anywhere past EOF, so lock to EOF;
 * otherwise lock exactly the byte range of this I/O. */
1498 if (file->f_flags & O_APPEND)
1499 range_lock_init(&range, 0, LUSTRE_EOF);
1501 range_lock_init(&range, *ppos, *ppos + count - 1);
1503 vio->vui_fd = LUSTRE_FPRIVATE(file);
1504 vio->vui_io_subtype = args->via_io_subtype;
1506 switch (vio->vui_io_subtype) {
1508 vio->vui_iter = args->u.normal.via_iter;
1509 vio->vui_iocb = args->u.normal.via_iocb;
1510 /* Direct IO reads must also take range lock,
1511 * or multiple reads will try to work on the same pages
1512 * See LU-6227 for details. */
1513 if (((iot == CIT_WRITE) ||
1514 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1515 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1516 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1518 rc = range_lock(&lli->lli_write_tree, &range);
1522 range_locked = true;
1526 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1527 vio->u.splice.vui_flags = args->u.splice.via_flags;
1530 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Register the io/env for re-entry from ->readpage etc. */
1534 ll_cl_add(file, env, io, LCC_RW);
1535 rc = cl_io_loop(env, io);
1536 ll_cl_remove(file, env);
1539 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1541 range_unlock(&lli->lli_write_tree, &range);
1544 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress so a restart continues where the
 * previous pass stopped. */
1548 if (io->ci_nob > 0) {
1549 result += io->ci_nob;
1550 count -= io->ci_nob;
1551 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1553 /* prepare IO restart */
1554 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1555 args->u.normal.via_iter = vio->vui_iter;
1558 cl_io_fini(env, io);
1561 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1562 file->f_path.dentry->d_name.name,
1563 iot, rc, result, io->ci_need_restart);
1565 if ((rc == 0 || rc == -ENODATA || rc == -ENOLCK) &&
1566 count > 0 && io->ci_need_restart) {
1568 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1569 file_dentry(file)->d_name.name,
1570 iot == CIT_READ ? "read" : "write",
1571 *ppos, count, result, rc);
1572 /* preserve the tried count for FLR */
1573 retried = io->ci_ndelay_tried;
1574 ignore_lockless = io->ci_ignore_lockless;
/* Per-iot accounting, and fd_write_failed bookkeeping used by
 * close-time error reporting on the write path. */
1578 if (iot == CIT_READ) {
1580 ll_stats_ops_tally(ll_i2sbi(inode),
1581 LPROC_LL_READ_BYTES, result);
1582 } else if (iot == CIT_WRITE) {
1584 ll_stats_ops_tally(ll_i2sbi(inode),
1585 LPROC_LL_WRITE_BYTES, result);
1586 fd->fd_write_failed = false;
1587 } else if (result == 0 && rc == 0) {
1590 fd->fd_write_failed = true;
1592 fd->fd_write_failed = false;
1593 } else if (rc != -ERESTARTSYS) {
1594 fd->fd_write_failed = true;
1598 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1600 ll_heat_add(inode, iot, result);
1602 RETURN(result > 0 ? result : rc);
1606 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1607 * especially for small I/O.
1609 * To serve a read request, CLIO has to create and initialize a cl_io and
1610 * then request DLM lock. This has turned out to have significant overhead
1611 * and affects the performance of small I/O dramatically.
1613 * It's not necessary to create a cl_io for each I/O. Under the help of read
1614 * ahead, most of the pages being read are already in memory cache and we can
1615 * read those pages directly because if the pages exist, the corresponding DLM
1616 * lock must exist so that page content must be valid.
1618 * In fast read implementation, the llite speculatively finds and reads pages
1619 * in memory cache. There are three scenarios for fast read:
1620 * - If the page exists and is uptodate, kernel VM will provide the data and
1621 * CLIO won't be intervened;
1622 * - If the page was brought into memory by read ahead, it will be exported
1623 * and read ahead parameters will be updated;
1624 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1625 * it will go back and invoke normal read, i.e., a cl_io will be created
1626 * and DLM lock will be requested.
1628 * POSIX compliance: posix standard states that read is intended to be atomic.
1629 * Lustre read implementation is in line with Linux kernel read implementation
1630 * and neither of them complies with POSIX standard in this matter. Fast read
1631 * doesn't make the situation worse on single node but it may interleave write
1632 * results from multiple nodes due to short read handling in ll_file_aio_read().
1634 * \param env - lu_env
1635 * \param iocb - kiocb from kernel
1636 * \param iter - user space buffers where the data will be copied
1638 * \retval - number of bytes have been read, or error code if error occurred.
1641 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1645 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1648 /* NB: we can't do direct IO for fast read because it will need a lock
1649 * to make IO engine happy. */
1650 if (iocb->ki_filp->f_flags & O_DIRECT)
/* Let the generic page-cache read path serve whatever is cached. */
1653 result = generic_file_read_iter(iocb, iter);
1655 /* If the first page is not in cache, generic_file_aio_read() will be
1656 * returned with -ENODATA.
1657 * See corresponding code in ll_readpage(). */
1658 if (result == -ENODATA)
/* Account heat and read-byte stats only for bytes actually served. */
1662 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1663 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1664 LPROC_LL_READ_BYTES, result);
1671 * Read from a file (through the page cache).
1673 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1676 struct vvp_io_args *args;
1677 struct file *file = iocb->ki_filp;
1681 ktime_t kstart = ktime_get();
/* Zero-length reads complete immediately. */
1684 if (!iov_iter_count(to))
1688 * Currently when PCC read failed, we do not fall back to the
1689 * normal read path, just return the error.
1690 * The reason is that: for RW-PCC, the file data may be modified
1691 * in the PCC and inconsistent with the data on OSTs (or file
1692 * data has been removed from the Lustre file system), at this
1693 * time, fallback to the normal read path may read the wrong
1695 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1696 * path: read data from data copy on OSTs.
1698 result = pcc_file_read_iter(iocb, to, &cached);
1702 ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
/* Serve what we can from cache first; fall through to the full CLIO
 * path only for the remainder (or on error other than short read). */
1704 result = ll_do_fast_read(iocb, to);
1705 if (result < 0 || iov_iter_count(to) == 0)
1708 env = cl_env_get(&refcheck);
1710 return PTR_ERR(env);
1712 args = ll_env_args(env, IO_NORMAL);
1713 args->u.normal.via_iter = to;
1714 args->u.normal.via_iocb = iocb;
1716 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1717 &iocb->ki_pos, iov_iter_count(to));
1720 else if (result == 0)
1723 cl_env_put(env, &refcheck);
1726 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1727 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1729 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
1730 ktime_us_delta(ktime_get(), kstart));
1737 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1738 * If a page is already in the page cache and dirty (and some other things -
1739 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1740 * write to it without doing a full I/O, because Lustre already knows about it
1741 * and will write it out. This saves a lot of processing time.
1743 * All writes here are within one page, so exclusion is handled by the page
1744 * lock on the vm page. We do not do tiny writes for writes which touch
1745 * multiple pages because it's very unlikely multiple sequential pages are
1746 * already dirty.
1748 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1749 * and are unlikely to be to already dirty pages.
1751 * Attribute updates are important here, we do them in ll_tiny_write_end.
1753 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1755 ssize_t count = iov_iter_count(iter);
1756 struct file *file = iocb->ki_filp;
1757 struct inode *inode = file_inode(file);
1758 bool lock_inode = !IS_NOSEC(inode);
1763 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1764 * of function for why.
1766 if (count >= PAGE_SIZE ||
1767 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* __generic_file_write_iter() may strip SUID/SGID; that path needs
 * the inode lock unless the inode is marked NOSEC. */
1770 if (unlikely(lock_inode))
1772 result = __generic_file_write_iter(iocb, iter);
1774 if (unlikely(lock_inode))
1775 inode_unlock(inode);
1777 /* If the page is not already dirty, ll_tiny_write_begin returns
1778 * -ENODATA. We continue on to normal write.
1780 if (result == -ENODATA)
1784 ll_heat_add(inode, CIT_WRITE, result);
1785 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1787 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1790 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1796 * Write to a file (through the page cache).
1798 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1800 struct vvp_io_args *args;
1802 ssize_t rc_tiny = 0, rc_normal;
1803 struct file *file = iocb->ki_filp;
1806 ktime_t kstart = ktime_get();
/* Zero-length writes complete immediately. */
1811 if (!iov_iter_count(from))
1812 GOTO(out, rc_normal = 0);
1815 * When PCC write failed, we usually do not fall back to the normal
1816 * write path, just return the error. But there is a special case when
1817 * returned error code is -ENOSPC due to running out of space on PCC HSM
1818 * backend. At this time, it will fall back to normal I/O path and
1819 * retry the I/O. As the file is in HSM released state, it will restore
1820 * the file data to OSTs first and redo the write again. And the
1821 * restore process will revoke the layout lock and detach the file
1822 * from PCC cache automatically.
1824 result = pcc_file_write_iter(iocb, from, &cached);
1825 if (cached && result != -ENOSPC && result != -EDQUOT)
1826 GOTO(out, rc_normal = result);
1828 /* NB: we can't do direct IO for tiny writes because they use the page
1829 * cache, we can't do sync writes because tiny writes can't flush
1830 * pages, and we can't do append writes because we can't guarantee the
1831 * required DLM locks are held to protect file size.
1833 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1834 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1835 rc_tiny = ll_do_tiny_write(iocb, from);
1837 /* In case of error, go on and try normal write - Only stop if tiny
1838 * write completed I/O.
1840 if (iov_iter_count(from) == 0)
1841 GOTO(out, rc_normal = rc_tiny);
1843 env = cl_env_get(&refcheck);
1845 return PTR_ERR(env);
1847 args = ll_env_args(env, IO_NORMAL);
1848 args->u.normal.via_iter = from;
1849 args->u.normal.via_iocb = iocb;
1851 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1852 &iocb->ki_pos, iov_iter_count(from));
1854 /* On success, combine bytes written. */
1855 if (rc_tiny >= 0 && rc_normal > 0)
1856 rc_normal += rc_tiny;
1857 /* On error, only return error from normal write if tiny write did not
1858 * write any bytes. Otherwise return bytes written by tiny write.
1860 else if (rc_tiny > 0)
1861 rc_normal = rc_tiny;
1863 cl_env_put(env, &refcheck);
/* Per-process and per-fs write statistics, on success only. */
1865 if (rc_normal > 0) {
1866 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1867 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1869 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
1870 ktime_us_delta(ktime_get(), kstart));
1876 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1878 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate the user iovec and compute its total byte count into *count.
 * On a partially accessible vector, *nr_segs is reduced so callers only
 * see the usable prefix (mirrors the kernel's original semantics). */
1880 static int ll_file_get_iov_count(const struct iovec *iov,
1881 unsigned long *nr_segs, size_t *count)
1886 for (seg = 0; seg < *nr_segs; seg++) {
1887 const struct iovec *iv = &iov[seg];
1890 * If any segment has a negative length, or the cumulative
1891 * length ever wraps negative then return -EINVAL.
1894 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1896 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1901 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry for kernels without read_iter: wrap the iovec in
 * an iov_iter and forward to ll_file_read_iter().
 */
1908 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1909 unsigned long nr_segs, loff_t pos)
1916 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernels; the DIRECTION
 * variant takes READ/WRITE as its second argument. */
1923 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1924 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1925 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1926 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1927 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1929 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read entry for kernels without read_iter: wrap the
 * user buffer in a single-segment iovec, build a sync kiocb at *ppos and
 * forward to ll_file_aio_read(), updating *ppos on return.
 */
1934 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1937 struct iovec iov = { .iov_base = buf, .iov_len = count };
1946 init_sync_kiocb(&kiocb, file);
1947 kiocb.ki_pos = *ppos;
1948 #ifdef HAVE_KIOCB_KI_LEFT
1949 kiocb.ki_left = count;
1950 #elif defined(HAVE_KI_NBYTES)
/* Fixed: struct kiocb has no "i_nbytes" member; the field guarded by
 * HAVE_KI_NBYTES is "ki_nbytes" (see ll_file_write() below). */
1951 kiocb.ki_nbytes = count;
1954 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1955 *ppos = kiocb.ki_pos;
1961 * Write to a file (through the page cache).
/* Legacy aio_write entry for kernels without write_iter: wrap the iovec
 * in an iov_iter and forward to ll_file_write_iter(). */
1964 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1965 unsigned long nr_segs, loff_t pos)
1967 struct iov_iter from;
1972 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* Same kernel-version split as in ll_file_aio_read(). */
1979 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1980 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1981 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1982 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1983 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1985 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry for kernels without write_iter: wrap the
 * user buffer in a single-segment iovec, build a sync kiocb at *ppos and
 * forward to ll_file_aio_write(), updating *ppos on return.
 */
1990 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1991 size_t count, loff_t *ppos)
1993 struct iovec iov = { .iov_base = (void __user *)buf,
2003 init_sync_kiocb(&kiocb, file);
2004 kiocb.ki_pos = *ppos;
2005 #ifdef HAVE_KIOCB_KI_LEFT
2006 kiocb.ki_left = count;
2007 #elif defined(HAVE_KI_NBYTES)
2008 kiocb.ki_nbytes = count;
2011 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2012 *ppos = kiocb.ki_pos;
2016 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2019 * Send file content (through pagecache) somewhere with helper
2021 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2022 struct pipe_inode_info *pipe, size_t count,
2026 struct vvp_io_args *args;
/* Try the PCC copy first; on a PCC hit the normal path is skipped. */
2033 result = pcc_file_splice_read(in_file, ppos, pipe,
2034 count, flags, &cached);
2038 ll_ras_enter(in_file, *ppos, count);
2040 env = cl_env_get(&refcheck);
2042 RETURN(PTR_ERR(env));
2044 args = ll_env_args(env, IO_SPLICE);
2045 args->u.splice.via_pipe = pipe;
2046 args->u.splice.via_flags = flags;
2048 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2049 cl_env_put(env, &refcheck);
2052 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2053 LUSTRE_FPRIVATE(in_file), *ppos, result,
/*
 * Set the LOV striping EA on a file by re-opening it by FID with the
 * given layout ('lum'), then closing the open handle again; the layout
 * is applied by the open intent on the MDT.
 */
2058 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2059 __u64 flags, struct lov_user_md *lum, int lum_size)
2061 struct lookup_intent oit = {
2063 .it_flags = flags | MDS_OPEN_BY_FID,
/* Userspace may hand us a big-endian lum; normalize to little endian
 * before sending (no-op on little-endian hosts). */
2068 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2069 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2070 /* this code will only exist for big-endian systems */
2071 lustre_swab_lov_user_md(lum, 0);
2074 ll_inode_size_lock(inode);
2075 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2077 GOTO(out_unlock, rc);
2079 ll_release_openhandle(dentry, &oit);
2082 ll_inode_size_unlock(inode);
2083 ll_intent_release(&oit);
/*
 * Fetch the LOV striping EA of 'filename' (child of 'inode') from the
 * MDS, validate its magic, convert it to host endianness for userspace
 * and return it via *lmmp / *lmm_size.  The caller owns *request and
 * must release it with ptlrpc_req_finished().
 */
2088 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2089 struct lov_mds_md **lmmp, int *lmm_size,
2090 struct ptlrpc_request **request)
2092 struct ll_sb_info *sbi = ll_i2sbi(inode);
2093 struct mdt_body *body;
2094 struct lov_mds_md *lmm = NULL;
2095 struct ptlrpc_request *req = NULL;
2096 struct md_op_data *op_data;
2099 rc = ll_get_default_mdsize(sbi, &lmmsize);
2103 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2104 strlen(filename), lmmsize,
2105 LUSTRE_OPC_ANY, NULL);
2106 if (IS_ERR(op_data))
2107 RETURN(PTR_ERR(op_data));
2109 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2110 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2111 ll_finish_md_op_data(op_data);
2113 CDEBUG(D_INFO, "md_getattr_name failed "
2114 "on %s: rc %d\n", filename, rc);
2118 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2119 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2121 lmmsize = body->mbo_eadatasize;
/* No striping EA present at all -> nothing to return. */
2123 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2125 GOTO(out, rc = -ENODATA);
2128 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2129 LASSERT(lmm != NULL);
/* Only plain V1/V3, composite (PFL) and foreign layouts are accepted. */
2131 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2132 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2133 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2134 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2135 GOTO(out, rc = -EPROTO);
2138 * This is coming from the MDS, so is probably in
2139 * little endian. We convert it to host endian before
2140 * passing it to userspace.
2142 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2143 __swab32(LOV_MAGIC_MAGIC)) {
2144 int stripe_count = 0;
2146 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2147 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2148 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
/* Released (HSM) files carry no real stripe objects. */
2149 if (le32_to_cpu(lmm->lmm_pattern) &
2150 LOV_PATTERN_F_RELEASED)
2154 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2156 /* if function called for directory - we should
2157 * avoid swab of non-existent lsm objects */
2158 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2159 lustre_swab_lov_user_md_objects(
2160 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2162 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2163 S_ISREG(body->mbo_mode))
2164 lustre_swab_lov_user_md_objects(
2165 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2171 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a caller-supplied striping EA from
 * userspace and apply it via ll_lov_setstripe_ea_info().  Requires
 * CAP_SYS_ADMIN since it sets raw object data (MDS_OPEN_HAS_OBJS).
 */
2176 static int ll_lov_setea(struct inode *inode, struct file *file,
2179 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2180 struct lov_user_md *lump;
2181 int lum_size = sizeof(struct lov_user_md) +
2182 sizeof(struct lov_user_ost_data);
2186 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2189 OBD_ALLOC_LARGE(lump, lum_size);
2193 if (copy_from_user(lump, arg, lum_size))
2194 GOTO(out_lump, rc = -EFAULT);
2196 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Clear the delay-create flag whether or not the setstripe worked. */
2198 cl_lov_delay_create_clear(&file->f_flags);
2201 OBD_FREE_LARGE(lump, lum_size);
/* Copy the file's current striping information to the userspace buffer
 * 'lum' (up to 'size' bytes) via the cl_object layer. */
2205 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2212 env = cl_env_get(&refcheck);
2214 RETURN(PTR_ERR(env));
2216 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2217 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's layout in, apply it,
 * refresh the layout generation, and echo the resulting striping back
 * to the user buffer.
 */
2221 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2224 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2225 struct lov_user_md *klum;
2227 __u64 flags = FMODE_WRITE;
2230 rc = ll_copy_user_md(lum, &klum);
2235 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero the user's stripe_count first so a failed getstripe below does
 * not leave stale data looking valid. */
2240 rc = put_user(0, &lum->lmm_stripe_count);
2244 rc = ll_layout_refresh(inode, &gen);
2248 rc = ll_file_getstripe(inode, arg, lum_size);
2250 cl_lov_delay_create_clear(&file->f_flags);
2253 OBD_FREE_LARGE(klum, lum_size);
/*
 * Take a group lock (gid = 'arg') on the file for this open descriptor.
 * Only one group id may be active on an inode at a time; a request for a
 * different gid waits (or fails with -EAGAIN under O_NONBLOCK) until the
 * current group drains.  State is guarded by lli->lli_group_mutex.
 */
2259 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2261 struct ll_inode_info *lli = ll_i2info(inode);
2262 struct cl_object *obj = lli->lli_clob;
2263 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2264 struct ll_grouplock grouplock;
2269 CWARN("group id for group lock must not be 0\n");
2273 if (ll_file_nolock(file))
2274 RETURN(-EOPNOTSUPP);
2276 if (file->f_flags & O_NONBLOCK) {
2277 if (!mutex_trylock(&lli->lli_group_mutex))
2280 mutex_lock(&lli->lli_group_mutex);
/* Each fd may hold at most one group lock. */
2282 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2283 CWARN("group lock already existed with gid %lu\n",
2284 fd->fd_grouplock.lg_gid);
2285 GOTO(out, rc = -EINVAL);
/* A different gid is active: wait for its users to drain, then retry
 * from the top (the mutex was dropped while waiting). */
2287 if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2288 if (file->f_flags & O_NONBLOCK)
2289 GOTO(out, rc = -EAGAIN);
2290 mutex_unlock(&lli->lli_group_mutex);
2291 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2292 GOTO(retry, rc = 0);
2294 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2297 * XXX: group lock needs to protect all OST objects while PFL
2298 * can add new OST objects during the IO, so we'd instantiate
2299 * all OST objects before getting its group lock.
2304 struct cl_layout cl = {
2305 .cl_is_composite = false,
2307 struct lu_extent ext = {
2309 .e_end = OBD_OBJECT_EOF,
2312 env = cl_env_get(&refcheck);
2314 GOTO(out, rc = PTR_ERR(env));
/* Composite (PFL) layout: instantiate the whole file's components
 * via a write intent before taking the group lock. */
2316 rc = cl_object_layout_get(env, obj, &cl);
2317 if (!rc && cl.cl_is_composite)
2318 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2321 cl_env_put(env, &refcheck);
2326 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2327 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Success: record the lock on the fd and bump the inode's user count. */
2332 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2333 fd->fd_grouplock = grouplock;
2334 if (lli->lli_group_users == 0)
2335 lli->lli_group_gid = grouplock.lg_gid;
2336 lli->lli_group_users++;
2338 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2340 mutex_unlock(&lli->lli_group_mutex);
/*
 * Release the group lock (gid = 'arg') held by this open descriptor.
 * Fails with -EINVAL if no group lock is held or the gid doesn't match.
 * The last user of a gid resets lli_group_gid and wakes waiters in
 * ll_get_grouplock().
 */
2345 static int ll_put_grouplock(struct inode *inode, struct file *file,
2348 struct ll_inode_info *lli = ll_i2info(inode);
2349 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2350 struct ll_grouplock grouplock;
2354 mutex_lock(&lli->lli_group_mutex);
2355 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2356 CWARN("no group lock held\n");
2357 GOTO(out, rc = -EINVAL);
2360 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2362 if (fd->fd_grouplock.lg_gid != arg) {
2363 CWARN("group lock %lu doesn't match current id %lu\n",
2364 arg, fd->fd_grouplock.lg_gid);
2365 GOTO(out, rc = -EINVAL);
/* Detach the lock from the fd before dropping it. */
2368 grouplock = fd->fd_grouplock;
2369 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2370 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2372 cl_put_grouplock(&grouplock);
2374 lli->lli_group_users--;
2375 if (lli->lli_group_users == 0) {
2376 lli->lli_group_gid = 0;
/* Wake anyone in ll_get_grouplock() waiting for this gid to drain. */
2377 wake_up_var(&lli->lli_group_users);
2379 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2382 mutex_unlock(&lli->lli_group_mutex);
2388 * Close inode open handle
2390 * \param dentry [in] dentry which contains the inode
2391 * \param it [in,out] intent which contains open info and result
2394 * \retval <0 failure
2396 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2398 struct inode *inode = dentry->d_inode;
2399 struct obd_client_handle *och;
2405 /* Root ? Do nothing. */
2406 if (dentry->d_inode->i_sb->s_root == dentry)
2409 /* No open handle to close? Move away */
2410 if (!it_disposition(it, DISP_OPEN_OPEN))
2413 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2415 OBD_ALLOC(och, sizeof(*och));
2417 GOTO(out, rc = -ENOMEM);
/* Build the client handle from the intent, then close it on the MDT. */
2419 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2423 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2425 /* this one is in place of ll_file_open */
2426 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2427 ptlrpc_req_finished(it->it_request);
2428 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2434 * Get size for inode for which FIEMAP mapping is requested.
2435 * Make the FIEMAP get_info call and returns the result.
2436 * \param fiemap kernel buffer to hold extents
2437 * \param num_bytes kernel buffer size
2439 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2445 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2448 /* Checks for fiemap flags */
2449 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report which flags we do not support back to the caller. */
2450 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2454 /* Check for FIEMAP_FLAG_SYNC */
2455 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2456 rc = filemap_fdatawrite(inode->i_mapping);
2461 env = cl_env_get(&refcheck);
2463 RETURN(PTR_ERR(env));
/* i_size of 0 may just mean "never glimpsed"; fetch the real size. */
2465 if (i_size_read(inode) == 0) {
2466 rc = ll_glimpse_size(inode);
2471 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2472 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2473 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2475 /* If filesize is 0, then there would be no objects for mapping */
2476 if (fmkey.lfik_oa.o_size == 0) {
2477 fiemap->fm_mapped_extents = 0;
2481 fmkey.lfik_fiemap = *fiemap;
2483 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2484 &fmkey, fiemap, &num_bytes);
2486 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Copies the getinfo_fid2path request in from userspace, appends the
 * client's root FID (for fileset mounts), issues the ioctl to the MDT,
 * and copies the result back.
 */
2490 int ll_fid2path(struct inode *inode, void __user *arg)
2492 struct obd_export *exp = ll_i2mdexp(inode);
2493 const struct getinfo_fid2path __user *gfin = arg;
2495 struct getinfo_fid2path *gfout;
/* FID->path can leak names; restrict to CAP_DAC_READ_SEARCH unless the
 * mount explicitly allows it for ordinary users. */
2501 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2502 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2505 /* Only need to get the buflen */
2506 if (get_user(pathlen, &gfin->gf_pathlen))
2509 if (pathlen > PATH_MAX)
2512 outsize = sizeof(*gfout) + pathlen;
2513 OBD_ALLOC(gfout, outsize);
2517 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2518 GOTO(gf_free, rc = -EFAULT);
2519 /* append root FID after gfout to let MDT know the root FID so that it
2520 * can lookup the correct path, this is mainly for fileset.
2521 * old server without fileset mount support will ignore this. */
2522 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2524 /* Call mdc_iocontrol */
2525 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2529 if (copy_to_user(arg, gfout, outsize))
2533 OBD_FREE(gfout, outsize);
/*
 * Fetch the file's data version (and layout version) by running a
 * CIT_DATA_VERSION cl_io.  Results are returned in ioc->idv_version and
 * ioc->idv_layout_version; ioc->idv_flags selects the flush behaviour.
 */
2538 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2540 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2548 ioc->idv_version = 0;
2549 ioc->idv_layout_version = UINT_MAX;
2551 /* If no file object initialized, we consider its version is 0. */
2555 env = cl_env_get(&refcheck);
2557 RETURN(PTR_ERR(env));
2559 io = vvp_env_thread_io(env);
2561 io->u.ci_data_version.dv_data_version = 0;
2562 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2563 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2566 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2567 result = cl_io_loop(env, io);
2569 result = io->ci_result;
2571 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2572 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2574 cl_io_fini(env, io);
/* Layout changed under us: retry the whole data-version io. */
2576 if (unlikely(io->ci_need_restart))
2579 cl_env_put(env, &refcheck);
2585 * Read the data_version for inode.
2587 * This value is computed using stripe object version on OST.
2588 * Version is computed using server side locking.
2590 * @param flags if do sync on the OST side;
2592 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2593 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper around ll_ioc_data_version(): returns only the data
 * version through *data_version; flags are LL_DV_* flush flags. */
2595 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2597 	struct ioc_data_version ioc = { .idv_flags = flags };
2600 	rc = ll_ioc_data_version(inode, &ioc);
2602 		*data_version = ioc.idv_version;
2608 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease with
 * MDS_OPEN_RELEASE, flush and grab the latest data version
 * (LL_DV_WR_FLUSH drops cached pages), merge attributes, then close
 * the open handle with MDS_HSM_RELEASE so the MDT can free the OST
 * objects.  On the error path the lease is closed explicitly.
 * NOTE(review): elided view — several error checks between the visible
 * lines are not shown.
 */
2610 int ll_hsm_release(struct inode *inode)
2613 	struct obd_client_handle *och = NULL;
2614 	__u64 data_version = 0;
2619 	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2620 	       ll_i2sbi(inode)->ll_fsname,
2621 	       PFID(&ll_i2info(inode)->lli_fid));
2623 	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2625 		GOTO(out, rc = PTR_ERR(och));
2627 	/* Grab latest data_version and [am]time values */
2628 	rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2632 	env = cl_env_get(&refcheck);
2634 		GOTO(out, rc = PTR_ERR(env));
2636 	rc = ll_merge_attr(env, inode);
2637 	cl_env_put(env, &refcheck);
2639 	/* If error happen, we have the wrong size for a file.
2645 	/* Release the file.
2646 	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2647 	 * we still need it to pack l_remote_handle to MDT. */
2648 	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2654 	if (och != NULL && !IS_ERR(och)) /* close the file */
2655 		ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes whose layouts are
 * being swapped (plus, in the elided fields, their data versions and
 * the check_dv flags). */
2660 struct ll_swap_stack {
2663 	struct inode *inode1;
2664 	struct inode *inode2;
/*
 * Swap the layouts of the two open files via the MDT
 * (LL_IOC_LOV_SWAP_LAYOUTS).  Orders the pair by FID to serialize
 * concurrent swaps, optionally takes group locks to flush dirty cache,
 * and optionally verifies that each file's data version is still the
 * one the caller observed (SWAP_LAYOUTS_CHECK_DV*), failing with
 * -EAGAIN if not.
 * NOTE(review): elided view — allocation checks and some GOTO targets
 * are not visible.
 */
2669 static int ll_swap_layouts(struct file *file1, struct file *file2,
2670 			   struct lustre_swap_layouts *lsl)
2672 	struct mdc_swap_layouts	 msl;
2673 	struct md_op_data	*op_data;
2676 	struct ll_swap_stack	*llss = NULL;
2679 	OBD_ALLOC_PTR(llss);
2683 	llss->inode1 = file_inode(file1);
2684 	llss->inode2 = file_inode(file2);
2686 	rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2690 	/* we use 2 bool because it is easier to swap than 2 bits */
2691 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2692 		llss->check_dv1 = true;
2694 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2695 		llss->check_dv2 = true;
2697 	/* we cannot use lsl->sl_dvX directly because we may swap them */
2698 	llss->dv1 = lsl->sl_dv1;
2699 	llss->dv2 = lsl->sl_dv2;
2701 	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2702 	if (rc == 0) /* same file, done! */
	/* canonical FID order so two concurrent swaps cannot deadlock */
2705 	if (rc < 0) { /* sequentialize it */
2706 		swap(llss->inode1, llss->inode2);
2708 		swap(llss->dv1, llss->dv2);
2709 		swap(llss->check_dv1, llss->check_dv2);
2713 	if (gid != 0) { /* application asks to flush dirty cache */
2714 		rc = ll_get_grouplock(llss->inode1, file1, gid);
2718 		rc = ll_get_grouplock(llss->inode2, file2, gid);
2720 			ll_put_grouplock(llss->inode1, file1, gid);
2725 	/* ultimate check, before swaping the layouts we check if
2726 	 * dataversion has changed (if requested) */
2727 	if (llss->check_dv1) {
2728 		rc = ll_data_version(llss->inode1, &dv, 0);
2731 		if (dv != llss->dv1)
2732 			GOTO(putgl, rc = -EAGAIN);
2735 	if (llss->check_dv2) {
2736 		rc = ll_data_version(llss->inode2, &dv, 0);
2739 		if (dv != llss->dv2)
2740 			GOTO(putgl, rc = -EAGAIN);
2743 	/* struct md_op_data is used to send the swap args to the mdt
2744 	 * only flags is missing, so we use struct mdc_swap_layouts
2745 	 * through the md_op_data->op_data */
2746 	/* flags from user space have to be converted before they are send to
2747 	 * server, no flag is sent today, they are only used on the client */
2750 	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2751 				     0, LUSTRE_OPC_ANY, &msl);
2752 	if (IS_ERR(op_data))
2753 		GOTO(free, rc = PTR_ERR(op_data));
2755 	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2756 			   sizeof(*op_data), op_data, NULL);
2757 	ll_finish_md_op_data(op_data);
	/* release group locks in reverse acquisition order */
2764 		ll_put_grouplock(llss->inode2, file2, gid);
2765 		ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode (LL_IOC_HSM_STATE_SET).
 * Validates the masks (non-root may only touch HSM_USER_MASK bits) and,
 * for servers without archive-id arrays, range-checks the archive id,
 * then forwards the request to the MDT.
 */
2775 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2777 	struct obd_export *exp = ll_i2mdexp(inode);
2778 	struct md_op_data *op_data;
2782 	/* Detect out-of range masks */
2783 	if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2786 	/* Non-root users are forbidden to set or clear flags which are
2787 	 * NOT defined in HSM_USER_MASK. */
2788 	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2789 	    !cfs_capable(CFS_CAP_SYS_ADMIN))
2792 	if (!exp_connect_archive_id_array(exp)) {
2793 		/* Detect out-of range archive id */
2794 		if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2795 		    (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2799 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2800 				     LUSTRE_OPC_ANY, hss);
2801 	if (IS_ERR(op_data))
2802 		RETURN(PTR_ERR(op_data));
2804 	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2807 	ll_finish_md_op_data(op_data);
/*
 * Import a file that already exists in the HSM archive: mark it
 * ARCHIVED|EXISTS|RELEASED via ll_hsm_state_set(), then force the
 * archived attributes (mode/uid/gid/size/times) onto the inode with
 * ll_setattr_raw().  Regular files only.
 * NOTE(review): elided view — allocations, inode_lock and the free/out
 * paths are only partially visible.
 */
2812 static int ll_hsm_import(struct inode *inode, struct file *file,
2813 			 struct hsm_user_import *hui)
2815 	struct hsm_state_set	*hss = NULL;
2816 	struct iattr		*attr = NULL;
2820 	if (!S_ISREG(inode->i_mode))
2826 		GOTO(out, rc = -ENOMEM);
2828 	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2829 	hss->hss_archive_id = hui->hui_archive_id;
2830 	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2831 	rc = ll_hsm_state_set(inode, hss);
2835 	OBD_ALLOC_PTR(attr);
2837 		GOTO(out, rc = -ENOMEM);
	/* keep only permission bits from the archive, force regular file */
2839 	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2840 	attr->ia_mode |= S_IFREG;
2841 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2842 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2843 	attr->ia_size = hui->hui_size;
2844 	attr->ia_mtime.tv_sec = hui->hui_mtime;
2845 	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2846 	attr->ia_atime.tv_sec = hui->hui_atime;
2847 	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2849 	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2850 			 ATTR_UID | ATTR_GID |
2851 			 ATTR_MTIME | ATTR_MTIME_SET |
2852 			 ATTR_ATIME | ATTR_ATIME_SET;
2856 	rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2860 	inode_unlock(inode);
/* Map an fmode_t to the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace (0 when neither READ nor WRITE is set). */
2872 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2874 	return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2875 	       ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 helper: set atime/mtime/ctime on a regular file from
 * the user-supplied ll_futimes_3, including ctime (hence the
 * OP_XVALID_CTIME_SET xvalid flag and the CAP_SYS_ADMIN requirement).
 */
2878 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2880 	struct inode *inode = file_inode(file);
2882 		.ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2883 			    ATTR_MTIME | ATTR_MTIME_SET |
2886 			.tv_sec = lfu->lfu_atime_sec,
2887 			.tv_nsec = lfu->lfu_atime_nsec,
2890 			.tv_sec = lfu->lfu_mtime_sec,
2891 			.tv_nsec = lfu->lfu_mtime_nsec,
2894 			.tv_sec = lfu->lfu_ctime_sec,
2895 			.tv_nsec = lfu->lfu_ctime_nsec,
	/* setting ctime directly is privileged */
2901 	if (!capable(CAP_SYS_ADMIN))
2904 	if (!S_ISREG(inode->i_mode))
2908 	rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2910 	inode_unlock(inode);
/* Translate the userspace lockahead mode (MODE_READ_USER /
 * MODE_WRITE_USER) into the kernel cl_lock_mode; the default case is
 * outside this elided view. */
2915 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2918 	case MODE_READ_USER:
2920 	case MODE_WRITE_USER:
/* printable names for the user lock modes, indexed by mode value */
2927 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2929 /* Used to allow the upper layers of the client to request an LDLM lock
2930 * without doing an actual read or write.
2932 * Used for ladvise lockahead to manually request specific locks.
2934 * \param[in] file file this ladvise lock request is on
2935 * \param[in] ladvise ladvise struct describing this lock request
2937 * \retval 0 success, no detailed result available (sync requests
2938 * and requests sent to the server [not handled locally]
2939 * cannot return detailed results)
2940 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2941 * see definitions for details.
2942 * \retval negative negative errno on error
/*
 * Request an LDLM extent lock without doing IO (ladvise lockahead).
 * Builds a CIT_MISC io, fills a cl_lock_descr from the byte range in
 * @ladvise (converted to page indices), and enqueues it with CEF_MUST
 * so it is never downgraded to a lockless lock.  -ECANCELED/-EEXIST
 * from the enqueue are mapped to the positive LLA_RESULT_* codes.
 */
2944 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2946 	struct lu_env *env = NULL;
2947 	struct cl_io *io  = NULL;
2948 	struct cl_lock *lock = NULL;
2949 	struct cl_lock_descr *descr = NULL;
2950 	struct dentry *dentry = file->f_path.dentry;
2951 	struct inode *inode = dentry->d_inode;
2952 	enum cl_lock_mode cl_mode;
2953 	off_t start = ladvise->lla_start;
2954 	off_t end = ladvise->lla_end;
2960 	CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2961 	       "start=%llu, end=%llu\n", dentry->d_name.len,
2962 	       dentry->d_name.name, dentry->d_inode,
2963 	       user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2966 	cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2968 		GOTO(out, result = cl_mode);
2970 	/* Get IO environment */
2971 	result = cl_io_get(inode, &env, &io, &refcheck);
2975 	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
	/* positive cl_io_init result: the layout says there is
2978 	 * nothing to do for this io. This currently happens when
2979 	 * stripe sub-object's are not yet created.
2981 		result = io->ci_result;
2982 	} else if (result == 0) {
2983 		lock = vvp_env_lock(env);
2984 		descr = &lock->cll_descr;
2986 		descr->cld_obj = io->ci_obj;
2987 		/* Convert byte offsets to pages */
2988 		descr->cld_start = cl_index(io->ci_obj, start);
2989 		descr->cld_end = cl_index(io->ci_obj, end);
2990 		descr->cld_mode = cl_mode;
2991 		/* CEF_MUST is used because we do not want to convert a
2992 		 * lockahead request to a lockless lock */
2993 		descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2996 		if (ladvise->lla_peradvice_flags & LF_ASYNC)
2997 			descr->cld_enq_flags |= CEF_SPECULATIVE;
2999 		result = cl_lock_request(env, io, lock);
3001 		/* On success, we need to release the lock */
3003 			cl_lock_release(env, lock);
3005 	cl_io_fini(env, io);
3006 	cl_env_put(env, &refcheck);
3008 	/* -ECANCELED indicates a matching lock with a different extent
3009 	 * was already present, and -EEXIST indicates a matching lock
3010 	 * on exactly the same extent was already present.
3011 	 * We convert them to positive values for userspace to make
3012 	 * recognizing true errors easier.
3013 	 * Note we can only return these detailed results on async requests,
3014 	 * as sync requests look the same as i/o requests for locking. */
3015 	if (result == -ECANCELED)
3016 		result = LLA_RESULT_DIFFERENT;
3017 	else if (result == -EEXIST)
3018 		result = LLA_RESULT_SAME;
/* printable names for ladvise advices, indexed by advice value */
3023 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise record before it is acted on: the
 * advice value, the per-advice flags (LOCKNOEXPAND has its own mask),
 * the lockahead mode, and — for range-based advices — that
 * lla_start < lla_end.  Each failure is logged with the computed rc.
 */
3025 static int ll_ladvise_sanity(struct inode *inode,
3026 			     struct llapi_lu_ladvise *ladvise)
3028 	struct ll_sb_info *sbi = ll_i2sbi(inode);
3029 	enum lu_ladvise_type advice = ladvise->lla_advice;
3030 	/* Note the peradvice flags is a 32 bit field, so per advice flags must
3031 	 * be in the first 32 bits of enum ladvise_flags */
3032 	__u32 flags = ladvise->lla_peradvice_flags;
3033 	/* 3 lines at 80 characters per line, should be plenty */
3036 	if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3038 		CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3039 		       "last supported advice is %s (value '%d'): rc = %d\n",
3040 		       sbi->ll_fsname, advice,
3041 		       ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3045 	/* Per-advice checks */
3047 	case LU_LADVISE_LOCKNOEXPAND:
3048 		if (flags & ~LF_LOCKNOEXPAND_MASK) {
3050 			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3051 			       "rc = %d\n", sbi->ll_fsname, flags,
3052 			       ladvise_names[advice], rc);
3056 	case LU_LADVISE_LOCKAHEAD:
3057 		/* Currently only READ and WRITE modes can be requested */
3058 		if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3059 		    ladvise->lla_lockahead_mode == 0) {
3061 			CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3062 			       "rc = %d\n", sbi->ll_fsname,
3063 			       ladvise->lla_lockahead_mode,
3064 			       ladvise_names[advice], rc);
3068 	case LU_LADVISE_WILLREAD:
3069 	case LU_LADVISE_DONTNEED:
3071 		/* Note fall through above - These checks apply to all advices
3072 		 * except LOCKNOEXPAND */
3073 		if (flags & ~LF_DEFAULT_MASK) {
3075 			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3076 			       "rc = %d\n", sbi->ll_fsname, flags,
3077 			       ladvise_names[advice], rc);
3080 		if (ladvise->lla_start >= ladvise->lla_end) {
3082 			CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3083 			       "for %s: rc = %d\n", sbi->ll_fsname,
3084 			       ladvise->lla_start, ladvise->lla_end,
3085 			       ladvise_names[advice], rc);
3097 * Give file access advices
3099 * The ladvise interface is similar to Linux fadvise() system call, except it
3100 * forwards the advices directly from Lustre client to server. The server side
3101 * codes will apply appropriate read-ahead and caching techniques for the
3102 * corresponding files.
3104 * A typical workload for ladvise is e.g. a bunch of different clients are
3105 * doing small random reads of a file, so prefetching pages into OSS cache
3106 * with big linear reads before the random IO is a net benefit. Fetching
3107 * all that data into each client cache with fadvise() may not be, due to
3108 * much more data being sent to the client.
/*
 * Forward one ladvise record to the server(s) through a CIT_LADVISE
 * cl_io: fill cl_ladvise_io with the range, FID, advice and flags, then
 * run the IO loop.
 */
3110 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3111 		      struct llapi_lu_ladvise *ladvise)
3115 	struct cl_ladvise_io *lio;
3120 	env = cl_env_get(&refcheck);
3122 		RETURN(PTR_ERR(env));
3124 	io = vvp_env_thread_io(env);
3125 	io->ci_obj = ll_i2info(inode)->lli_clob;
3127 	/* initialize parameters for ladvise */
3128 	lio = &io->u.ci_ladvise;
3129 	lio->li_start = ladvise->lla_start;
3130 	lio->li_end = ladvise->lla_end;
3131 	lio->li_fid = ll_inode2fid(inode);
3132 	lio->li_advice = ladvise->lla_advice;
3133 	lio->li_flags = flags;
3135 	if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3136 		rc = cl_io_loop(env, io);
3140 	cl_io_fini(env, io);
3141 	cl_env_put(env, &refcheck);
/* LU_LADVISE_LOCKNOEXPAND: record per-fd whether DLM lock expansion is
 * disabled (set unless LF_UNSET is passed). */
3145 static int ll_lock_noexpand(struct file *file, int flags)
3147 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3149 	fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: report the inode's xflags (derived
 * from i_flags, plus PROJINHERIT when LLIF_PROJECT_INHERIT is set) and
 * project id back to userspace in a struct fsxattr.
 */
3154 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3157 	struct fsxattr fsxattr;
3159 	if (copy_from_user(&fsxattr,
3160 			   (const struct fsxattr __user *)arg,
3164 	fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3165 	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3166 		fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3167 	fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3168 	if (copy_to_user((struct fsxattr __user *)arg,
3169 			 &fsxattr, sizeof(fsxattr)))
/*
 * Permission check for project-quota changes: outside the init user
 * namespace, refuse any request that would change the project id or
 * toggle the PROJINHERIT flag; everything else is allowed.
 */
3175 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
	/*
3178 	 * Project Quota ID state is only allowed to change from within the init
3179 	 * namespace. Enforce that restriction only if we are trying to change
3180 	 * the quota ID state. Everything else is allowed in user namespaces.
	 */
3182 	if (current_user_ns() == &init_user_ns)
3185 	if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3188 	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3189 		if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3192 		if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/*
 * FS_IOC_FSSETXATTR-style handler: after ll_ioctl_check_project(),
 * push the new flags and project id to the MDT via md_setattr(), update
 * the local inode flags, and then propagate the flags to the OSTs with
 * cl_setattr_ost() when the file has a cl_object.
 * NOTE(review): elided view — req/attr handling and the out_fsxattr
 * label are only partially visible.
 */
3199 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3203 	struct md_op_data *op_data;
3204 	struct ptlrpc_request *req = NULL;
3206 	struct fsxattr fsxattr;
3207 	struct cl_object *obj;
3211 	if (copy_from_user(&fsxattr,
3212 			   (const struct fsxattr __user *)arg,
3216 	rc = ll_ioctl_check_project(inode, &fsxattr);
3220 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3221 				     LUSTRE_OPC_ANY, NULL);
3222 	if (IS_ERR(op_data))
3223 		RETURN(PTR_ERR(op_data));
3225 	flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3226 	op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3227 	if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3228 		op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3229 	op_data->op_projid = fsxattr.fsx_projid;
3230 	op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3231 	rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3233 	ptlrpc_req_finished(req);
3235 		GOTO(out_fsxattr, rc);
3236 	ll_update_inode_flags(inode, op_data->op_attr_flags);
3237 	obj = ll_i2info(inode)->lli_clob;
3239 		GOTO(out_fsxattr, rc);
3241 	OBD_ALLOC_PTR(attr);
3243 		GOTO(out_fsxattr, rc = -ENOMEM);
3245 	rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3246 			    fsxattr.fsx_xflags);
3249 	ll_finish_md_op_data(op_data);
/*
 * LL_LEASE_UNLCK path of the lease ioctl: detach the fd's lease open
 * handle and close it, optionally with a close intent selected by
 * ioc->lil_flags (resync-done, layout merge, layout split, PCC attach).
 * Returns the lease type that was held, computed from the och's fmode.
 *
 * Fix in this revision: two copy_from_user() calls had '&param'
 * corrupted into the pilcrow character ('¶m...'), which does not
 * compile; restored to '&param.pa_archive_id' / '&param.pa_data_version'.
 * NOTE(review): elided view — several branches and labels between the
 * visible lines are not shown.
 */
3253 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3256 	struct inode *inode = file_inode(file);
3257 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3258 	struct ll_inode_info *lli = ll_i2info(inode);
3259 	struct obd_client_handle *och = NULL;
3260 	struct split_param sp;
3261 	struct pcc_param param;
3262 	bool lease_broken = false;
3264 	enum mds_op_bias bias = 0;
3265 	struct file *layout_file = NULL;
3267 	size_t data_size = 0;
3268 	bool attached = false;
	/* steal the lease handle from the fd under the och mutex */
3273 	mutex_lock(&lli->lli_och_mutex);
3274 	if (fd->fd_lease_och != NULL) {
3275 		och = fd->fd_lease_och;
3276 		fd->fd_lease_och = NULL;
3278 	mutex_unlock(&lli->lli_och_mutex);
3283 	fmode = och->och_flags;
3285 	switch (ioc->lil_flags) {
3286 	case LL_LEASE_RESYNC_DONE:
3287 		if (ioc->lil_count > IOC_IDS_MAX)
3288 			GOTO(out_lease_close, rc = -EINVAL);
3290 		data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3291 		OBD_ALLOC(data, data_size);
3293 			GOTO(out_lease_close, rc = -ENOMEM);
3295 		if (copy_from_user(data, (void __user *)arg, data_size))
3296 			GOTO(out_lease_close, rc = -EFAULT);
3298 		bias = MDS_CLOSE_RESYNC_DONE;
3300 	case LL_LEASE_LAYOUT_MERGE: {
3303 		if (ioc->lil_count != 1)
3304 			GOTO(out_lease_close, rc = -EINVAL);
		/* the victim fd follows the ioc header in the user buffer */
3306 		arg += sizeof(*ioc);
3307 		if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3308 			GOTO(out_lease_close, rc = -EFAULT);
3310 		layout_file = fget(fd);
3312 			GOTO(out_lease_close, rc = -EBADF);
3314 		if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3315 		    (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3316 			GOTO(out_lease_close, rc = -EPERM);
3318 		data = file_inode(layout_file);
3319 		bias = MDS_CLOSE_LAYOUT_MERGE;
3322 	case LL_LEASE_LAYOUT_SPLIT: {
3326 		if (ioc->lil_count != 2)
3327 			GOTO(out_lease_close, rc = -EINVAL);
3329 		arg += sizeof(*ioc);
3330 		if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3331 			GOTO(out_lease_close, rc = -EFAULT);
3333 		arg += sizeof(__u32);
3334 		if (copy_from_user(&mirror_id, (void __user *)arg,
3336 			GOTO(out_lease_close, rc = -EFAULT);
3338 		layout_file = fget(fdv);
3340 			GOTO(out_lease_close, rc = -EBADF);
3342 		sp.sp_inode = file_inode(layout_file);
3343 		sp.sp_mirror_id = (__u16)mirror_id;
3345 		bias = MDS_CLOSE_LAYOUT_SPLIT;
3348 	case LL_LEASE_PCC_ATTACH:
3349 		if (ioc->lil_count != 1)
3352 		arg += sizeof(*ioc);
3353 		if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3355 			GOTO(out_lease_close, rc2 = -EFAULT);
3357 		rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3359 			GOTO(out_lease_close, rc2);
3362 		/* Grab latest data version */
3363 		rc2 = ll_data_version(inode, &param.pa_data_version,
3366 			GOTO(out_lease_close, rc2);
3369 		bias = MDS_PCC_ATTACH;
3372 		/* without close intent */
3377 	rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3381 	rc = ll_lease_och_release(inode, file);
	/* per-intent cleanup after the close */
3390 	switch (ioc->lil_flags) {
3391 	case LL_LEASE_RESYNC_DONE:
3393 			OBD_FREE(data, data_size);
3395 	case LL_LEASE_LAYOUT_MERGE:
3396 	case LL_LEASE_LAYOUT_SPLIT:
3400 	case LL_LEASE_PCC_ATTACH:
3403 		rc = pcc_readwrite_attach_fini(file, inode,
3404 					       param.pa_layout_gen,
3411 	rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: for RDLCK/WRLCK, open a lease matching the
 * file's access mode (optionally with MDS_OPEN_RESYNC) and stash the
 * open handle in fd->fd_lease_och; UNLCK is delegated to
 * ll_file_unlock_lease().  If another lease is already cached on the
 * fd, the freshly opened one is closed again.
 */
3415 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3418 	struct inode *inode = file_inode(file);
3419 	struct ll_inode_info *lli = ll_i2info(inode);
3420 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3421 	struct obd_client_handle *och = NULL;
3422 	__u64 open_flags = 0;
3428 	switch (ioc->lil_mode) {
3429 	case LL_LEASE_WRLCK:
		/* a write lease requires the fd itself to be writable */
3430 		if (!(file->f_mode & FMODE_WRITE))
3432 		fmode = FMODE_WRITE;
3434 	case LL_LEASE_RDLCK:
3435 		if (!(file->f_mode & FMODE_READ))
3439 	case LL_LEASE_UNLCK:
3440 		RETURN(ll_file_unlock_lease(file, ioc, arg));
3445 	CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3447 	/* apply for lease */
3448 	if (ioc->lil_flags & LL_LEASE_RESYNC)
3449 		open_flags = MDS_OPEN_RESYNC;
3450 	och = ll_lease_open(inode, file, fmode, open_flags);
3452 		RETURN(PTR_ERR(och));
3454 	if (ioc->lil_flags & LL_LEASE_RESYNC) {
3455 		rc = ll_lease_file_resync(och, inode, arg);
3457 			ll_lease_close(och, inode, NULL);
3460 		rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3462 			ll_lease_close(och, inode, NULL);
3468 	mutex_lock(&lli->lli_och_mutex);
3469 	if (fd->fd_lease_och == NULL) {
3470 		fd->fd_lease_och = och;
3473 	mutex_unlock(&lli->lli_och_mutex);
3475 	/* impossible now that only excl is supported for now */
3476 		ll_lease_close(och, inode, &lease_broken);
/*
 * Snapshot the inode's file-heat values into @heat: copies the heat
 * flags and, for each requested slot, the decayed heat computed by
 * obd_heat_get() using the superblock's decay weight and period.
 * All reads happen under lli_heat_lock.
 */
3482 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3484 	struct ll_inode_info *lli = ll_i2info(inode);
3485 	struct ll_sb_info *sbi = ll_i2sbi(inode);
3486 	__u64 now = ktime_get_real_seconds();
3489 	spin_lock(&lli->lli_heat_lock);
3490 	heat->lh_flags = lli->lli_heat_flags;
3491 	for (i = 0; i < heat->lh_count; i++)
3492 		heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3493 						now, sbi->ll_heat_decay_weight,
3494 						sbi->ll_heat_period_second);
3495 	spin_unlock(&lli->lli_heat_lock);
/*
 * Update the inode's heat flags under lli_heat_lock:
 * LU_HEAT_FLAG_CLEAR resets all heat instances, and LU_HEAT_FLAG_OFF
 * toggles heat accounting off (its absence turns it back on).
 */
3498 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3500 	struct ll_inode_info *lli = ll_i2info(inode);
3503 	spin_lock(&lli->lli_heat_lock);
3504 	if (flags & LU_HEAT_FLAG_CLEAR)
3505 		obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3507 	if (flags & LU_HEAT_FLAG_OFF)
3508 		lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3510 		lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3512 	spin_unlock(&lli->lli_heat_lock);
/*
 * Main ioctl dispatcher for regular files: one big switch over the
 * Lustre- and FS-specific ioctl commands.  Fixed-size arguments are
 * copied from userspace, heavier requests are delegated to the
 * dedicated helpers above, and anything unrecognized falls through to
 * obd_iocontrol() on the data export.
 * NOTE(review): elided view — many declarations, RETURN statements and
 * braces between the visible lines are not shown.
 */
3518 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3520 	struct inode *inode = file_inode(file);
3521 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3525 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3526 	       PFID(ll_inode2fid(inode)), inode, cmd);
3527 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3529 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3530 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3534 	case LL_IOC_GETFLAGS:
3535 		/* Get the current value of the file flags */
3536 		return put_user(fd->fd_flags, (int __user *)arg);
3537 	case LL_IOC_SETFLAGS:
3538 	case LL_IOC_CLRFLAGS:
3539 		/* Set or clear specific file flags */
3540 		/* XXX This probably needs checks to ensure the flags are
3541 		 * not abused, and to handle any flag side effects.
3543 		if (get_user(flags, (int __user *) arg))
3546 		if (cmd == LL_IOC_SETFLAGS) {
3547 			if ((flags & LL_FILE_IGNORE_LOCK) &&
3548 			    !(file->f_flags & O_DIRECT)) {
3549 				CERROR("%s: unable to disable locking on "
3550 				       "non-O_DIRECT file\n", current->comm);
3554 			fd->fd_flags |= flags;
3556 			fd->fd_flags &= ~flags;
3559 	case LL_IOC_LOV_SETSTRIPE:
3560 	case LL_IOC_LOV_SETSTRIPE_NEW:
3561 		RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3562 	case LL_IOC_LOV_SETEA:
3563 		RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3564 	case LL_IOC_LOV_SWAP_LAYOUTS: {
3566 		struct lustre_swap_layouts lsl;
3568 		if (copy_from_user(&lsl, (char __user *)arg,
3569 				   sizeof(struct lustre_swap_layouts)))
3572 		if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3575 		file2 = fget(lsl.sl_fd);
3579 		/* O_WRONLY or O_RDWR */
3580 		if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3581 			GOTO(out, rc = -EPERM);
3583 		if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3584 			struct inode			*inode2;
3585 			struct ll_inode_info		*lli;
3586 			struct obd_client_handle	*och = NULL;
3588 			lli = ll_i2info(inode);
3589 			mutex_lock(&lli->lli_och_mutex);
3590 			if (fd->fd_lease_och != NULL) {
3591 				och = fd->fd_lease_och;
3592 				fd->fd_lease_och = NULL;
3594 			mutex_unlock(&lli->lli_och_mutex);
3596 				GOTO(out, rc = -ENOLCK);
3597 			inode2 = file_inode(file2);
3598 			rc = ll_swap_layouts_close(och, inode, inode2);
3600 			rc = ll_swap_layouts(file, file2, &lsl);
3606 	case LL_IOC_LOV_GETSTRIPE:
3607 	case LL_IOC_LOV_GETSTRIPE_NEW:
3608 		RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3609 	case FS_IOC_GETFLAGS:
3610 	case FS_IOC_SETFLAGS:
3611 		RETURN(ll_iocontrol(inode, file, cmd, arg));
3612 	case FSFILT_IOC_GETVERSION:
3613 	case FS_IOC_GETVERSION:
3614 		RETURN(put_user(inode->i_generation, (int __user *)arg));
3615 	/* We need to special case any other ioctls we want to handle,
3616 	 * to send them to the MDS/OST as appropriate and to properly
3617 	 * network encode the arg field. */
3618 	case FS_IOC_SETVERSION:
3621 	case LL_IOC_GROUP_LOCK:
3622 		RETURN(ll_get_grouplock(inode, file, arg));
3623 	case LL_IOC_GROUP_UNLOCK:
3624 		RETURN(ll_put_grouplock(inode, file, arg));
3625 	case IOC_OBD_STATFS:
3626 		RETURN(ll_obd_statfs(inode, (void __user *)arg));
3628 	case LL_IOC_FLUSHCTX:
3629 		RETURN(ll_flush_ctx(inode));
3630 	case LL_IOC_PATH2FID: {
3631 		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3632 				 sizeof(struct lu_fid)))
3637 	case LL_IOC_GETPARENT:
3638 		RETURN(ll_getparent(file, (struct getparent __user *)arg));
3640 	case OBD_IOC_FID2PATH:
3641 		RETURN(ll_fid2path(inode, (void __user *)arg));
3642 	case LL_IOC_DATA_VERSION: {
3643 		struct ioc_data_version	idv;
3646 		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
		/* only the flush flags are meaningful from userspace */
3649 		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3650 		rc = ll_ioc_data_version(inode, &idv);
3653 		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3659 	case LL_IOC_GET_MDTIDX: {
3662 		mdtidx = ll_get_mdt_idx(inode);
3666 		if (put_user((int)mdtidx, (int __user *)arg))
3671 	case OBD_IOC_GETDTNAME:
3672 	case OBD_IOC_GETMDNAME:
3673 		RETURN(ll_get_obd_name(inode, cmd, arg));
3674 	case LL_IOC_HSM_STATE_GET: {
3675 		struct md_op_data	*op_data;
3676 		struct hsm_user_state	*hus;
3683 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3684 					     LUSTRE_OPC_ANY, hus);
3685 		if (IS_ERR(op_data)) {
3687 			RETURN(PTR_ERR(op_data));
3690 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3693 		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3696 		ll_finish_md_op_data(op_data);
3700 	case LL_IOC_HSM_STATE_SET: {
3701 		struct hsm_state_set	*hss;
3708 		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3713 		rc = ll_hsm_state_set(inode, hss);
3718 	case LL_IOC_HSM_ACTION: {
3719 		struct md_op_data		*op_data;
3720 		struct hsm_current_action	*hca;
3727 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3728 					     LUSTRE_OPC_ANY, hca);
3729 		if (IS_ERR(op_data)) {
3731 			RETURN(PTR_ERR(op_data));
3734 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3737 		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3740 		ll_finish_md_op_data(op_data);
3744 	case LL_IOC_SET_LEASE_OLD: {
3745 		struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3747 		RETURN(ll_file_set_lease(file, &ioc, 0));
3749 	case LL_IOC_SET_LEASE: {
3750 		struct ll_ioc_lease ioc;
3752 		if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3755 		RETURN(ll_file_set_lease(file, &ioc, arg));
3757 	case LL_IOC_GET_LEASE: {
3758 		struct ll_inode_info *lli = ll_i2info(inode);
3759 		struct ldlm_lock *lock = NULL;
3762 		mutex_lock(&lli->lli_och_mutex);
3763 		if (fd->fd_lease_och != NULL) {
3764 			struct obd_client_handle *och = fd->fd_lease_och;
3766 			lock = ldlm_handle2lock(&och->och_lease_handle);
3768 				lock_res_and_lock(lock);
3769 				if (!ldlm_is_cancel(lock))
3770 					fmode = och->och_flags;
3772 				unlock_res_and_lock(lock);
3773 				LDLM_LOCK_PUT(lock);
3776 		mutex_unlock(&lli->lli_och_mutex);
3778 		RETURN(ll_lease_type_from_fmode(fmode));
3780 	case LL_IOC_HSM_IMPORT: {
3781 		struct hsm_user_import *hui;
3787 		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3792 		rc = ll_hsm_import(inode, file, hui);
3797 	case LL_IOC_FUTIMES_3: {
3798 		struct ll_futimes_3 lfu;
3800 		if (copy_from_user(&lfu,
3801 				   (const struct ll_futimes_3 __user *)arg,
3805 		RETURN(ll_file_futimes_3(file, &lfu));
3807 	case LL_IOC_LADVISE: {
3808 		struct llapi_ladvise_hdr *k_ladvise_hdr;
3809 		struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3812 		int alloc_size = sizeof(*k_ladvise_hdr);
3815 		u_ladvise_hdr = (void __user *)arg;
3816 		OBD_ALLOC_PTR(k_ladvise_hdr);
3817 		if (k_ladvise_hdr == NULL)
		/* first copy just the header to learn lah_count */
3820 		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3821 			GOTO(out_ladvise, rc = -EFAULT);
3823 		if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3824 		    k_ladvise_hdr->lah_count < 1)
3825 			GOTO(out_ladvise, rc = -EINVAL);
3827 		num_advise = k_ladvise_hdr->lah_count;
3828 		if (num_advise >= LAH_COUNT_MAX)
3829 			GOTO(out_ladvise, rc = -EFBIG);
		/* reallocate at full size and re-copy header + advices */
3831 		OBD_FREE_PTR(k_ladvise_hdr);
3832 		alloc_size = offsetof(typeof(*k_ladvise_hdr),
3833 				      lah_advise[num_advise]);
3834 		OBD_ALLOC(k_ladvise_hdr, alloc_size);
3835 		if (k_ladvise_hdr == NULL)
		/*
3839 		 * TODO: submit multiple advices to one server in a single RPC
		 */
3841 		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3842 			GOTO(out_ladvise, rc = -EFAULT);
3844 		for (i = 0; i < num_advise; i++) {
3845 			struct llapi_lu_ladvise *k_ladvise =
3846 					&k_ladvise_hdr->lah_advise[i];
3847 			struct llapi_lu_ladvise __user *u_ladvise =
3848 					&u_ladvise_hdr->lah_advise[i];
3850 			rc = ll_ladvise_sanity(inode, k_ladvise);
3852 				GOTO(out_ladvise, rc);
3854 			switch (k_ladvise->lla_advice) {
3855 			case LU_LADVISE_LOCKNOEXPAND:
3856 				rc = ll_lock_noexpand(file,
3857 					       k_ladvise->lla_peradvice_flags);
3858 				GOTO(out_ladvise, rc);
3859 			case LU_LADVISE_LOCKAHEAD:
3861 				rc = ll_file_lock_ahead(file, k_ladvise);
3864 					GOTO(out_ladvise, rc);
3867 					  &u_ladvise->lla_lockahead_result))
3868 					GOTO(out_ladvise, rc = -EFAULT);
3871 				rc = ll_ladvise(inode, file,
3872 						k_ladvise_hdr->lah_flags,
3875 					GOTO(out_ladvise, rc);
3882 		OBD_FREE(k_ladvise_hdr, alloc_size);
3885 	case LL_IOC_FLR_SET_MIRROR: {
3886 		/* mirror I/O must be direct to avoid polluting page cache
3888 		if (!(file->f_flags & O_DIRECT))
3891 		fd->fd_designated_mirror = (__u32)arg;
3894 	case LL_IOC_FSGETXATTR:
3895 		RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3896 	case LL_IOC_FSSETXATTR:
3897 		RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3899 		RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3900 	case LL_IOC_HEAT_GET: {
3901 		struct lu_heat uheat;
3902 		struct lu_heat *heat;
3905 		if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3908 		if (uheat.lh_count > OBD_HEAT_COUNT)
3909 			uheat.lh_count = OBD_HEAT_COUNT;
3911 		size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3912 		OBD_ALLOC(heat, size);
3916 		heat->lh_count = uheat.lh_count;
3917 		ll_heat_get(inode, heat);
3918 		rc = copy_to_user((char __user *)arg, heat, size);
3919 		OBD_FREE(heat, size);
3920 		RETURN(rc ? -EFAULT : 0);
3922 	case LL_IOC_HEAT_SET: {
3925 		if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3928 		rc = ll_heat_set(inode, flags);
3931 	case LL_IOC_PCC_DETACH: {
3932 		struct lu_pcc_detach *detach;
3934 		OBD_ALLOC_PTR(detach);
3938 		if (copy_from_user(detach,
3939 				   (const struct lu_pcc_detach __user *)arg,
3941 			GOTO(out_detach_free, rc = -EFAULT);
3943 		if (!S_ISREG(inode->i_mode))
3944 			GOTO(out_detach_free, rc = -EINVAL);
3946 		if (!inode_owner_or_capable(inode))
3947 			GOTO(out_detach_free, rc = -EPERM);
3949 		rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3951 		OBD_FREE_PTR(detach);
3954 	case LL_IOC_PCC_STATE: {
3955 		struct lu_pcc_state __user *ustate =
3956 			(struct lu_pcc_state __user *)arg;
3957 		struct lu_pcc_state *state;
3959 		OBD_ALLOC_PTR(state);
3963 		if (copy_from_user(state, ustate, sizeof(*state)))
3964 			GOTO(out_state, rc = -EFAULT);
3966 		rc = pcc_ioctl_state(file, inode, state);
3968 			GOTO(out_state, rc);
3970 		if (copy_to_user(ustate, state, sizeof(*state)))
3971 			GOTO(out_state, rc = -EFAULT);
3974 		OBD_FREE_PTR(state);
		/* unknown command: let the OBD layer try to handle it */
3978 		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3979 				     (void __user *)arg));
3983 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat (kernels without generic_file_llseek_size): validate the
 * computed offset against the sign/maxsize rules and commit it to
 * file->f_pos, resetting f_version on change. */
3984 static inline loff_t
3985 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3987 	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3989 	if (offset > maxsize)
3992 	if (offset != file->f_pos) {
3993 		file->f_pos = offset;
3994 		file->f_version = 0;
/*
 * Compat implementation of generic_file_llseek_size() for kernels that
 * lack it: handle SEEK_CUR/SEEK_DATA/SEEK_HOLE against @maxsize and the
 * supplied @eof, delegating the final bounds check to llseek_execute().
 */
4000 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
4001 loff_t maxsize, loff_t eof)
4003 struct inode *inode = file_inode(file);
4011 * Here we special-case the lseek(fd, 0, SEEK_CUR)
4012 * position-querying operation. Avoid rewriting the "same"
4013 * f_pos value back to the file because a concurrent read(),
4014 * write() or lseek() might have altered it
4019 * f_lock protects against read/modify/write race with other
4020 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: offset is relative to the current position */
4024 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4025 inode_unlock(inode);
4029 * In the generic case the entire file is data, so as long as
4030 * offset isn't at the end of the file then the offset is data.
4037 * There is a virtual hole at the end of the file, so as long as
4038 * offset isn't i_size or larger, return i_size.
4046 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA the
 * file size must be current, so glimpse it from the OSTs first, then defer
 * to the (compat) generic llseek-with-size helper.  Elapsed time is
 * recorded in the per-sb LLSEEK stats.
 */
4050 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4052 struct inode *inode = file_inode(file);
4053 loff_t retval, eof = 0;
4054 ktime_t kstart = ktime_get();
/* compute the absolute target position purely for the debug message */
4057 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4058 (origin == SEEK_CUR) ? file->f_pos : 0);
4059 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4060 PFID(ll_inode2fid(inode)), inode, retval, retval,
/* these origins depend on an up-to-date i_size; fetch it via glimpse */
4063 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4064 retval = ll_glimpse_size(inode);
4067 eof = i_size_read(inode);
4070 retval = ll_generic_file_llseek_size(file, offset, origin,
4071 ll_file_maxbytes(inode), eof);
4073 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4074 ktime_us_delta(ktime_get(), kstart));
/*
 * flush handler (called on close(2)): report any asynchronous writeback
 * error recorded for this inode as -EIO, clearing the saved error so it
 * is reported only once.  No data is actually flushed here.
 */
4078 static int ll_flush(struct file *file, fl_owner_t id)
4080 struct inode *inode = file_inode(file);
4081 struct ll_inode_info *lli = ll_i2info(inode);
4082 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* directories use a different flush path; regular-file-only here */
4085 LASSERT(!S_ISDIR(inode->i_mode));
4087 /* catch async errors that were recorded back when async writeback
4088 * failed for pages in this mapping. */
4089 rc = lli->lli_async_rc;
4090 lli->lli_async_rc = 0;
4091 if (lli->lli_clob != NULL) {
/* also collect (and clear) errors stored down in the LOV layer */
4092 err = lov_read_and_clear_async_rc(lli->lli_clob);
4097 /* The application has been told write failure already.
4098 * Do not report failure again. */
4099 if (fd->fd_write_failed)
4101 return rc ? -EIO : 0;
4105 * Called to make sure a portion of file has been written out.
4106 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4108 * Return how many pages have been written.
4110 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4111 enum cl_fsync_mode mode, int ignore_layout)
4115 struct cl_fsync_io *fio;
/* reject any fsync mode we do not understand */
4120 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4121 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4124 env = cl_env_get(&refcheck);
4126 RETURN(PTR_ERR(env))
4128 io = vvp_env_thread_io(env);
4129 io->ci_obj = ll_i2info(inode)->lli_clob;
/* caller may ask to sync regardless of layout-change state */
4130 io->ci_ignore_layout = ignore_layout;
4132 /* initialize parameters for sync */
4133 fio = &io->u.ci_fsync;
4134 fio->fi_start = start;
4136 fio->fi_fid = ll_inode2fid(inode);
4137 fio->fi_mode = mode;
4138 fio->fi_nr_written = 0;
/* run the CIT_FSYNC io through the cl_io state machine */
4140 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4141 result = cl_io_loop(env, io);
4143 result = io->ci_result;
/* on success the result is the page count accumulated by the io */
4145 result = fio->fi_nr_written;
4146 cl_io_fini(env, io);
4147 cl_env_put(env, &refcheck);
4153 * When dentry is provided (the 'else' case), file_dentry() may be
4154 * null and dentry must be used directly rather than pulled from
4155 * file_dentry() as is done otherwise.
/*
 * fsync handler: wait for dirty pages in [start, end], surface recorded
 * async writeback errors, fsync metadata on the MDT, then sync file data
 * (via PCC if the file is cached there, else cl_sync_file_range()).
 */
4158 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4160 struct dentry *dentry = file_dentry(file);
4161 struct inode *inode = dentry->d_inode;
4162 struct ll_inode_info *lli = ll_i2info(inode);
4163 struct ptlrpc_request *req;
4164 ktime_t kstart = ktime_get();
4169 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4171 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4173 /* fsync's caller has already called _fdata{sync,write}, we want
4174 * that IO to finish before calling the osc and mdc sync methods */
4175 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4178 /* catch async errors that were recorded back when async writeback
4179 * failed for pages in this mapping. */
4180 if (!S_ISDIR(inode->i_mode)) {
4181 err = lli->lli_async_rc;
4182 lli->lli_async_rc = 0;
4185 if (lli->lli_clob != NULL) {
4186 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync the inode's metadata on the MDT */
4192 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4196 ptlrpc_req_finished(req);
4198 if (S_ISREG(inode->i_mode)) {
4199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4202 /* Sync metadata on MDT first, and then sync the cached data
/* PCC-attached files are synced through the PCC copy; "cached"
 * reports whether PCC handled it */
4205 err = pcc_fsync(file, start, end, datasync, &cached);
4207 err = cl_sync_file_range(inode, start, end,
4209 if (rc == 0 && err < 0)
/* remember the outcome so ll_flush() does not double-report */
4212 fd->fd_write_failed = true;
4214 fd->fd_write_failed = false;
4217 inode_unlock(inode);
4220 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4221 ktime_us_delta(ktime_get(), kstart));
/*
 * flock/fcntl lock handler: translate a VFS file_lock (FL_FLOCK or
 * FL_POSIX) into an LDLM_FLOCK enqueue on the MDT, then mirror the
 * result into the local lock tables so the VFS sees a consistent state.
 * If the local step fails the server lock is cancelled (re-enqueued as
 * LCK_NL) to keep client and server in sync.
 */
4226 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4228 struct inode *inode = file_inode(file);
4229 struct ll_sb_info *sbi = ll_i2sbi(inode);
4230 struct ldlm_enqueue_info einfo = {
4231 .ei_type = LDLM_FLOCK,
4232 .ei_cb_cp = ldlm_flock_completion_ast,
4233 .ei_cbdata = file_lock,
4235 struct md_op_data *op_data;
4236 struct lustre_handle lockh = { 0 };
4237 union ldlm_policy_data flock = { { 0 } };
/* keep the caller's type; einfo.ei_mode is written into fl_type below */
4238 int fl_type = file_lock->fl_type;
4239 ktime_t kstart = ktime_get();
4245 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4246 PFID(ll_inode2fid(inode)), file_lock);
4248 if (file_lock->fl_flags & FL_FLOCK) {
4249 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4250 /* flocks are whole-file locks */
4251 flock.l_flock.end = OFFSET_MAX;
4252 /* For flocks owner is determined by the local file desctiptor*/
4253 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4254 } else if (file_lock->fl_flags & FL_POSIX) {
4255 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4256 flock.l_flock.start = file_lock->fl_start;
4257 flock.l_flock.end = file_lock->fl_end;
4261 flock.l_flock.pid = file_lock->fl_pid;
4263 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4264 /* Somewhat ugly workaround for svc lockd.
4265 * lockd installs custom fl_lmops->lm_compare_owner that checks
4266 * for the fl_owner to be the same (which it always is on local node
4267 * I guess between lockd processes) and then compares pid.
4268 * As such we assign pid to the owner field to make it all work,
4269 * conflict with normal locks is unlikely since pid space and
4270 * pointer space for current->files are not intersecting */
4271 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4272 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock types onto LDLM modes: read -> PR */
4277 einfo.ei_mode = LCK_PR;
4280 /* An unlock request may or may not have any relation to
4281 * existing locks so we may not be able to pass a lock handle
4282 * via a normal ldlm_lock_cancel() request. The request may even
4283 * unlock a byte range in the middle of an existing lock. In
4284 * order to process an unlock request we need all of the same
4285 * information that is given with a normal read or write record
4286 * lock request. To avoid creating another ldlm unlock (cancel)
4287 * message we'll treat a LCK_NL flock request as an unlock. */
4288 einfo.ei_mode = LCK_NL;
/* write locks -> PW */
4291 einfo.ei_mode = LCK_PW;
4294 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking request: fail rather than wait on conflicts */
4309 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, acquire nothing */
4315 flags = LDLM_FL_TEST_LOCK;
4318 CERROR("unknown fcntl lock command: %d\n", cmd);
4322 /* Save the old mode so that if the mode in the lock changes we
4323 * can decrement the appropriate reader or writer refcount. */
4324 file_lock->fl_type = einfo.ei_mode;
4326 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4327 LUSTRE_OPC_ANY, NULL);
4328 if (IS_ERR(op_data))
4329 RETURN(PTR_ERR(op_data));
4331 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4332 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4333 flock.l_flock.pid, flags, einfo.ei_mode,
4334 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDT; may block for F_SETLKW */
4336 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4339 /* Restore the file lock type if not TEST lock. */
4340 if (!(flags & LDLM_FL_TEST_LOCK))
4341 file_lock->fl_type = fl_type;
4343 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
/* mirror server-side result into the kernel's local lock table */
4344 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4345 !(flags & LDLM_FL_TEST_LOCK))
4346 rc2 = locks_lock_file_wait(file, file_lock);
4348 if ((file_lock->fl_flags & FL_FLOCK) &&
4349 (rc == 0 || file_lock->fl_type == F_UNLCK))
4350 rc2 = flock_lock_file_wait(file, file_lock);
4351 if ((file_lock->fl_flags & FL_POSIX) &&
4352 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4353 !(flags & LDLM_FL_TEST_LOCK))
4354 rc2 = posix_lock_file_wait(file, file_lock);
4355 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: drop the server lock to stay consistent */
4357 if (rc2 && file_lock->fl_type != F_UNLCK) {
4358 einfo.ei_mode = LCK_NL;
4359 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4364 ll_finish_md_op_data(op_data);
4367 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4368 ktime_us_delta(ktime_get(), kstart));
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name.
 * On success *fid is filled in; if @inode is non-NULL the inode is also
 * instantiated from the reply.  Caller must iput() the resulting inode.
 */
4372 int ll_get_fid_by_name(struct inode *parent, const char *name,
4373 int namelen, struct lu_fid *fid,
4374 struct inode **inode)
4376 struct md_op_data *op_data = NULL;
4377 struct mdt_body *body;
4378 struct ptlrpc_request *req;
4382 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4383 LUSTRE_OPC_ANY, NULL);
4384 if (IS_ERR(op_data))
4385 RETURN(PTR_ERR(op_data));
/* we only need the FID and type bits back from the MDS */
4387 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4388 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4389 ll_finish_md_op_data(op_data);
4393 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4395 GOTO(out_req, rc = -EFAULT);
4397 *fid = body->mbo_fid1;
/* optionally build an inode from the same reply */
4400 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4402 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to another MDT, as described by
 * @lum.  For regular files a write lease is taken first so the migration
 * (implemented as a server-side rename with CLI_MIGRATE) is atomic with
 * respect to concurrent opens; -EAGAIN from the server means the lease
 * was cancelled and the operation is retried.
 */
4406 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4409 struct dentry *dchild = NULL;
4410 struct inode *child_inode = NULL;
4411 struct md_op_data *op_data;
4412 struct ptlrpc_request *request = NULL;
4413 struct obd_client_handle *och = NULL;
4415 struct mdt_body *body;
4416 __u64 data_version = 0;
4417 size_t namelen = strlen(name);
4418 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4422 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4423 PFID(ll_inode2fid(parent)), name,
4424 lum->lum_stripe_offset, lum->lum_stripe_count);
/* normalize byte order of the user-supplied LMV descriptor */
4426 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4427 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4428 lustre_swab_lmv_user_md(lum);
4430 /* Get child FID first */
4431 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* prefer the dcache; fall back to an MDS lookup below */
4434 dchild = d_lookup(file_dentry(file), &qstr);
4436 if (dchild->d_inode)
4437 child_inode = igrab(dchild->d_inode);
4442 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* old MDTs cannot migrate striped directories; refuse up front */
4451 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4452 OBD_CONNECT2_DIR_MIGRATE)) {
4453 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4454 ll_dir_striped(child_inode)) {
4455 CERROR("%s: MDT doesn't support stripe directory "
4456 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4457 GOTO(out_iput, rc = -EOPNOTSUPP);
4462 * lfs migrate command needs to be blocked on the client
4463 * by checking the migrate FID against the FID of the
/* never migrate the filesystem root */
4466 if (child_inode == parent->i_sb->s_root->d_inode)
4467 GOTO(out_iput, rc = -EINVAL);
4469 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4470 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4471 if (IS_ERR(op_data))
4472 GOTO(out_iput, rc = PTR_ERR(op_data));
4474 inode_lock(child_inode);
4475 op_data->op_fid3 = *ll_inode2fid(child_inode);
4476 if (!fid_is_sane(&op_data->op_fid3)) {
4477 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4478 ll_i2sbi(parent)->ll_fsname, name,
4479 PFID(&op_data->op_fid3));
4480 GOTO(out_unlock, rc = -EINVAL);
4483 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4484 op_data->op_data = lum;
4485 op_data->op_data_size = lumlen;
4488 if (S_ISREG(child_inode->i_mode)) {
/* take a write lease so migration is atomic vs. concurrent opens */
4489 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4493 GOTO(out_unlock, rc);
/* record data version so the server can detect concurrent writes */
4496 rc = ll_data_version(child_inode, &data_version,
4499 GOTO(out_close, rc);
4501 op_data->op_open_handle = och->och_open_handle;
4502 op_data->op_data_version = data_version;
4503 op_data->op_lease_handle = och->och_lease_handle;
4504 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* the open must not be replayed once migration is in flight */
4506 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4507 och->och_mod->mod_open_req->rq_replay = 0;
4508 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* migration is sent as a rename onto the same name */
4511 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4512 name, namelen, &request);
4514 LASSERT(request != NULL);
4515 ll_update_times(request, parent);
4518 if (rc == 0 || rc == -EAGAIN) {
4519 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4520 LASSERT(body != NULL);
4522 /* If the server does release layout lock, then we cleanup
4523 * the client och here, otherwise release it in out_close: */
4524 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4525 obd_mod_put(och->och_mod);
4526 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4528 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4534 if (request != NULL) {
4535 ptlrpc_req_finished(request);
4539 /* Try again if the lease has cancelled. */
4540 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4545 ll_lease_close(och, child_inode, NULL);
4547 clear_nlink(child_inode);
4549 inode_unlock(child_inode);
4550 ll_finish_md_op_data(op_data);
/*
 * Stub flock/lock handler used when the filesystem is mounted without
 * flock support: warn once per file (rate-limited) and reject the call.
 */
4557 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4559 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4563 * In order to avoid flood of warning messages, only print one message
4564 * for one file. And the entire message rate on the client is limited
4565 * by CDEBUG_LIMIT too.
4567 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4568 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4569 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4570 "flock disabled, mount with '-o [local]flock' to enable\r\n");
4576 * test if some locks matching bits and l_req_mode are acquired
4577 * - bits can be in different locks
4578 * - if found clear the common lock bits in *bits
4579 * - the bits not found, are kept in *bits
4581 * \param bits [IN] searched lock bits [IN]
4582 * \param l_req_mode [IN] searched lock mode
4583 * \retval boolean, true iff all bits are found
4585 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4587 struct lustre_handle lockh;
4588 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": search all four compat modes */
4589 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4590 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4599 fid = &ll_i2info(inode)->lli_fid;
4600 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4601 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
4603 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe one inodebit at a time; stop early once all bits are found */
4604 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4605 policy.l_inodebits.bits = *bits & (1 << i);
4606 if (policy.l_inodebits.bits == 0)
4609 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4610 &policy, mode, &lockh)) {
4611 struct ldlm_lock *lock;
4613 lock = ldlm_handle2lock(&lockh);
4616 ~(lock->l_policy_data.l_inodebits.bits);
4617 LDLM_LOCK_PUT(lock);
4619 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) an existing MD lock on @inode
 * covering @bits with the given @mode.  On success the handle is stored
 * in @lockh; the returned mode tells the caller what was matched.
 */
4626 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4627 struct lustre_handle *lockh, __u64 flags,
4628 enum ldlm_mode mode)
4630 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4635 fid = &ll_i2info(inode)->lli_fid;
4636 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4638 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4639 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * -ENOENT on an unlinked inode into success (after updating nlink),
 * and log any other failure.
 */
4644 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4646 /* Already unlinked. Just update nlink and return success */
4647 if (rc == -ENOENT) {
4649 /* If it is striped directory, and there is bad stripe
4650 * Let's revalidate the dentry again, instead of returning
4652 if (ll_dir_striped(inode))
4655 /* This path cannot be hit for regular files unless in
4656 * case of obscure races, so no need to to validate
4658 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4660 } else if (rc != 0) {
/* expected access errors are logged at D_INFO, the rest at D_ERROR */
4661 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4662 "%s: revalidate FID "DFID" error: rc = %d\n",
4663 ll_i2sbi(inode)->ll_fsname,
4664 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDS with an intent lock of the
 * given operation (@op, e.g. IT_GETATTR).  Refreshes cached attributes
 * and invalidates the dentry if the file was unlinked server-side.
 */
4670 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4672 struct inode *inode = dentry->d_inode;
4673 struct obd_export *exp = ll_i2mdexp(inode);
4674 struct lookup_intent oit = {
4677 struct ptlrpc_request *req = NULL;
4678 struct md_op_data *op_data;
4682 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4683 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4685 /* Call getattr by fid, so do not provide name at all. */
4686 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4687 LUSTRE_OPC_ANY, NULL);
4688 if (IS_ERR(op_data))
4689 RETURN(PTR_ERR(op_data));
4691 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4692 ll_finish_md_op_data(op_data);
/* map -ENOENT-on-unlinked and friends onto the right return code */
4694 rc = ll_inode_revalidate_fini(inode, rc);
4698 rc = ll_revalidate_it_finish(req, &oit, dentry);
4700 ll_intent_release(&oit);
4704 /* Unlinked? Unhash dentry, so it is not picked up later by
4705 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4706 * here to preserve get_cwd functionality on 2.6.
4708 if (!dentry->d_inode->i_nlink) {
4709 spin_lock(&inode->i_lock);
4710 d_lustre_invalidate(dentry, 0);
4711 spin_unlock(&inode->i_lock);
/* transfers any locks gathered by the intent onto the dentry */
4714 ll_lookup_finish_locks(&oit, dentry);
4716 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe metadata attributes
 * (nlink, blocks, size, a/m/ctime) from all MDT stripes into the inode.
 * No-op for non-striped directories.
 */
4721 static int ll_merge_md_attr(struct inode *inode)
4723 struct ll_inode_info *lli = ll_i2info(inode);
4724 struct cl_attr attr = { 0 };
4727 LASSERT(lli->lli_lsm_md != NULL);
4729 if (!lmv_dir_striped(lli->lli_lsm_md))
/* lli_lsm_sem guards the striping descriptor while LMV reads it */
4732 down_read(&lli->lli_lsm_sem);
4733 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4734 &attr, ll_md_blocking_ast);
4735 up_read(&lli->lli_lsm_sem);
/* publish the merged attributes into the VFS inode and lli cache */
4739 set_nlink(inode, attr.cat_nlink);
4740 inode->i_blocks = attr.cat_blocks;
4741 i_size_write(inode, attr.cat_size);
4743 ll_i2info(inode)->lli_atime = attr.cat_atime;
4744 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4745 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Core getattr implementation: revalidate the inode with the MDS, refresh
 * the file size (PCC or glimpse for regular files, stripe merge for
 * striped directories), then fill in *stat from the inode.
 */
4750 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4752 struct inode *inode = de->d_inode;
4753 struct ll_sb_info *sbi = ll_i2sbi(inode);
4754 struct ll_inode_info *lli = ll_i2info(inode);
4755 ktime_t kstart = ktime_get();
4758 rc = ll_inode_revalidate(de, IT_GETATTR);
4762 if (S_ISREG(inode->i_mode)) {
/* PCC-cached files get their attributes from the PCC copy */
4765 rc = pcc_inode_getattr(inode, &cached);
4766 if (cached && rc < 0)
4769 /* In case of restore, the MDT has the right size and has
4770 * already send it back without granting the layout lock,
4771 * inode is up-to-date so glimpse is useless.
4772 * Also to glimpse we need the layout, in case of a running
4773 * restore the MDT holds the layout lock so the glimpse will
4774 * block up to the end of restore (getattr will block)
4776 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4777 rc = ll_glimpse_size(inode);
4782 /* If object isn't regular a file then don't validate size. */
4783 if (ll_dir_striped(inode)) {
4784 rc = ll_merge_md_attr(inode);
/* copy the cached Lustre timestamps into the VFS inode */
4789 inode->i_atime.tv_sec = lli->lli_atime;
4790 inode->i_mtime.tv_sec = lli->lli_mtime;
4791 inode->i_ctime.tv_sec = lli->lli_ctime;
/* fault-injection point for getattr latency testing */
4794 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs inode numbers squeezed into 32 bits */
4796 if (ll_need_32bit_api(sbi)) {
4797 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4798 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4799 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4801 stat->ino = inode->i_ino;
4802 stat->dev = inode->i_sb->s_dev;
4803 stat->rdev = inode->i_rdev;
4806 stat->mode = inode->i_mode;
4807 stat->uid = inode->i_uid;
4808 stat->gid = inode->i_gid;
4809 stat->atime = inode->i_atime;
4810 stat->mtime = inode->i_mtime;
4811 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin has set one */
4812 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4814 stat->nlink = inode->i_nlink;
4815 stat->size = i_size_read(inode);
4816 stat->blocks = inode->i_blocks;
4818 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4819 ktime_us_delta(ktime_get(), kstart));
/*
 * VFS ->getattr entry point; the signature changed across kernel
 * versions, so both variants funnel into ll_getattr_dentry().
 */
4824 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4825 int ll_getattr(const struct path *path, struct kstat *stat,
4826 u32 request_mask, unsigned int flags)
4828 struct dentry *de = path->dentry;
4830 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4833 return ll_getattr_dentry(de, stat);
/*
 * FIEMAP handler: marshal the kernel's fiemap_extent_info into a Lustre
 * struct fiemap (extents copied from/to the user buffer referenced by
 * fieinfo->fi_extents_start), run ll_do_fiemap(), and copy results back.
 */
4836 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4837 __u64 start, __u64 len)
4841 struct fiemap *fiemap;
4842 unsigned int extent_count = fieinfo->fi_extents_max;
/* allocation covers the header plus the caller's extent array */
4844 num_bytes = sizeof(*fiemap) + (extent_count *
4845 sizeof(struct fiemap_extent));
4846 OBD_ALLOC_LARGE(fiemap, num_bytes);
4851 fiemap->fm_flags = fieinfo->fi_flags;
4852 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4853 fiemap->fm_start = start;
4854 fiemap->fm_length = len;
/* only the first extent is read in: used to resume a previous call
 * (fe_device/continuation); NOTE(review) — confirm against ll_do_fiemap */
4855 if (extent_count > 0 &&
4856 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4857 sizeof(struct fiemap_extent)) != 0)
4858 GOTO(out, rc = -EFAULT);
4860 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4862 fieinfo->fi_flags = fiemap->fm_flags;
4863 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4864 if (extent_count > 0 &&
4865 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4866 fiemap->fm_mapped_extents *
4867 sizeof(struct fiemap_extent)) != 0)
4868 GOTO(out, rc = -EFAULT);
4870 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl: return a referenced copy of the cached POSIX ACL stored in
 * ll_inode_info, under lli_lock.  May return NULL if none is cached.
 */
4874 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4876 struct ll_inode_info *lli = ll_i2info(inode);
4877 struct posix_acl *acl = NULL;
4880 spin_lock(&lli->lli_lock);
4881 /* VFS' acl_permission_check->check_acl will release the refcount */
4882 acl = posix_acl_dup(lli->lli_posix_acl);
4883 spin_unlock(&lli->lli_lock);
4888 #ifdef HAVE_IOP_SET_ACL
4889 #ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/*
 * ->set_acl: serialize @acl to its xattr representation and store it on
 * the MDS via md_setxattr(); a NULL @acl removes the xattr.  The local
 * ACL cache is updated/invalidated to match.
 */
4890 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4892 struct ll_sb_info *sbi = ll_i2sbi(inode);
4893 struct ptlrpc_request *req = NULL;
4894 const char *name = NULL;
4896 size_t value_size = 0;
4901 case ACL_TYPE_ACCESS:
4902 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* an access ACL can fold its group entry into i_mode */
4904 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4907 case ACL_TYPE_DEFAULT:
4908 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs only make sense on directories */
4909 if (!S_ISDIR(inode->i_mode))
4910 rc = acl ? -EACCES : 0;
4921 value_size = posix_acl_xattr_size(acl->a_count);
4922 value = kmalloc(value_size, GFP_NOFS);
4924 GOTO(out, rc = -ENOMEM);
4926 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4928 GOTO(out_value, rc);
/* NULL value means remove the xattr rather than set it */
4931 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4932 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4933 name, value, value_size, 0, 0, &req);
4935 ptlrpc_req_finished(req);
4940 forget_cached_acl(inode, type);
4942 set_cached_acl(inode, type, acl);
4945 #endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
4946 #endif /* HAVE_IOP_SET_ACL */
/*
 * ->permission: revalidate the root inode when needed, apply root-squash
 * (temporarily overriding fsuid/fsgid and dropping FS capabilities via
 * override_creds), then delegate to generic_permission().
 */
4948 int ll_inode_permission(struct inode *inode, int mask)
4951 struct ll_sb_info *sbi;
4952 struct root_squash_info *squash;
4953 struct cred *cred = NULL;
4954 const struct cred *old_cred = NULL;
4956 bool squash_id = false;
4957 ktime_t kstart = ktime_get();
/* RCU-walk mode cannot block on RPCs; bail out to ref-walk */
4960 if (mask & MAY_NOT_BLOCK)
4963 /* as root inode are NOT getting validated in lookup operation,
4964 * need to do it before permission check. */
4966 if (inode == inode->i_sb->s_root->d_inode) {
4967 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4972 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4973 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4975 /* squash fsuid/fsgid if needed */
4976 sbi = ll_i2sbi(inode);
4977 squash = &sbi->ll_squash;
4978 if (unlikely(squash->rsi_uid != 0 &&
4979 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4980 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4984 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4985 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4986 squash->rsi_uid, squash->rsi_gid);
4988 /* update current process's credentials
4989 * and FS capability */
4990 cred = prepare_creds();
4994 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4995 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* strip all filesystem-related capabilities from the squashed creds */
4996 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4997 if ((1 << cap) & CFS_CAP_FS_MASK)
4998 cap_lower(cred->cap_effective, cap);
5000 old_cred = override_creds(cred);
5003 rc = generic_permission(inode, mask);
5004 /* restore current process's credentials and FS capability */
5006 revert_creds(old_cred);
5011 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5012 ktime_us_delta(ktime_get(), kstart));
5017 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the VFS falls back to
 * purely local (single-client) lock semantics. */
5018 struct file_operations ll_file_operations = {
5019 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5020 # ifdef HAVE_SYNC_READ_WRITE
5021 .read = new_sync_read,
5022 .write = new_sync_write,
5024 .read_iter = ll_file_read_iter,
5025 .write_iter = ll_file_write_iter,
5026 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5027 .read = ll_file_read,
5028 .aio_read = ll_file_aio_read,
5029 .write = ll_file_write,
5030 .aio_write = ll_file_aio_write,
5031 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5032 .unlocked_ioctl = ll_file_ioctl,
5033 .open = ll_file_open,
5034 .release = ll_file_release,
5035 .mmap = ll_file_mmap,
5036 .llseek = ll_file_seek,
5037 .splice_read = ll_file_splice_read,
/* file_operations for '-o flock' mounts: identical to the default set
 * except that .flock/.lock route through ll_file_flock() for
 * cluster-coherent locking via the MDT. */
5042 struct file_operations ll_file_operations_flock = {
5043 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5044 # ifdef HAVE_SYNC_READ_WRITE
5045 .read = new_sync_read,
5046 .write = new_sync_write,
5047 # endif /* HAVE_SYNC_READ_WRITE */
5048 .read_iter = ll_file_read_iter,
5049 .write_iter = ll_file_write_iter,
5050 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5051 .read = ll_file_read,
5052 .aio_read = ll_file_aio_read,
5053 .write = ll_file_write,
5054 .aio_write = ll_file_aio_write,
5055 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5056 .unlocked_ioctl = ll_file_ioctl,
5057 .open = ll_file_open,
5058 .release = ll_file_release,
5059 .mmap = ll_file_mmap,
5060 .llseek = ll_file_seek,
5061 .splice_read = ll_file_splice_read,
5064 .flock = ll_file_flock,
5065 .lock = ll_file_flock
5068 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for '-o noflock' mounts: .flock/.lock are wired to
 * ll_file_noflock(), which warns once per file and rejects the call. */
5069 struct file_operations ll_file_operations_noflock = {
5070 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5071 # ifdef HAVE_SYNC_READ_WRITE
5072 .read = new_sync_read,
5073 .write = new_sync_write,
5074 # endif /* HAVE_SYNC_READ_WRITE */
5075 .read_iter = ll_file_read_iter,
5076 .write_iter = ll_file_write_iter,
5077 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5078 .read = ll_file_read,
5079 .aio_read = ll_file_aio_read,
5080 .write = ll_file_write,
5081 .aio_write = ll_file_aio_write,
5082 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5083 .unlocked_ioctl = ll_file_ioctl,
5084 .open = ll_file_open,
5085 .release = ll_file_release,
5086 .mmap = ll_file_mmap,
5087 .llseek = ll_file_seek,
5088 .splice_read = ll_file_splice_read,
5091 .flock = ll_file_noflock,
5092 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * conditional on the kernel's inode_operations layout. */
5095 struct inode_operations ll_file_inode_operations = {
5096 .setattr = ll_setattr,
5097 .getattr = ll_getattr,
5098 .permission = ll_inode_permission,
5099 #ifdef HAVE_IOP_XATTR
5100 .setxattr = ll_setxattr,
5101 .getxattr = ll_getxattr,
5102 .removexattr = ll_removexattr,
5104 .listxattr = ll_listxattr,
5105 .fiemap = ll_fiemap,
5106 #ifdef HAVE_IOP_GET_ACL
5107 .get_acl = ll_get_acl,
5109 #ifdef HAVE_IOP_SET_ACL
5110 .set_acl = ll_set_acl,
/*
 * Push a layout configuration (@conf) down to the cl_object stack for
 * @inode.  For OBJECT_CONF_SET this also makes the layout lock matchable
 * and records the new layout generation in the inode.
 */
5114 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5116 struct ll_inode_info *lli = ll_i2info(inode);
5117 struct cl_object *obj = lli->lli_clob;
5126 env = cl_env_get(&refcheck);
5128 RETURN(PTR_ERR(env));
5130 rc = cl_conf_set(env, lli->lli_clob, conf);
5134 if (conf->coc_opc == OBJECT_CONF_SET) {
5135 struct ldlm_lock *lock = conf->coc_lock;
5136 struct cl_layout cl = {
5140 LASSERT(lock != NULL);
5141 LASSERT(ldlm_has_layout(lock));
5143 /* it can only be allowed to match after layout is
5144 * applied to inode otherwise false layout would be
5145 * seen. Applying layout shoud happen before dropping
5146 * the intent lock. */
5147 ldlm_lock_allow_match(lock);
/* read back the generation so the inode tracks layout changes */
5149 rc = cl_object_layout_get(env, obj, &cl);
5154 DFID": layout version change: %u -> %u\n",
5155 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5157 ll_layout_version_set(lli, cl.cl_layout_gen);
5161 cl_env_put(env, &refcheck);
5166 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5167 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5170 struct ll_sb_info *sbi = ll_i2sbi(inode);
5171 struct ptlrpc_request *req;
5178 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5179 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5180 lock->l_lvb_data, lock->l_lvb_len);
/* already populated (e.g. from the DLM reply LVB): nothing to do */
5182 if (lock->l_lvb_data != NULL)
5185 /* if layout lock was granted right away, the layout is returned
5186 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5187 * blocked and then granted via completion ast, we have to fetch
5188 * layout here. Please note that we can't use the LVB buffer in
5189 * completion AST because it doesn't have a large enough buffer */
5190 rc = ll_get_default_mdsize(sbi, &lmmsize);
5194 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5195 XATTR_NAME_LOV, lmmsize, &req);
5198 GOTO(out, rc = 0); /* empty layout */
5205 if (lmmsize == 0) /* empty layout */
5208 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5210 GOTO(out, rc = -EFAULT);
5212 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5213 if (lvbdata == NULL)
5214 GOTO(out, rc = -ENOMEM);
/* install the fetched layout as the lock's LVB; another thread may
 * have raced us, in which case our copy is freed below */
5216 memcpy(lvbdata, lmm, lmmsize);
5217 lock_res_and_lock(lock);
5218 if (unlikely(lock->l_lvb_data == NULL)) {
5219 lock->l_lvb_type = LVB_T_LAYOUT;
5220 lock->l_lvb_data = lvbdata;
5221 lock->l_lvb_len = lmmsize;
5224 unlock_res_and_lock(lock);
5227 OBD_FREE_LARGE(lvbdata, lmmsize);
5232 ptlrpc_req_finished(req);
5237 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh/@mode), fetch the layout into the
 * lock's LVB if needed and configure the inode's cl_object with it.
 * The lock reference is dropped before returning; if the new layout
 * cannot be applied while IO is in flight (-EBUSY), wait for the IO
 * via an OBJECT_CONF_WAIT configuration.
 */
5240 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5241 struct inode *inode)
5243 struct ll_inode_info *lli = ll_i2info(inode);
5244 struct ll_sb_info *sbi = ll_i2sbi(inode);
5245 struct ldlm_lock *lock;
5246 struct cl_object_conf conf;
5249 bool wait_layout = false;
5252 LASSERT(lustre_handle_is_used(lockh));
5254 lock = ldlm_handle2lock(lockh);
5255 LASSERT(lock != NULL);
5256 LASSERT(ldlm_has_layout(lock));
5258 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5259 PFID(&lli->lli_fid), inode);
5261 /* in case this is a caching lock and reinstate with new inode */
5262 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5264 lock_res_and_lock(lock);
5265 lvb_ready = ldlm_is_lvb_ready(lock);
5266 unlock_res_and_lock(lock);
5268 /* checking lvb_ready is racy but this is okay. The worst case is
5269 * that multi processes may configure the file on the same time. */
5273 rc = ll_layout_fetch(inode, lock);
5277 /* for layout lock, lmm is stored in lock's lvb.
5278 * lvb_data is immutable if the lock is held so it's safe to access it
5281 * set layout to file. Unlikely this will fail as old layout was
5282 * surely eliminated */
5283 memset(&conf, 0, sizeof conf);
5284 conf.coc_opc = OBJECT_CONF_SET;
5285 conf.coc_inode = inode;
5286 conf.coc_lock = lock;
5287 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5288 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5289 rc = ll_layout_conf(inode, &conf);
5291 /* refresh layout failed, need to wait */
5292 wait_layout = rc == -EBUSY;
/* done with the lock: drop our reference and the mode ref */
5295 LDLM_LOCK_PUT(lock);
5296 ldlm_lock_decref(lockh, mode);
5298 /* wait for IO to complete if it's still being used. */
5300 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5301 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5303 memset(&conf, 0, sizeof conf);
5304 conf.coc_opc = OBJECT_CONF_WAIT;
5305 conf.coc_inode = inode;
5306 rc = ll_layout_conf(inode, &conf);
5310 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5311 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5317 * Issue layout intent RPC to MDS.
5318 * \param inode [in] file inode
5319 * \param intent [in] layout intent
5321 * \retval 0 on success
5322 * \retval < 0 error code
5324 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
/* NOTE(review): this view elides some lines (ENTRY/RETURN, braces); the
 * comments below cover only the visible statements. */
5326 struct ll_inode_info *lli = ll_i2info(inode);
5327 struct ll_sb_info *sbi = ll_i2sbi(inode);
5328 struct md_op_data *op_data;
5329 struct lookup_intent it;
5330 struct ptlrpc_request *req;
/* build the MD operation; source and target inode are the same file */
5334 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5335 0, 0, LUSTRE_OPC_ANY, NULL);
5336 if (IS_ERR(op_data))
5337 RETURN(PTR_ERR(op_data));
/* ship the layout intent itself as opaque op data in the RPC */
5339 op_data->op_data = intent;
5340 op_data->op_data_size = sizeof(*intent);
5342 memset(&it, 0, sizeof(it));
5343 it.it_op = IT_LAYOUT;
/* write and truncate intents need a write-mode layout lock */
5344 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5345 intent->li_opc == LAYOUT_INTENT_TRUNC)
5346 it.it_flags = FMODE_WRITE;
5348 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5349 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
/* enqueue the layout lock on the MDS via an intent lock request */
5351 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5352 &ll_md_blocking_ast, 0);
/* release the reply request; the intent keeps what it needs */
5353 if (it.it_request != NULL)
5354 ptlrpc_req_finished(it.it_request);
5355 it.it_request = NULL;
5357 ll_finish_md_op_data(op_data);
5359 /* set lock data in case this is a new lock */
5361 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* drop the reference the intent holds on the granted lock */
5363 ll_intent_drop_lock(&it);
5369 * This function checks if there exists a LAYOUT lock on the client side,
5370 * or enqueues it if it doesn't have one in cache.
5372 * This function will not hold layout lock so it may be revoked any time after
5373 * this function returns. Any operation that depends on the layout should be redone
5376 * This function should be called before lov_io_init() to get an uptodate
5377 * layout version, the caller should save the version number and after IO
5378 * is finished, this function should be called again to verify that layout
5379 * is not changed during IO time.
5381 int ll_layout_refresh(struct inode *inode, __u32 *gen)
/* NOTE(review): interior lines (retry loop, RETURN paths, closing braces)
 * are elided in this view; comments cover only visible statements. */
5383 struct ll_inode_info *lli = ll_i2info(inode);
5384 struct ll_sb_info *sbi = ll_i2sbi(inode);
5385 struct lustre_handle lockh;
/* an ACCESS intent is enough — we only need an up-to-date layout */
5386 struct layout_intent intent = {
5387 .li_opc = LAYOUT_INTENT_ACCESS,
5389 enum ldlm_mode mode;
/* fast path: if layout locks are disabled, or a valid generation is
 * already cached, there is nothing to refresh (early-return branch is
 * elided from this view — TODO confirm) */
5393 *gen = ll_layout_version_get(lli);
5394 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* layout locks only make sense on sane FIDs of regular files */
5398 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5399 LASSERT(S_ISREG(inode->i_mode));
5401 /* take layout lock mutex to enqueue layout lock exclusively. */
5402 mutex_lock(&lli->lli_layout_mutex);
5405 /* the layout lock is usually cached locally, so try to match an
5406 * existing lock first and only enqueue a new one on a miss. */
5407 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5408 LCK_CR | LCK_CW | LCK_PR |
5410 if (mode != 0) { /* hit cached lock */
5411 rc = ll_layout_lock_set(&lockh, mode, inode);
/* cache miss: ask the MDS for the layout via an intent RPC */
5417 rc = ll_layout_intent(inode, &intent);
/* publish the (possibly updated) layout generation to the caller */
5423 *gen = ll_layout_version_get(lli);
5424 mutex_unlock(&lli->lli_layout_mutex);
5430 * Issue layout intent RPC indicating where in a file an IO is about to write.
5432 * \param[in] inode file inode.
5433 * \param[in] ext write range with start offset of file in bytes where
5434 * an IO is about to write, and exclusive end offset in
5437 * \retval 0 on success
5438 * \retval < 0 error code
5440 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5441 struct lu_extent *ext)
/* Build a layout intent describing the byte range about to be written
 * and forward it to ll_layout_intent().
 * NOTE(review): the initializer line setting .li_opc = opc appears to be
 * elided from this view — TODO confirm against the full source. */
5443 struct layout_intent intent = {
5445 .li_extent.e_start = ext->e_start,
5446 .li_extent.e_end = ext->e_end,
5451 rc = ll_layout_intent(inode, &intent);
5457 * This function sends a restore request to the MDT
5459 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5461 struct hsm_user_request *hur;
5465 len = sizeof(struct hsm_user_request) +
5466 sizeof(struct hsm_user_item);
5467 OBD_ALLOC(hur, len);
5471 hur->hur_request.hr_action = HUA_RESTORE;
5472 hur->hur_request.hr_archive_id = 0;
5473 hur->hur_request.hr_flags = 0;
5474 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5475 sizeof(hur->hur_user_item[0].hui_fid));
5476 hur->hur_user_item[0].hui_extent.offset = offset;
5477 hur->hur_user_item[0].hui_extent.length = length;
5478 hur->hur_request.hr_itemcount = 1;
5479 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,