4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open-file ll_file_data from its slab cache.
 * GFP_NOFS so the allocation cannot recurse into the filesystem.
 * NOTE(review): the NULL-check and return lines are elided in this
 * excerpt — confirm against the full source. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start with a clean write-failure state for this open */
70 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's current mode/times/size/blocks/flags plus the
 * open handle into @op_data so the MDT sees the final state at close. */
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* mark every packed attribute valid so the MDT applies them all */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
102 op_data->op_handle = och->och_fh;
/* only a write-mode handle can have dirtied data; the flag is
 * test-and-clear so the dirty notification is sent exactly once */
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the MDS_CLOSE RPC for @och, packing bias-specific payload, then
 * invalidate the handle.  NOTE(review): several lines (switch head,
 * default label, out labels, RETURN) are elided in this excerpt. */
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* a torn-down MDC export means we cannot talk to the MDT at all */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough: MERGE reuses the SWAP packing below (lease handle,
 * second fid) — intentional, do not add a break here */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
/* @data is the peer inode whose layout is swapped/merged */
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_HSM_RELEASE:
160 LASSERT(data != NULL);
161 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version the archive copy was made from */
162 op_data->op_data_version = *(__u64 *)data;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
168 LASSERT(data == NULL);
172 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error */
173 if (rc != 0 && rc != -EINTR)
174 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
175 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on a biased close, verify the MDT actually executed the intent */
177 if (rc == 0 && op_data->op_bias & bias) {
178 struct mdt_body *body;
180 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
181 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
185 ll_finish_md_op_data(op_data);
189 md_clear_open_replay_data(md_exp, och);
/* poison the cookie so any later use of this handle is detectable */
190 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
193 ptlrpc_req_finished(req); /* This is close request */
/* Drop the MDS open handle of the given mode (write/exec/read) if this
 * was its last user; otherwise just decrement and return. */
197 int ll_md_real_close(struct inode *inode, fmode_t fmode)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
/* pick the handle slot + refcount that matches the open mode */
206 if (fmode & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (fmode & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(fmode & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount > 0) {
220 /* There are still users of this handle, so skip
222 mutex_unlock(&lli->lli_och_mutex);
228 mutex_unlock(&lli->lli_och_mutex);
231 /* There might be a race and this handle may already
/* no bias, no payload: a plain close of the open handle */
233 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: release group lock, lease and private open
 * handle, drop the mode usecount, and only talk to the MDS if no
 * cached OPEN lock covers the file.  Frees @fd at the end. */
239 static int ll_md_close(struct inode *inode, struct file *file)
241 union ldlm_policy_data policy = {
242 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a granted OPEN lock, do not take a ref */
244 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
247 struct lustre_handle lockh;
248 enum ldlm_mode lockmode;
252 /* clear group lock, if present */
253 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
254 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
256 if (fd->fd_lease_och != NULL) {
259 /* Usually the lease is not released when the
260 * application crashed, we need to release here. */
261 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
262 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
263 PFID(&lli->lli_fid), rc, lease_broken);
265 fd->fd_lease_och = NULL;
/* fd_och holds a private handle taken over for a lease; close it */
268 if (fd->fd_och != NULL) {
269 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
274 /* Let's see if we have good enough OPEN lock on the file and if
275 we can skip talking to MDS */
276 mutex_lock(&lli->lli_och_mutex);
277 if (fd->fd_omode & FMODE_WRITE) {
279 LASSERT(lli->lli_open_fd_write_count);
280 lli->lli_open_fd_write_count--;
281 } else if (fd->fd_omode & FMODE_EXEC) {
283 LASSERT(lli->lli_open_fd_exec_count);
284 lli->lli_open_fd_exec_count--;
287 LASSERT(lli->lli_open_fd_read_count);
288 lli->lli_open_fd_read_count--;
290 mutex_unlock(&lli->lli_och_mutex);
/* no matching cached OPEN lock -> must do the real MDS close now */
292 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
293 LDLM_IBITS, &policy, lockmode, &lockh))
294 rc = ll_md_real_close(inode, fd->fd_omode);
297 LUSTRE_FPRIVATE(file) = NULL;
298 ll_file_data_put(fd);
303 /* While this returns an error code, fput() the caller does not, so we need
304 * to make every effort to clean up all of our state here. Also, applications
305 * rarely check close errors and even if an error is returned they will not
306 * re-try the close call.
/* VFS ->release() entry point: tally stats, detach statahead, flush
 * async write errors into the return code, then do the MD close. */
308 int ll_file_release(struct inode *inode, struct file *file)
310 struct ll_file_data *fd;
311 struct ll_sb_info *sbi = ll_i2sbi(inode);
312 struct ll_inode_info *lli = ll_i2info(inode);
316 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
317 PFID(ll_inode2fid(inode)), inode);
/* the root dentry is special-cased; don't count it in RELEASE stats */
319 if (inode->i_sb->s_root != file_dentry(file))
320 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
321 fd = LUSTRE_FPRIVATE(file);
324 /* The last ref on @file, maybe not the the owner pid of statahead,
325 * because parent and child process can share the same file handle. */
326 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
327 ll_deauthorize_statahead(inode, fd);
/* root has no MDS open handle: just free the fd and return early */
329 if (inode->i_sb->s_root == file_dentry(file)) {
330 LUSTRE_FPRIVATE(file) = NULL;
331 ll_file_data_put(fd);
/* pick up any async write error recorded by the OSC layer so the
 * application sees it from close() */
335 if (!S_ISDIR(inode->i_mode)) {
336 if (lli->lli_clob != NULL)
337 lov_read_and_clear_async_rc(lli->lli_clob);
338 lli->lli_async_rc = 0;
341 rc = ll_md_close(inode, file);
/* fault-injection hook for debugging log dumps */
343 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
344 libcfs_debug_dumplog();
/* Issue an intent-OPEN enqueue to the MDS for @de.  Packs the name only
 * when the server lacks open-by-fid support; fills the inode from the
 * reply and attaches the returned lock on success. */
349 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
350 struct lookup_intent *itp)
352 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
353 struct dentry *parent = de->d_parent;
354 const char *name = NULL;
356 struct md_op_data *op_data;
357 struct ptlrpc_request *req = NULL;
361 LASSERT(parent != NULL);
362 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
364 /* if server supports open-by-fid, or file name is invalid, don't pack
365 * name in open request */
366 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
367 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
368 name = de->d_name.name;
369 len = de->d_name.len;
372 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
373 name, len, 0, LUSTRE_OPC_ANY, NULL);
375 RETURN(PTR_ERR(op_data));
/* pass the caller's striping (lmm) along with the open request */
376 op_data->op_data = lmm;
377 op_data->op_data_size = lmmsize;
379 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
380 &ll_md_blocking_ast, 0);
381 ll_finish_md_op_data(op_data);
383 /* reason for keep own exit path - don`t flood log
384 * with messages with -ESTALE errors.
/* the open was granted but we are bailing out: close the server-side
 * openhandle so it is not leaked */
386 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
387 it_open_error(DISP_OPEN_OPEN, itp))
389 ll_release_openhandle(de, itp);
393 if (it_disposition(itp, DISP_LOOKUP_NEG))
394 GOTO(out, rc = -ENOENT);
396 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
397 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
398 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* instantiate/refresh the inode from the MDS reply */
402 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
403 if (!rc && itp->it_lock_mode)
404 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
407 ptlrpc_req_finished(req);
408 ll_intent_drop_lock(itp);
410 /* We did open by fid, but by the time we got to the server,
411 * the object disappeared. If this is a create, we cannot really
412 * tell the userspace that the file it was trying to create
413 * does not exist. Instead let's return -ESTALE, and the VFS will
414 * retry the create with LOOKUP_REVAL that we are going to catch
415 * in ll_revalidate_dentry() and use lookup then.
417 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the mdt_body in the intent reply
 * and register it for open replay after MDS recovery. */
423 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
424 struct obd_client_handle *och)
426 struct mdt_body *body;
428 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
429 och->och_fh = body->mbo_handle;
430 och->och_fid = body->mbo_fid1;
/* the enqueued lock doubles as the lease handle for this open */
431 och->och_lease_handle.cookie = it->it_lock_handle;
432 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
433 och->och_flags = it->it_flags;
435 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from
 * the intent, then attach @fd to the struct file and initialize its
 * readahead and cl_io bookkeeping.  @och may be NULL (handle reuse). */
438 static int ll_local_open(struct file *file, struct lookup_intent *it,
439 struct ll_file_data *fd, struct obd_client_handle *och)
441 struct inode *inode = file_inode(file);
444 LASSERT(!LUSTRE_FPRIVATE(file));
451 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
456 LUSTRE_FPRIVATE(file) = fd;
457 ll_readahead_init(inode, &fd->fd_ras);
/* remember only the access-mode bits the rest of llite cares about */
458 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
460 /* ll_cl_context initialize */
461 rwlock_init(&fd->fd_lock);
462 INIT_LIST_HEAD(&fd->fd_lccs);
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called.
471 * If we already have the stripe MD locally then we don't request it in
472 * md_open(), by passing a lmm_size = 0.
474 * It is up to the application to ensure no other processes open this file
475 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
476 * used. We might be able to avoid races of that sort by getting lli_open_sem
477 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
478 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Either reuses an intent stashed by lookup
 * (file->private_data) or builds a fresh IT_OPEN intent, then shares or
 * creates the per-mode MDS open handle under lli_och_mutex.
 * NOTE(review): several control-flow lines (else branches, labels,
 * restart loop) are elided in this excerpt. */
480 int ll_file_open(struct inode *inode, struct file *file)
482 struct ll_inode_info *lli = ll_i2info(inode);
483 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
484 .it_flags = file->f_flags };
485 struct obd_client_handle **och_p = NULL;
486 __u64 *och_usecount = NULL;
487 struct ll_file_data *fd;
491 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
492 PFID(ll_inode2fid(inode)), inode, file->f_flags);
494 it = file->private_data; /* XXX: compat macro */
495 file->private_data = NULL; /* prevent ll_local_open assertion */
497 fd = ll_file_data_get();
499 GOTO(out_openerr, rc = -ENOMEM);
502 if (S_ISDIR(inode->i_mode))
503 ll_authorize_statahead(inode, fd);
/* root dentry: no MDS open handle needed, just attach fd */
505 if (inode->i_sb->s_root == file_dentry(file)) {
506 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: construct one from f_flags */
510 if (!it || !it->it_disposition) {
511 /* Convert f_flags into access mode. We cannot use file->f_mode,
512 * because everything but O_ACCMODE mask was stripped from
/* O_RDONLY=0/O_WRONLY=1/O_RDWR=2: +1 turns that into FMODE bits */
514 if ((oit.it_flags + 1) & O_ACCMODE)
516 if (file->f_flags & O_TRUNC)
517 oit.it_flags |= FMODE_WRITE;
519 /* kernel only call f_op->open in dentry_open. filp_open calls
520 * dentry_open after call to open_namei that checks permissions.
521 * Only nfsd_open call dentry_open directly without checking
522 * permissions and because of that this code below is safe. */
523 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
524 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
526 /* We do not want O_EXCL here, presumably we opened the file
527 * already? XXX - NFS implications? */
528 oit.it_flags &= ~O_EXCL;
530 /* bug20584, if "it_flags" contains O_CREAT, the file will be
531 * created if necessary, then "IT_CREAT" should be set to keep
532 * consistent with it */
533 if (oit.it_flags & O_CREAT)
534 oit.it_op |= IT_CREAT;
540 /* Let's see if we have file open on MDS already. */
541 if (it->it_flags & FMODE_WRITE) {
542 och_p = &lli->lli_mds_write_och;
543 och_usecount = &lli->lli_open_fd_write_count;
544 } else if (it->it_flags & FMODE_EXEC) {
545 och_p = &lli->lli_mds_exec_och;
546 och_usecount = &lli->lli_open_fd_exec_count;
548 och_p = &lli->lli_mds_read_och;
549 och_usecount = &lli->lli_open_fd_read_count;
552 mutex_lock(&lli->lli_och_mutex);
553 if (*och_p) { /* Open handle is present */
554 if (it_disposition(it, DISP_OPEN_OPEN)) {
555 /* Well, there's extra open request that we do not need,
556 let's close it somehow. This will decref request. */
557 rc = it_open_error(DISP_OPEN_OPEN, it);
559 mutex_unlock(&lli->lli_och_mutex);
560 GOTO(out_openerr, rc);
563 ll_release_openhandle(file_dentry(file), it);
/* share the existing handle: och == NULL means reuse */
567 rc = ll_local_open(file, it, fd, NULL);
570 mutex_unlock(&lli->lli_och_mutex);
571 GOTO(out_openerr, rc);
574 LASSERT(*och_usecount == 0);
575 if (!it->it_disposition) {
576 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
577 /* We cannot just request lock handle now, new ELC code
578 means that one of other OPEN locks for this file
579 could be cancelled, and since blocking ast handler
580 would attempt to grab och_mutex as well, that would
581 result in a deadlock */
582 mutex_unlock(&lli->lli_och_mutex);
584 * Normally called under two situations:
586 * 2. A race/condition on MDS resulting in no open
587 * handle to be returned from LOOKUP|OPEN request,
588 * for example if the target entry was a symlink.
590 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
591 * marked by a bit set in ll_iget_for_nfs. Clear the
592 * bit so that it's not confusing later callers.
594 * NB; when ldd is NULL, it must have come via normal
595 * lookup path only, since ll_iget_for_nfs always calls
598 if (ldd && ldd->lld_nfs_dentry) {
599 ldd->lld_nfs_dentry = 0;
600 it->it_flags |= MDS_OPEN_LOCK;
604 * Always specify MDS_OPEN_BY_FID because we don't want
605 * to get file with different fid.
607 it->it_flags |= MDS_OPEN_BY_FID;
608 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
611 GOTO(out_openerr, rc);
/* first opener of this mode: allocate the shared handle */
615 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
617 GOTO(out_och_free, rc = -ENOMEM);
621 /* md_intent_lock() didn't get a request ref if there was an
622 * open error, so don't do cleanup on the request here
624 /* XXX (green): Should not we bail out on any error here, not
625 * just open error? */
626 rc = it_open_error(DISP_OPEN_OPEN, it);
628 GOTO(out_och_free, rc);
630 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
631 "inode %p: disposition %x, status %d\n", inode,
632 it_disposition(it, ~0), it->it_status);
634 rc = ll_local_open(file, it, fd, *och_p);
636 GOTO(out_och_free, rc);
638 mutex_unlock(&lli->lli_och_mutex);
641 /* Must do this outside lli_och_mutex lock to prevent deadlock where
642 different kind of OPEN lock for this same inode gets cancelled
643 by ldlm_cancel_lru */
644 if (!S_ISREG(inode->i_mode))
645 GOTO(out_och_free, rc);
647 cl_lov_delay_create_clear(&file->f_flags);
648 GOTO(out_och_free, rc);
/* error path: free a half-initialized shared handle */
652 if (och_p && *och_p) {
653 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
654 *och_p = NULL; /* OBD_FREE writes some magic there */
657 mutex_unlock(&lli->lli_och_mutex);
660 if (lli->lli_opendir_key == fd)
661 ll_deauthorize_statahead(inode, fd);
663 ll_file_data_put(fd);
665 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the request reference the enqueue pinned for us */
668 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
669 ptlrpc_req_finished(it->it_request);
670 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously; the CANCELING case is a no-op here (openhandle is
 * dealt with by the lease holder, see comment in ll_lease_open()). */
676 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
677 struct ldlm_lock_desc *desc, void *data, int flag)
680 struct lustre_handle lockh;
684 case LDLM_CB_BLOCKING:
685 ldlm_lock2handle(lock, &lockh);
686 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
688 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
692 case LDLM_CB_CANCELING:
700 * When setting a lease on a file, we take ownership of the lli_mds_*_och
701 * and save it as fd->fd_och so as to force client to reopen the file even
702 * if it has an open lock in cache already.
/* Move the shared per-mode open handle into fd->fd_och (must be its
 * sole user) and report its fh cookie via @old_handle.
 * NOTE(review): the lines that actually transfer *och_p into fd->fd_och
 * are elided in this excerpt. */
704 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
705 struct lustre_handle *old_handle)
707 struct ll_inode_info *lli = ll_i2info(inode);
708 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
709 struct obd_client_handle **och_p;
714 /* Get the openhandle of the file */
715 mutex_lock(&lli->lli_och_mutex);
/* only one lease per struct file */
716 if (fd->fd_lease_och != NULL)
717 GOTO(out_unlock, rc = -EBUSY);
719 if (fd->fd_och == NULL) {
720 if (file->f_mode & FMODE_WRITE) {
721 LASSERT(lli->lli_mds_write_och != NULL);
722 och_p = &lli->lli_mds_write_och;
723 och_usecount = &lli->lli_open_fd_write_count;
725 LASSERT(lli->lli_mds_read_och != NULL);
726 och_p = &lli->lli_mds_read_och;
727 och_usecount = &lli->lli_open_fd_read_count;
/* another opener shares this handle; cannot take ownership */
730 if (*och_usecount > 1)
731 GOTO(out_unlock, rc = -EBUSY);
738 *old_handle = fd->fd_och->och_fh;
742 mutex_unlock(&lli->lli_och_mutex);
747 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Give fd->fd_och back to the shared per-mode slot; if another process
 * re-opened meanwhile (slot non-NULL), close our duplicate instead.
 * NOTE(review): the branch installing fd_och into *och_p is elided. */
749 static int ll_lease_och_release(struct inode *inode, struct file *file)
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
754 struct obd_client_handle *old_och = NULL;
759 mutex_lock(&lli->lli_och_mutex);
760 if (file->f_mode & FMODE_WRITE) {
761 och_p = &lli->lli_mds_write_och;
762 och_usecount = &lli->lli_open_fd_write_count;
764 och_p = &lli->lli_mds_read_och;
765 och_usecount = &lli->lli_open_fd_read_count;
768 /* The file may have been open by another process (broken lease) so
769 * *och_p is not NULL. In this case we should simply increase usecount
772 if (*och_p != NULL) {
773 old_och = fd->fd_och;
780 mutex_unlock(&lli->lli_och_mutex);
/* close the now-redundant handle outside the mutex */
783 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
789 * Acquire a lease and open the file.
/* Open @inode with an MDS lease of mode @fmode (read xor write).
 * Reuses the caller's existing openhandle cookie so the MDT recognizes
 * the same owner.  Returns the new obd_client_handle or ERR_PTR().
 * NOTE(review): och allocation and some error labels are elided. */
791 static struct obd_client_handle *
792 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
795 struct lookup_intent it = { .it_op = IT_OPEN };
796 struct ll_sb_info *sbi = ll_i2sbi(inode);
797 struct md_op_data *op_data;
798 struct ptlrpc_request *req = NULL;
799 struct lustre_handle old_handle = { 0 };
800 struct obd_client_handle *och = NULL;
/* a lease is exactly read or exactly write, never both/neither */
805 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
806 RETURN(ERR_PTR(-EINVAL));
/* the struct file must already grant the requested mode, and exec
 * opens cannot take leases */
809 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
810 RETURN(ERR_PTR(-EPERM));
812 rc = ll_lease_och_acquire(inode, file, &old_handle);
819 RETURN(ERR_PTR(-ENOMEM));
821 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
822 LUSTRE_OPC_ANY, NULL);
824 GOTO(out, rc = PTR_ERR(op_data));
826 /* To tell the MDT this openhandle is from the same owner */
827 op_data->op_handle = old_handle;
829 it.it_flags = fmode | open_flags;
830 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
831 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
832 &ll_md_blocking_lease_ast,
833 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
834 * it can be cancelled which may mislead applications that the lease is
836 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
837 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
838 * doesn't deal with openhandle, so normal openhandle will be leaked. */
839 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
840 ll_finish_md_op_data(op_data);
841 ptlrpc_req_finished(req);
843 GOTO(out_release_it, rc);
845 if (it_disposition(&it, DISP_LOOKUP_NEG))
846 GOTO(out_release_it, rc = -ENOENT);
848 rc = it_open_error(DISP_OPEN_OPEN, &it);
850 GOTO(out_release_it, rc);
852 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
853 ll_och_fill(sbi->ll_md_exp, &it, och);
855 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
856 GOTO(out_close, rc = -EOPNOTSUPP);
858 /* already get lease, handle lease lock */
859 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* a granted lease must come with an OPEN ibits lock */
860 if (it.it_lock_mode == 0 ||
861 it.it_lock_bits != MDS_INODELOCK_OPEN) {
862 /* open lock must return for lease */
863 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
864 PFID(ll_inode2fid(inode)), it.it_lock_mode,
866 GOTO(out_close, rc = -EPROTO);
869 ll_intent_release(&it);
873 /* Cancel open lock */
874 if (it.it_lock_mode != 0) {
875 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
878 och->och_lease_handle.cookie = 0ULL;
880 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
882 CERROR("%s: error closing file "DFID": %d\n",
883 ll_get_fsname(inode->i_sb, NULL, 0),
884 PFID(&ll_i2info(inode)->lli_fid), rc2);
885 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
887 ll_intent_release(&it);
895 * Check whether a layout swap can be done between two inodes.
897 * \param[in] inode1 First inode to check
898 * \param[in] inode2 Second inode to check
900 * \retval 0 on success, layout swap can be performed between both inodes
901 * \retval negative error code if requirements are not met
/* Requirements: both regular files, both writable by the caller, and
 * both on the same superblock (same Lustre filesystem). */
903 static int ll_check_swap_layouts_validity(struct inode *inode1,
904 struct inode *inode2)
906 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
909 if (inode_permission(inode1, MAY_WRITE) ||
910 inode_permission(inode2, MAY_WRITE))
913 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a layout swap/merge bias against @inode2, so the MDT
 * exchanges (or merges) the two files' layouts atomically at close.
 * Consumes @och on success.  NOTE(review): the switch head and the
 * out_free_och cleanup are elided in this excerpt. */
919 static int ll_swap_layouts_close(struct obd_client_handle *och,
920 struct inode *inode, struct inode *inode2,
923 const struct lu_fid *fid1 = ll_inode2fid(inode);
924 const struct lu_fid *fid2;
925 enum mds_op_bias bias;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* swapping a file with itself is meaningless -> reject */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
944 case SWAP_LAYOUTS_CLOSE:
945 bias = MDS_CLOSE_LAYOUT_SWAP;
947 case MERGE_LAYOUTS_CLOSE:
948 bias = MDS_CLOSE_LAYOUT_MERGE;
951 GOTO(out_free_och, rc = -EOPNOTSUPP);
954 /* Close the file and {swap,merge} layouts between inode & inode2.
955 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
956 * because we still need it to pack l_remote_handle to MDT. */
957 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
959 och = NULL; /* freed in ll_close_inode_openhandle() */
969 * Release lease and close the file.
970 * It will check if the lease has ever broken.
/* Cancel the lease lock (if not already cancelled by a conflicting
 * access), report breakage via @lease_broken, then close the handle. */
972 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
975 struct ldlm_lock *lock;
/* if the lock is already gone, treat the lease as broken */
976 bool cancelled = true;
980 lock = ldlm_handle2lock(&och->och_lease_handle);
982 lock_res_and_lock(lock);
983 cancelled = ldlm_is_cancel(lock);
984 unlock_res_and_lock(lock);
988 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
989 PFID(&ll_i2info(inode)->lli_fid), cancelled);
992 ldlm_cli_cancel(&och->och_lease_handle, 0);
994 if (lease_broken != NULL)
995 *lease_broken = cancelled;
997 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Merge MDS-provided attributes with OST-side attributes (size, blocks,
 * timestamps) into the VFS inode, under the inode size lock.  Each
 * timestamp keeps whichever of the two sources is most recent. */
1001 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1003 struct ll_inode_info *lli = ll_i2info(inode);
1004 struct cl_object *obj = lli->lli_clob;
1005 struct cl_attr *attr = vvp_env_thread_attr(env);
1013 ll_inode_size_lock(inode);
1015 /* Merge timestamps the most recently obtained from MDS with
1016 * timestamps obtained from OSTs.
1018 * Do not overwrite atime of inode because it may be refreshed
1019 * by file_accessed() function. If the read was served by cache
1020 * data, there is no RPC to be sent so that atime may not be
1021 * transferred to OSTs at all. MDT only updates atime at close time
1022 * if it's at least 'mdd.*.atime_diff' older.
1023 * All in all, the atime in Lustre does not strictly comply with
1024 * POSIX. Solving this problem needs to send an RPC to MDT for each
1025 * read, this will hurt performance. */
1026 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1027 LTIME_S(inode->i_atime) = lli->lli_atime;
1028 lli->lli_update_atime = 0;
1030 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1031 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* snapshot MDS-side times before comparing with OST-side values */
1033 atime = LTIME_S(inode->i_atime);
1034 mtime = LTIME_S(inode->i_mtime);
1035 ctime = LTIME_S(inode->i_ctime);
1037 cl_object_attr_lock(obj);
1038 rc = cl_object_attr_get(env, obj, attr);
1039 cl_object_attr_unlock(obj);
/* -ENODATA (e.g. no layout yet) is not an error for the caller */
1042 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1044 if (atime < attr->cat_atime)
1045 atime = attr->cat_atime;
1047 if (ctime < attr->cat_ctime)
1048 ctime = attr->cat_ctime;
1050 if (mtime < attr->cat_mtime)
1051 mtime = attr->cat_mtime;
1053 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1054 PFID(&lli->lli_fid), attr->cat_size);
/* size and block count always come from the OST view */
1056 i_size_write(inode, attr->cat_size);
1057 inode->i_blocks = attr->cat_blocks;
1059 LTIME_S(inode->i_atime) = atime;
1060 LTIME_S(inode->i_mtime) = mtime;
1061 LTIME_S(inode->i_ctime) = ctime;
1064 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be suppressed for this open,
 * mirroring the checks the VFS makes in file_accessed()/touch_atime():
 * per-open flag, per-inode flags, and per-mount options. */
1069 static bool file_is_noatime(const struct file *file)
1071 const struct vfsmount *mnt = file->f_path.mnt;
1072 const struct inode *inode = file_inode((struct file *)file);
1074 /* Adapted from file_accessed() and touch_atime().*/
1075 if (file->f_flags & O_NOATIME)
1078 if (inode->i_flags & S_NOATIME)
1081 if (IS_NOATIME(inode))
1084 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1087 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1090 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1096 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: seed the embedded
 * kiocb/iter, translate O_* flags into io state, and choose the lock
 * requirement mode (never / maybe / mandatory). */
1098 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1100 struct inode *inode = file_inode(file);
1101 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1103 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1104 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1105 io->u.ci_rw.rw_file = file;
/* parallel-IO worker callback (see ll_file_io_ptask below) */
1106 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1107 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1108 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1110 if (iot == CIT_WRITE) {
1111 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* O_SYNC / O_DIRECT writes must be flushed synchronously */
1112 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1113 file->f_flags & O_DIRECT ||
1116 io->ci_obj = ll_i2info(inode)->lli_clob;
1117 io->ci_lockreq = CILR_MAYBE;
1118 if (ll_file_nolock(file)) {
1119 io->ci_lockreq = CILR_NEVER;
1120 io->ci_no_srvlock = 1;
1121 } else if (file->f_flags & O_APPEND) {
/* appends need a mandatory lock to keep EOF coherent */
1122 io->ci_lockreq = CILR_MANDATORY;
1124 io->ci_noatime = file_is_noatime(file);
/* parallel IO only when enabled on the mount and not appending */
1125 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1126 io->ci_pio = !io->u.ci_rw.rw_append;
/* Worker body for one parallel-IO task: re-creates a cl_io for the
 * task's sub-range, runs the cl_io loop, and accumulates progress into
 * pt->cip_result.  Returns 0 if any bytes moved, else the error.
 * NOTE(review): some error/restart branches are elided in this
 * excerpt. */
1131 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1133 struct cl_io_pt *pt = ptask->pt_cbdata;
1134 struct file *file = pt->cip_file;
1137 loff_t pos = pt->cip_pos;
1142 env = cl_env_get(&refcheck);
1144 RETURN(PTR_ERR(env));
1146 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1147 file_dentry(file)->d_name.name,
1148 pt->cip_iot == CIT_READ ? "read" : "write",
1149 pos, pos + pt->cip_count);
1152 io = vvp_env_thread_io(env);
1153 ll_io_init(io, file, pt->cip_iot);
/* hand the task its private copy of the iterator and iocb */
1154 io->u.ci_rw.rw_iter = pt->cip_iter;
1155 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1156 io->ci_pio = 0; /* It's already in parallel task */
1158 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1159 pt->cip_count - pt->cip_result);
1161 struct vvp_io *vio = vvp_env_io(env);
1163 vio->vui_io_subtype = IO_NORMAL;
1164 vio->vui_fd = LUSTRE_FPRIVATE(file);
1166 ll_cl_add(file, env, io, LCC_RW);
1167 rc = cl_io_loop(env, io);
1168 ll_cl_remove(file, env);
1170 /* cl_io_rw_init() handled IO */
/* fault-injection point for testing partial ptask failures */
1174 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* ci_nob bytes completed: advance result, iterator and position */
1180 if (io->ci_nob > 0) {
1181 pt->cip_result += io->ci_nob;
1182 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1184 pt->cip_iocb.ki_pos = pos;
1185 #ifdef HAVE_KIOCB_KI_LEFT
1186 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1187 #elif defined(HAVE_KI_NBYTES)
1188 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1192 cl_io_fini(env, io);
/* layout changed mid-IO: retry the remaining range */
1194 if ((rc == 0 || rc == -ENODATA) &&
1195 pt->cip_result < pt->cip_count &&
1196 io->ci_need_restart) {
1198 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1199 file_dentry(file)->d_name.name,
1200 pt->cip_iot == CIT_READ ? "read" : "write",
1201 pos, pos + pt->cip_count - pt->cip_result,
1202 pt->cip_result, rc);
1206 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1207 file_dentry(file)->d_name.name,
1208 pt->cip_iot == CIT_READ ? "read" : "write",
1209 pt->cip_result, rc);
1211 cl_env_put(env, &refcheck);
/* partial success counts as success for the ptask framework */
1212 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common bottom half for read/write/splice I/O.
 *
 * Builds a cl_io for the given I/O type (CIT_READ or CIT_WRITE), takes the
 * per-inode range lock where required (writes, and O_DIRECT reads -- see
 * LU-6227), runs the cl_io loop, and restarts the whole I/O when the layout
 * changed underneath it (io->ci_need_restart).
 *
 * NOTE(review): gaps in the embedded line numbers below mark lines lost in
 * extraction (braces, declarations, labels); comments state only what the
 * visible lines establish.
 */
1216 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1217 struct file *file, enum cl_io_type iot,
1218 loff_t *ppos, size_t count)
1220 struct range_lock range;
1221 struct vvp_io *vio = vvp_env_io(env);
1222 struct inode *inode = file_inode(file);
1223 struct ll_inode_info *lli = ll_i2info(inode);
1224 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1232 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1233 file_dentry(file)->d_name.name,
1234 iot == CIT_READ ? "read" : "write", pos, pos + count);
1237 io = vvp_env_thread_io(env);
1238 ll_io_init(io, file, iot);
/* for normal (non-splice) I/O, snapshot the caller's iterator and iocb
 * into the cl_io so lower layers can drive the transfer */
1239 if (args->via_io_subtype == IO_NORMAL) {
1240 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1241 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
/* cl_io_rw_init() == 0 means this layer drives the I/O itself */
1246 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1247 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final offset is unknown */
1249 if (file->f_flags & O_APPEND)
1250 range_lock_init(&range, 0, LUSTRE_EOF);
1252 range_lock_init(&range, pos, pos + count - 1);
1254 vio->vui_fd = LUSTRE_FPRIVATE(file);
1255 vio->vui_io_subtype = args->via_io_subtype;
1257 switch (vio->vui_io_subtype) {
1259 /* Direct IO reads must also take range lock,
1260 * or multiple reads will try to work on the same pages
1261 * See LU-6227 for details. */
1262 if (((iot == CIT_WRITE) ||
1263 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1264 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1265 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1267 rc = range_lock(&lli->lli_write_tree, &range);
1271 range_locked = true;
1275 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1276 vio->u.splice.vui_flags = args->u.splice.via_flags;
1279 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1283 ll_cl_add(file, env, io, LCC_RW);
/* non-pio writes on SUID/SGID-sensitive inodes take the inode lock
 * (presumably inode_lock() sits on the missing line 1286 -- confirm) */
1284 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1285 !lli->lli_inode_locked) {
1287 lli->lli_inode_locked = 1;
1289 rc = cl_io_loop(env, io);
1290 if (lli->lli_inode_locked) {
1291 lli->lli_inode_locked = 0;
1292 inode_unlock(inode);
1294 ll_cl_remove(file, env);
1297 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1299 range_unlock(&lli->lli_write_tree, &range);
1302 /* cl_io_rw_init() handled IO */
/* accumulate bytes moved and advance the caller's iterator/iocb so a
 * restarted pass continues where this one stopped */
1306 if (io->ci_nob > 0) {
1307 result += io->ci_nob;
1308 count -= io->ci_nob;
1310 if (args->via_io_subtype == IO_NORMAL) {
1311 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1313 args->u.normal.via_iocb->ki_pos = pos;
1314 #ifdef HAVE_KIOCB_KI_LEFT
1315 args->u.normal.via_iocb->ki_left = count;
1316 #elif defined(HAVE_KI_NBYTES)
1317 args->u.normal.via_iocb->ki_nbytes = count;
1321 pos = io->u.ci_rw.rw_range.cir_pos;
1325 cl_io_fini(env, io);
/* layout change detected mid-I/O: loop again for the remaining bytes */
1327 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1329 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1330 file_dentry(file)->d_name.name,
1331 iot == CIT_READ ? "read" : "write",
1332 pos, pos + count, result, rc);
/* per-mount stats, plus fd_write_failed bookkeeping used by fsync */
1336 if (iot == CIT_READ) {
1338 ll_stats_ops_tally(ll_i2sbi(inode),
1339 LPROC_LL_READ_BYTES, result);
1340 } else if (iot == CIT_WRITE) {
1342 ll_stats_ops_tally(ll_i2sbi(inode),
1343 LPROC_LL_WRITE_BYTES, result);
1344 fd->fd_write_failed = false;
1345 } else if (result == 0 && rc == 0) {
1348 fd->fd_write_failed = true;
1350 fd->fd_write_failed = false;
1351 } else if (rc != -ERESTARTSYS) {
1352 fd->fd_write_failed = true;
1356 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1357 file_dentry(file)->d_name.name,
1358 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
/* return bytes transferred if any, otherwise the error code */
1362 RETURN(result > 0 ? result : rc);
1366 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1367 * especially for small I/O.
1369 * To serve a read request, CLIO has to create and initialize a cl_io and
1370 * then request DLM lock. This has turned out to have significant overhead
1371 * and affects the performance of small I/O dramatically.
1373 * It's not necessary to create a cl_io for each I/O. Under the help of read
1374 * ahead, most of the pages being read are already in memory cache and we can
1375 * read those pages directly because if the pages exist, the corresponding DLM
1376 * lock must exist so that page content must be valid.
1378 * In fast read implementation, the llite speculatively finds and reads pages
1379 * in memory cache. There are three scenarios for fast read:
1380 * - If the page exists and is uptodate, kernel VM will provide the data and
1381 * CLIO won't be intervened;
1382 * - If the page was brought into memory by read ahead, it will be exported
1383 * and read ahead parameters will be updated;
1384 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1385 * it will go back and invoke normal read, i.e., a cl_io will be created
1386 * and DLM lock will be requested.
1388 * POSIX compliance: posix standard states that read is intended to be atomic.
1389 * Lustre read implementation is in line with Linux kernel read implementation
1390 * and neither of them complies with POSIX standard in this matter. Fast read
1391 * doesn't make the situation worse on single node but it may interleave write
1392 * results from multiple nodes due to short read handling in ll_file_aio_read().
1394 * \param env - lu_env
1395 * \param iocb - kiocb from kernel
1396 * \param iter - user space buffers where the data will be copied
1398 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Lockless "fast read": serve the request straight from the page cache via
 * generic_file_read_iter(), skipping cl_io/DLM setup. See the block comment
 * above for the full rationale. Returns bytes read, or -ENODATA when the
 * pages are not cached (caller falls back to the normal read path).
 */
1401 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1405 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1408 /* NB: we can't do direct IO for fast read because it will need a lock
1409 * to make IO engine happy. */
1410 if (iocb->ki_filp->f_flags & O_DIRECT)
1413 result = generic_file_read_iter(iocb, iter);
1415 /* If the first page is not in cache, generic_file_read_iter() returns
1416 * -ENODATA.
1417 * See corresponding code in ll_readpage(). */
1418 if (result == -ENODATA)
1422 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1423 LPROC_LL_READ_BYTES, result);
1429 * Read from a file (through the page cache).
/* ->read_iter() entry point: try the lockless fast read first; if data
 * remains (or fast read was not possible) fall back to the full
 * ll_file_io_generic() cl_io path for the rest of the iterator. */
1431 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1434 struct vvp_io_args *args;
1439 result = ll_do_fast_read(iocb, to);
/* fast read failed outright, or consumed the whole iterator: done */
1440 if (result < 0 || iov_iter_count(to) == 0)
1443 env = cl_env_get(&refcheck);
1445 return PTR_ERR(env);
1447 args = ll_env_args(env, IO_NORMAL);
1448 args->u.normal.via_iter = to;
1449 args->u.normal.via_iocb = iocb;
1451 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1452 &iocb->ki_pos, iov_iter_count(to));
/* combine fast-read bytes with the slow-path result (lines between the
 * two branches lost in extraction -- see numbering gap) */
1455 else if (result == 0)
1458 cl_env_put(env, &refcheck);
1464 * Write to a file (through the page cache).
/* ->write_iter() entry point: package the iterator/iocb into vvp_io_args
 * and hand off to the common ll_file_io_generic() write path. */
1466 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1468 struct vvp_io_args *args;
1473 env = cl_env_get(&refcheck);
1475 return PTR_ERR(env);
1477 args = ll_env_args(env, IO_NORMAL);
1478 args->u.normal.via_iter = from;
1479 args->u.normal.via_iocb = iocb;
1481 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1482 &iocb->ki_pos, iov_iter_count(from));
1483 cl_env_put(env, &refcheck);
1487 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1489 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count in *count;
 * may shrink *nr_segs to the accessible prefix. Returns 0 or -EINVAL/
 * -EFAULT (error-return lines lost in extraction). */
1491 static int ll_file_get_iov_count(const struct iovec *iov,
1492 unsigned long *nr_segs, size_t *count)
1497 for (seg = 0; seg < *nr_segs; seg++) {
1498 const struct iovec *iv = &iov[seg];
1501 * If any segment has a negative length, or the cumulative
1502 * length ever wraps negative then return -EINVAL.
1505 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* accessible segment: the 'continue' sits on a missing line; on failure
 * the iov is truncated at this segment (see comment below) */
1507 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1512 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy ->aio_read() shim for kernels without ->read_iter(): build an
 * iov_iter from the iovec array and forward to ll_file_read_iter(). */
1519 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1520 unsigned long nr_segs, loff_t pos)
1527 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature changed across kernels; pick by configure test */
1531 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1532 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1533 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1534 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1535 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1537 result = ll_file_read_iter(iocb, &to);
/* Synchronous ->read() shim for kernels without ->read_iter(): wrap the
 * user buffer in a single iovec plus a sync kiocb and forward to
 * ll_file_aio_read(); *ppos is updated from the kiocb afterwards. */
1542 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1545 struct iovec iov = { .iov_base = buf, .iov_len = count };
1550 init_sync_kiocb(&kiocb, file);
1551 kiocb.ki_pos = *ppos;
1552 #ifdef HAVE_KIOCB_KI_LEFT
1553 kiocb.ki_left = count;
1554 #elif defined(HAVE_KI_NBYTES)
/* Fix: the struct kiocb member is ki_nbytes (cf. ll_file_write() and
 * ll_file_io_generic() below); "kiocb.i_nbytes" would fail to compile
 * whenever HAVE_KI_NBYTES is defined. */
1555 kiocb.ki_nbytes = count;
1558 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1559 *ppos = kiocb.ki_pos;
1565 * Write to a file (through the page cache).
/* Legacy ->aio_write() shim: mirror of ll_file_aio_read() for the write
 * path, forwarding to ll_file_write_iter(). */
1568 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1569 unsigned long nr_segs, loff_t pos)
1571 struct iov_iter from;
1576 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature changed across kernels; pick by configure test */
1580 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1581 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1582 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1583 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1584 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1586 result = ll_file_write_iter(iocb, &from);
/* Synchronous ->write() shim for kernels without ->write_iter(): uses a
 * kiocb from the per-thread lu_env (lti_kiocb) rather than the stack --
 * contrast with ll_file_read() above. */
1591 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1592 size_t count, loff_t *ppos)
1595 struct iovec iov = { .iov_base = (void __user *)buf,
1597 struct kiocb *kiocb;
1602 env = cl_env_get(&refcheck);
1604 RETURN(PTR_ERR(env));
1606 kiocb = &ll_env_info(env)->lti_kiocb;
1607 init_sync_kiocb(kiocb, file);
1608 kiocb->ki_pos = *ppos;
1609 #ifdef HAVE_KIOCB_KI_LEFT
1610 kiocb->ki_left = count;
1611 #elif defined(HAVE_KI_NBYTES)
1612 kiocb->ki_nbytes = count;
1615 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1616 *ppos = kiocb->ki_pos;
1618 cl_env_put(env, &refcheck);
1621 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1624 * Send file content (through pagecache) somewhere with helper
/* ->splice_read() entry point: package the pipe/flags into IO_SPLICE
 * vvp_io_args and run the common CIT_READ path. */
1626 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1627 struct pipe_inode_info *pipe, size_t count,
1631 struct vvp_io_args *args;
1636 env = cl_env_get(&refcheck);
1638 RETURN(PTR_ERR(env));
1640 args = ll_env_args(env, IO_SPLICE);
1641 args->u.splice.via_pipe = pipe;
1642 args->u.splice.via_flags = flags;
1644 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1645 cl_env_put(env, &refcheck);
/* Set striping information via an MDS open-by-FID intent: re-opens the
 * file with the lov_user_md attached, then immediately releases the open
 * handle. Runs under the inode size lock. */
1649 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1650 __u64 flags, struct lov_user_md *lum, int lum_size)
1652 struct lookup_intent oit = {
/* MDS_OPEN_BY_FID: target the file by FID, not by name lookup */
1654 .it_flags = flags | MDS_OPEN_BY_FID,
1659 ll_inode_size_lock(inode);
1660 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1662 GOTO(out_unlock, rc);
/* the open only carried the EA; close the handle right away */
1664 ll_release_openhandle(dentry, &oit);
1667 ll_inode_size_unlock(inode);
1668 ll_intent_release(&oit);
/*
 * Fetch striping (LOV EA) for @filename from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request (caller must
 * keep the request alive while using *lmmp and eventually release it),
 * and *lmm_size is the EA size. On big-endian hosts the EA is byte-swapped
 * in place to host order before being handed back.
 */
1673 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1674 struct lov_mds_md **lmmp, int *lmm_size,
1675 struct ptlrpc_request **request)
1677 struct ll_sb_info *sbi = ll_i2sbi(inode);
1678 struct mdt_body *body;
1679 struct lov_mds_md *lmm = NULL;
1680 struct ptlrpc_request *req = NULL;
1681 struct md_op_data *op_data;
1684 rc = ll_get_default_mdsize(sbi, &lmmsize);
1688 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1689 strlen(filename), lmmsize,
1690 LUSTRE_OPC_ANY, NULL);
1691 if (IS_ERR(op_data))
1692 RETURN(PTR_ERR(op_data));
1694 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1695 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1696 ll_finish_md_op_data(op_data);
1698 CDEBUG(D_INFO, "md_getattr_name failed "
1699 "on %s: rc %d\n", filename, rc);
1703 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1704 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1706 lmmsize = body->mbo_eadatasize;
/* no striping EA present on this file/directory */
1708 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1710 GOTO(out, rc = -ENODATA);
1713 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1714 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are understood here */
1716 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1717 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1718 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1719 GOTO(out, rc = -EPROTO);
1722 * This is coming from the MDS, so is probably in
1723 * little endian. We convert it to host endian before
1724 * passing it to userspace.
/* true only on big-endian hosts, where a swab is needed */
1726 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1729 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1730 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1731 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1732 if (le32_to_cpu(lmm->lmm_pattern) &
1733 LOV_PATTERN_F_RELEASED)
1737 /* if called on a directory, avoid swabbing lsm objects
1738 * that do not exist (only regular files carry them) */
1739 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1740 lustre_swab_lov_user_md_v1(
1741 (struct lov_user_md_v1 *)lmm);
1742 if (S_ISREG(body->mbo_mode))
1743 lustre_swab_lov_user_md_objects(
1744 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1746 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1747 lustre_swab_lov_user_md_v3(
1748 (struct lov_user_md_v3 *)lmm);
1749 if (S_ISREG(body->mbo_mode))
1750 lustre_swab_lov_user_md_objects(
1751 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1753 } else if (lmm->lmm_magic ==
1754 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1755 lustre_swab_lov_comp_md_v1(
1756 (struct lov_comp_md_v1 *)lmm);
1762 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a raw LOV EA from userspace and apply it
 * via ll_lov_setstripe_ea_info(). Restricted to CAP_SYS_ADMIN because the
 * EA names specific pre-existing OST objects (MDS_OPEN_HAS_OBJS). */
1767 static int ll_lov_setea(struct inode *inode, struct file *file,
1770 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1771 struct lov_user_md *lump;
/* fixed size: header plus exactly one ost_data entry */
1772 int lum_size = sizeof(struct lov_user_md) +
1773 sizeof(struct lov_user_ost_data);
1777 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1780 OBD_ALLOC_LARGE(lump, lum_size);
1784 if (copy_from_user(lump, arg, lum_size))
1785 GOTO(out_lump, rc = -EFAULT);
1787 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1789 cl_lov_delay_create_clear(&file->f_flags);
1792 OBD_FREE_LARGE(lump, lum_size);
/* Copy this inode's layout to the userspace buffer @lum (at most @size
 * bytes) via cl_object_getstripe(). */
1796 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1803 env = cl_env_get(&refcheck);
1805 RETURN(PTR_ERR(env));
1807 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1808 cl_env_put(env, &refcheck);
1812 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1815 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1816 struct lov_user_md *klum;
1818 __u64 flags = FMODE_WRITE;
1821 rc = ll_copy_user_md(lum, &klum);
1826 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1831 rc = put_user(0, &lum->lmm_stripe_count);
1835 rc = ll_layout_refresh(inode, &gen);
1839 rc = ll_file_getstripe(inode, arg, lum_size);
1841 cl_lov_delay_create_clear(&file->f_flags);
1844 OBD_FREE(klum, lum_size);
/*
 * Take a group lock with group id @arg on the whole file and record it in
 * the per-fd state (fd_grouplock / LL_FILE_GROUP_LOCKED). Only one group
 * lock per fd; racing takers are resolved under lli_lock.
 */
1849 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1851 struct ll_inode_info *lli = ll_i2info(inode);
1852 struct cl_object *obj = lli->lli_clob;
1853 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1854 struct ll_grouplock grouplock;
1859 CWARN("group id for group lock must not be 0\n");
1863 if (ll_file_nolock(file))
1864 RETURN(-EOPNOTSUPP);
/* reject a second group lock on the same fd */
1866 spin_lock(&lli->lli_lock);
1867 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1868 CWARN("group lock already existed with gid %lu\n",
1869 fd->fd_grouplock.lg_gid);
1870 spin_unlock(&lli->lli_lock);
1873 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1874 spin_unlock(&lli->lli_lock);
1877 * XXX: group lock needs to protect all OST objects while PFL
1878 * can add new OST objects during the IO, so we'd instantiate
1879 * all OST objects before getting its group lock.
1884 struct cl_layout cl = {
1885 .cl_is_composite = false,
1888 env = cl_env_get(&refcheck);
1890 RETURN(PTR_ERR(env));
1892 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate every component up front */
1893 if (!rc && cl.cl_is_composite)
1894 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1896 cl_env_put(env, &refcheck);
1901 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1902 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under lli_lock: another thread may have won meanwhile */
1906 spin_lock(&lli->lli_lock);
1907 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1908 spin_unlock(&lli->lli_lock);
1909 CERROR("another thread just won the race\n");
1910 cl_put_grouplock(&grouplock);
1914 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1915 fd->fd_grouplock = grouplock;
1916 spin_unlock(&lli->lli_lock);
1918 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock with gid @arg held on this fd; the fd state is
 * cleared under lli_lock, the DLM lock dropped outside it. */
1922 static int ll_put_grouplock(struct inode *inode, struct file *file,
1925 struct ll_inode_info *lli = ll_i2info(inode);
1926 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1927 struct ll_grouplock grouplock;
1930 spin_lock(&lli->lli_lock);
1931 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1932 spin_unlock(&lli->lli_lock);
1933 CWARN("no group lock held\n");
1937 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* gid must match the one the lock was taken with */
1939 if (fd->fd_grouplock.lg_gid != arg) {
1940 CWARN("group lock %lu doesn't match current id %lu\n",
1941 arg, fd->fd_grouplock.lg_gid);
1942 spin_unlock(&lli->lli_lock);
/* detach from the fd while still under the spinlock, release after */
1946 grouplock = fd->fd_grouplock;
1947 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1948 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1949 spin_unlock(&lli->lli_lock);
1951 cl_put_grouplock(&grouplock);
1952 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1957 * Close inode open handle
1959 * \param dentry [in] dentry which contains the inode
1960 * \param it [in,out] intent which contains open info and result
1963 * \retval <0 failure
1965 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1967 struct inode *inode = dentry->d_inode;
1968 struct obd_client_handle *och;
1974 /* Root ? Do nothing. */
1975 if (dentry->d_inode->i_sb->s_root == dentry)
1978 /* No open handle to close? Move away */
1979 if (!it_disposition(it, DISP_OPEN_OPEN))
1982 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1984 OBD_ALLOC(och, sizeof(*och));
1986 GOTO(out, rc = -ENOMEM);
/* fill the handle from the intent, then close it on the MDS */
1988 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1990 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1992 /* this one is in place of ll_file_open */
1993 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1994 ptlrpc_req_finished(it->it_request);
1995 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2001 * Get size for inode for which FIEMAP mapping is requested.
2002 * Make the FIEMAP get_info call and returns the result.
2003 * \param fiemap kernel buffer to hold extens
2004 * \param num_bytes kernel buffer size
2006 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2012 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2015 /* Checks for fiemap flags */
/* unsupported flags are stripped and reported back to the caller */
2016 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2017 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2021 /* Check for FIEMAP_FLAG_SYNC */
2022 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2023 rc = filemap_fdatawrite(inode->i_mapping);
2028 env = cl_env_get(&refcheck);
2030 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale; glimpse to get the real one */
2032 if (i_size_read(inode) == 0) {
2033 rc = ll_glimpse_size(inode);
2038 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2039 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2040 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2042 /* If filesize is 0, then there would be no objects for mapping */
2043 if (fmkey.lfik_oa.o_size == 0) {
2044 fiemap->fm_mapped_extents = 0;
2048 fmkey.lfik_fiemap = *fiemap;
2050 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2051 &fmkey, fiemap, &num_bytes);
2053 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH: resolve this inode's FID to a path name via the MDC.
 * @arg is a struct getinfo_fid2path with a variable-length path buffer;
 * requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
2057 int ll_fid2path(struct inode *inode, void __user *arg)
2059 struct obd_export *exp = ll_i2mdexp(inode);
2060 const struct getinfo_fid2path __user *gfin = arg;
2062 struct getinfo_fid2path *gfout;
2068 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2069 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2072 /* Only need to get the buflen */
2073 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the user-supplied length before allocating */
2076 if (pathlen > PATH_MAX)
2079 outsize = sizeof(*gfout) + pathlen;
2080 OBD_ALLOC(gfout, outsize);
2084 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2085 GOTO(gf_free, rc = -EFAULT);
2086 /* append root FID after gfout to let MDT know the root FID so that it
2087 * can lookup the correct path, this is mainly for fileset.
2088 * old server without fileset mount support will ignore this. */
2089 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2091 /* Call mdc_iocontrol */
2092 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2096 if (copy_to_user(arg, gfout, outsize))
2100 OBD_FREE(gfout, outsize);
2105 * Read the data_version for inode.
2107 * This value is computed using stripe object version on OST.
2108 * Version is computed using server side locking.
2110 * @param flags if do sync on the OST side;
2112 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2113 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2115 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2117 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2125 /* If no file object initialized, we consider its version is 0. */
2131 env = cl_env_get(&refcheck);
2133 RETURN(PTR_ERR(env));
/* run a CIT_DATA_VERSION cl_io to collect the version from the OSTs */
2135 io = vvp_env_thread_io(env);
2137 io->u.ci_data_version.dv_data_version = 0;
2138 io->u.ci_data_version.dv_flags = flags;
2141 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2142 result = cl_io_loop(env, io);
2144 result = io->ci_result;
2146 *data_version = io->u.ci_data_version.dv_data_version;
2148 cl_io_fini(env, io);
/* layout changed mid-io: retry (restart target on a missing line) */
2150 if (unlikely(io->ci_need_restart))
2153 cl_env_put(env, &refcheck);
2159 * Trigger a HSM release request for the provided inode.
2161 int ll_hsm_release(struct inode *inode)
2164 struct obd_client_handle *och = NULL;
2165 __u64 data_version = 0;
2170 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2171 ll_get_fsname(inode->i_sb, NULL, 0),
2172 PFID(&ll_i2info(inode)->lli_fid));
/* take a write lease so no other client can modify during release */
2174 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2176 GOTO(out, rc = PTR_ERR(och));
2178 /* Grab latest data_version and [am]time values */
2179 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2183 env = cl_env_get(&refcheck);
2185 GOTO(out, rc = PTR_ERR(env));
2187 ll_merge_attr(env, inode);
2188 cl_env_put(env, &refcheck);
2190 /* Release the file.
2191 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2192 * we still need it to pack l_remote_handle to MDT. */
2193 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2199 if (och != NULL && !IS_ERR(och)) /* close the file */
2200 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(); besides the two inodes it holds
 * the data-version values/flags (members on lines lost in extraction:
 * dv1/dv2 and check_dv1/check_dv2, per the usage below). */
2205 struct ll_swap_stack {
2208 struct inode *inode1;
2209 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically swap the layouts of two files.
 * Orders the pair by FID to avoid ABBA deadlocks, optionally flushes
 * caches via group locks (gid != 0) and verifies data versions did not
 * change (SWAP_LAYOUTS_CHECK_DV*) before asking the MDT to swap.
 */
2214 static int ll_swap_layouts(struct file *file1, struct file *file2,
2215 struct lustre_swap_layouts *lsl)
2217 struct mdc_swap_layouts msl;
2218 struct md_op_data *op_data;
2221 struct ll_swap_stack *llss = NULL;
2224 OBD_ALLOC_PTR(llss);
2228 llss->inode1 = file_inode(file1);
2229 llss->inode2 = file_inode(file2);
2231 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2235 /* we use 2 bool because it is easier to swap than 2 bits */
2236 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2237 llss->check_dv1 = true;
2239 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2240 llss->check_dv2 = true;
2242 /* we cannot use lsl->sl_dvX directly because we may swap them */
2243 llss->dv1 = lsl->sl_dv1;
2244 llss->dv2 = lsl->sl_dv2;
2246 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2247 if (rc == 0) /* same file, done! */
/* canonical FID order so concurrent swaps cannot deadlock */
2250 if (rc < 0) { /* sequentialize it */
2251 swap(llss->inode1, llss->inode2);
2253 swap(llss->dv1, llss->dv2);
2254 swap(llss->check_dv1, llss->check_dv2);
2258 if (gid != 0) { /* application asks to flush dirty cache */
2259 rc = ll_get_grouplock(llss->inode1, file1, gid);
2263 rc = ll_get_grouplock(llss->inode2, file2, gid);
2265 ll_put_grouplock(llss->inode1, file1, gid);
2270 /* ultimate check, before swaping the layouts we check if
2271 * dataversion has changed (if requested) */
2272 if (llss->check_dv1) {
2273 rc = ll_data_version(llss->inode1, &dv, 0);
2276 if (dv != llss->dv1)
2277 GOTO(putgl, rc = -EAGAIN);
2280 if (llss->check_dv2) {
2281 rc = ll_data_version(llss->inode2, &dv, 0);
2284 if (dv != llss->dv2)
2285 GOTO(putgl, rc = -EAGAIN);
2288 /* struct md_op_data is used to send the swap args to the mdt
2289 * only flags is missing, so we use struct mdc_swap_layouts
2290 * through the md_op_data->op_data */
2291 /* flags from user space have to be converted before they are send to
2292 * server, no flag is sent today, they are only used on the client */
2295 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2296 0, LUSTRE_OPC_ANY, &msl);
2297 if (IS_ERR(op_data))
2298 GOTO(free, rc = PTR_ERR(op_data));
2300 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2301 sizeof(*op_data), op_data, NULL);
2302 ll_finish_md_op_data(op_data);
/* drop in reverse acquisition order */
2309 ll_put_grouplock(llss->inode2, file2, gid);
2310 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on an inode via the MDT. Validates the masks
 * (non-root may only touch HSM_USER_MASK bits) and the archive id range
 * before issuing LL_IOC_HSM_STATE_SET. */
2320 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2322 struct md_op_data *op_data;
2326 /* Detect out-of range masks */
2327 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2330 /* Non-root users are forbidden to set or clear flags which are
2331 * NOT defined in HSM_USER_MASK. */
2332 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2333 !cfs_capable(CFS_CAP_SYS_ADMIN))
2336 /* Detect out-of range archive id */
2337 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2338 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2341 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2342 LUSTRE_OPC_ANY, hss);
2343 if (IS_ERR(op_data))
2344 RETURN(PTR_ERR(op_data));
2346 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2347 sizeof(*op_data), op_data, NULL);
2349 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+released on the MDT, then
 * force its attributes (mode/uid/gid/size/times) to the values recorded
 * in @hui, as if the file content already lived in the archive.
 */
2354 static int ll_hsm_import(struct inode *inode, struct file *file,
2355 struct hsm_user_import *hui)
2357 struct hsm_state_set *hss = NULL;
2358 struct iattr *attr = NULL;
2362 if (!S_ISREG(inode->i_mode))
2368 GOTO(out, rc = -ENOMEM);
/* step 1: set HSM flags so the file counts as released in the archive */
2370 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2371 hss->hss_archive_id = hui->hui_archive_id;
2372 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2373 rc = ll_hsm_state_set(inode, hss);
2377 OBD_ALLOC_PTR(attr);
2379 GOTO(out, rc = -ENOMEM);
/* step 2: force the imported attributes; only permission bits are kept
 * from hui_mode, the S_IFREG type bit is fixed */
2381 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2382 attr->ia_mode |= S_IFREG;
2383 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2384 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2385 attr->ia_size = hui->hui_size;
2386 attr->ia_mtime.tv_sec = hui->hui_mtime;
2387 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2388 attr->ia_atime.tv_sec = hui->hui_atime;
2389 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2391 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2392 ATTR_UID | ATTR_GID |
2393 ATTR_MTIME | ATTR_MTIME_SET |
2394 ATTR_ATIME | ATTR_ATIME_SET;
2398 rc = ll_setattr_raw(file_dentry(file), attr, true);
2402 inode_unlock(inode);
2414 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2416 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2417 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime (including ctime, which
 * plain utimes cannot touch) from the ll_futimes_3 payload. Root-only,
 * regular files only; runs under the inode lock. */
2420 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2422 struct inode *inode = file_inode(file);
2424 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2425 ATTR_MTIME | ATTR_MTIME_SET |
2426 ATTR_CTIME | ATTR_CTIME_SET,
2428 .tv_sec = lfu->lfu_atime_sec,
2429 .tv_nsec = lfu->lfu_atime_nsec,
2432 .tv_sec = lfu->lfu_mtime_sec,
2433 .tv_nsec = lfu->lfu_mtime_nsec,
2436 .tv_sec = lfu->lfu_ctime_sec,
2437 .tv_nsec = lfu->lfu_ctime_nsec,
2443 if (!capable(CAP_SYS_ADMIN))
2446 if (!S_ISREG(inode->i_mode))
2450 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2451 inode_unlock(inode);
/* Map a userspace lock_mode_user (MODE_READ_USER/MODE_WRITE_USER) to the
 * kernel cl_lock_mode (return statements on lines lost in extraction). */
2456 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2459 case MODE_READ_USER:
2461 case MODE_WRITE_USER:
/* printable names for the user lock modes, for debug messages below */
2468 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2470 /* Used to allow the upper layers of the client to request an LDLM lock
2471 * without doing an actual read or write.
2473 * Used for ladvise lockahead to manually request specific locks.
2475 * \param[in] file file this ladvise lock request is on
2476 * \param[in] ladvise ladvise struct describing this lock request
2478 * \retval 0 success, no detailed result available (sync requests
2479 * and requests sent to the server [not handled locally]
2480 * cannot return detailed results)
2481 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2482 * see definitions for details.
2483 * \retval negative negative errno on error
2485 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2487 struct lu_env *env = NULL;
2488 struct cl_io *io = NULL;
2489 struct cl_lock *lock = NULL;
2490 struct cl_lock_descr *descr = NULL;
2491 struct dentry *dentry = file->f_path.dentry;
2492 struct inode *inode = dentry->d_inode;
2493 enum cl_lock_mode cl_mode;
2494 off_t start = ladvise->lla_start;
2495 off_t end = ladvise->lla_end;
2501 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2502 "start=%llu, end=%llu\n", dentry->d_name.len,
2503 dentry->d_name.name, dentry->d_inode,
2504 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2507 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2509 GOTO(out, result = cl_mode);
2511 /* Get IO environment */
2512 result = cl_io_get(inode, &env, &io, &refcheck);
/* a CIT_MISC io carries the lock request without doing any data I/O */
2516 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2519 * nothing to do for this io. This currently happens when
2520 * stripe sub-object's are not yet created.
2522 result = io->ci_result;
2523 } else if (result == 0) {
2524 lock = vvp_env_lock(env);
2525 descr = &lock->cll_descr;
2527 descr->cld_obj = io->ci_obj;
2528 /* Convert byte offsets to pages */
2529 descr->cld_start = cl_index(io->ci_obj, start);
2530 descr->cld_end = cl_index(io->ci_obj, end);
2531 descr->cld_mode = cl_mode;
2532 /* CEF_MUST is used because we do not want to convert a
2533 * lockahead request to a lockless lock */
2534 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2537 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2538 descr->cld_enq_flags |= CEF_SPECULATIVE;
2540 result = cl_lock_request(env, io, lock);
2542 /* On success, we need to release the lock */
2544 cl_lock_release(env, lock);
2546 cl_io_fini(env, io);
2547 cl_env_put(env, &refcheck);
2549 /* -ECANCELED indicates a matching lock with a different extent
2550 * was already present, and -EEXIST indicates a matching lock
2551 * on exactly the same extent was already present.
2552 * We convert them to positive values for userspace to make
2553 * recognizing true errors easier.
2554 * Note we can only return these detailed results on async requests,
2555 * as sync requests look the same as i/o requests for locking. */
2556 if (result == -ECANCELED)
2557 result = LLA_RESULT_DIFFERENT;
2558 else if (result == -EEXIST)
2559 result = LLA_RESULT_SAME;
/* printable names for the ladvise advice values, for debug messages */
2564 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate one llapi_lu_ladvise record: known advice value, per-advice
 * flag mask, valid lock mode (LOCKAHEAD), and a non-empty byte range.
 * Returns 0 or a negative errno (assignments on missing lines). */
2566 static int ll_ladvise_sanity(struct inode *inode,
2567 struct llapi_lu_ladvise *ladvise)
2569 enum lu_ladvise_type advice = ladvise->lla_advice;
2570 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2571 * be in the first 32 bits of enum ladvise_flags */
2572 __u32 flags = ladvise->lla_peradvice_flags;
2573 /* 3 lines at 80 characters per line, should be plenty */
2576 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2578 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2579 "last supported advice is %s (value '%d'): rc = %d\n",
2580 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2581 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2585 /* Per-advice checks */
2587 case LU_LADVISE_LOCKNOEXPAND:
2588 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2590 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2592 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2593 ladvise_names[advice], rc);
2597 case LU_LADVISE_LOCKAHEAD:
2598 /* Currently only READ and WRITE modes can be requested */
2599 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2600 ladvise->lla_lockahead_mode == 0) {
2602 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2604 ll_get_fsname(inode->i_sb, NULL, 0),
2605 ladvise->lla_lockahead_mode,
2606 ladvise_names[advice], rc);
2609 case LU_LADVISE_WILLREAD:
2610 case LU_LADVISE_DONTNEED:
2612 /* Note fall through above - These checks apply to all advices
2613 * except LOCKNOEXPAND */
2614 if (flags & ~LF_DEFAULT_MASK) {
2616 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2618 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2619 ladvise_names[advice], rc);
2622 if (ladvise->lla_start >= ladvise->lla_end) {
2624 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2625 "for %s: rc = %d\n",
2626 ll_get_fsname(inode->i_sb, NULL, 0),
2627 ladvise->lla_start, ladvise->lla_end,
2628 ladvise_names[advice], rc);
2640 * Give file access advices
2642 * The ladvise interface is similar to Linux fadvise() system call, except it
2643 * forwards the advices directly from Lustre client to server. The server side
2644 * codes will apply appropriate read-ahead and caching techniques for the
2645 * corresponding files.
2647 * A typical workload for ladvise is e.g. a bunch of different clients are
2648 * doing small random reads of a file, so prefetching pages into OSS cache
2649 * with big linear reads before the random IO is a net benefit. Fetching
2650 * all that data into each client cache with fadvise() may not be, due to
2651 * much more data being sent to the client.
/*
 * Submit a single (already-sanity-checked) ladvise to the server via a
 * CIT_LADVISE cl_io.  Copies the advice type, byte range and flags into
 * the io descriptor, runs the io loop, then tears the io down.
 * Returns 0 or a negative errno (the rc declaration/RETURN lines are
 * elided from this listing).
 */
2653 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2654 struct llapi_lu_ladvise *ladvise)
2658 struct cl_ladvise_io *lio;
2663 env = cl_env_get(&refcheck);
2665 RETURN(PTR_ERR(env));
2667 io = vvp_env_thread_io(env);
2668 io->ci_obj = ll_i2info(inode)->lli_clob;
2670 /* initialize parameters for ladvise */
2671 lio = &io->u.ci_ladvise;
2672 lio->li_start = ladvise->lla_start;
2673 lio->li_end = ladvise->lla_end;
2674 lio->li_fid = ll_inode2fid(inode);
2675 lio->li_advice = ladvise->lla_advice;
2676 lio->li_flags = flags;
/* cl_io_init() returning 0 means the io was set up; only then run it. */
2678 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2679 rc = cl_io_loop(env, io);
2683 cl_io_fini(env, io);
2684 cl_env_put(env, &refcheck);
/*
 * Toggle per-file-descriptor "no lock expansion" behavior.
 * LF_UNSET in @flags clears the bit; otherwise it is set.
 */
2688 static int ll_lock_noexpand(struct file *file, int flags)
2690 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2692 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: copy the fsxattr from userspace,
 * fill in the Lustre project ID, and copy it back.  Returns 0 or
 * -EFAULT (the error-return lines are elided from this listing).
 */
2697 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2700 struct fsxattr fsxattr;
2702 if (copy_from_user(&fsxattr,
2703 (const struct fsxattr __user *)arg,
2707 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2708 if (copy_to_user((struct fsxattr __user *)arg,
2709 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set the project ID on an inode via
 * an MDS setattr RPC.  Requires CAP_SYS_ADMIN.  Returns 0 or a
 * negative errno; op_data is always released via the out_fsxattr1 path.
 */
2715 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2719 struct md_op_data *op_data;
2720 struct ptlrpc_request *req = NULL;
2722 struct fsxattr fsxattr;
2724 /* only root could change project ID */
2725 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2728 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2729 LUSTRE_OPC_ANY, NULL);
2730 if (IS_ERR(op_data))
2731 RETURN(PTR_ERR(op_data));
2733 if (copy_from_user(&fsxattr,
2734 (const struct fsxattr __user *)arg,
2736 GOTO(out_fsxattr1, rc = -EFAULT);
/* Only the project ID attribute is forwarded to the MDS. */
2738 op_data->op_projid = fsxattr.fsx_projid;
2739 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2740 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2742 ptlrpc_req_finished(req);
2745 ll_finish_md_op_data(op_data);
/*
 * Main ioctl dispatcher for regular files on a Lustre client.
 *
 * Handles Lustre-specific commands (striping, layout swap, group locks,
 * HSM, leases, ladvise, project xattrs, ...) and forwards anything it
 * does not recognize to the OBD layer.  Returns 0 or a negative errno.
 * NOTE(review): this listing is elided — switch heads, error returns,
 * and some allocations/frees are on hidden lines.
 */
2752 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2754 struct inode *inode = file_inode(file);
2755 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2759 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2760 PFID(ll_inode2fid(inode)), inode, cmd);
2761 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2763 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2764 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2768 case LL_IOC_GETFLAGS:
2769 /* Get the current value of the file flags */
2770 return put_user(fd->fd_flags, (int __user *)arg);
2771 case LL_IOC_SETFLAGS:
2772 case LL_IOC_CLRFLAGS:
2773 /* Set or clear specific file flags */
2774 /* XXX This probably needs checks to ensure the flags are
2775 * not abused, and to handle any flag side effects.
2777 if (get_user(flags, (int __user *) arg))
2780 if (cmd == LL_IOC_SETFLAGS) {
/* Ignoring locks is only allowed for O_DIRECT files. */
2781 if ((flags & LL_FILE_IGNORE_LOCK) &&
2782 !(file->f_flags & O_DIRECT)) {
2783 CERROR("%s: unable to disable locking on "
2784 "non-O_DIRECT file\n", current->comm);
2788 fd->fd_flags |= flags;
2790 fd->fd_flags &= ~flags;
2793 case LL_IOC_LOV_SETSTRIPE:
2794 case LL_IOC_LOV_SETSTRIPE_NEW:
2795 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2796 case LL_IOC_LOV_SETEA:
2797 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2798 case LL_IOC_LOV_SWAP_LAYOUTS: {
2800 struct lustre_swap_layouts lsl;
2803 if (copy_from_user(&lsl, (char __user *)arg,
2804 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable to swap layouts. */
2807 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2810 file2 = fget(lsl.sl_fd);
2814 /* O_WRONLY or O_RDWR */
2815 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2816 GOTO(out, rc = -EPERM);
2818 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
2820 struct inode *inode2;
2821 struct ll_inode_info *lli;
2822 struct obd_client_handle *och = NULL;
/* Close-intent swap: consume this fd's lease handle under the
 * och mutex; without a lease the swap is refused (-ENOLCK). */
2824 lli = ll_i2info(inode);
2825 mutex_lock(&lli->lli_och_mutex);
2826 if (fd->fd_lease_och != NULL) {
2827 och = fd->fd_lease_och;
2828 fd->fd_lease_och = NULL;
2830 mutex_unlock(&lli->lli_och_mutex);
2832 GOTO(out, rc = -ENOLCK);
2833 inode2 = file_inode(file2);
2834 rc = ll_swap_layouts_close(och, inode, inode2, intent);
2836 rc = ll_swap_layouts(file, file2, &lsl);
2842 case LL_IOC_LOV_GETSTRIPE:
2843 case LL_IOC_LOV_GETSTRIPE_NEW:
2844 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2845 case FSFILT_IOC_GETFLAGS:
2846 case FSFILT_IOC_SETFLAGS:
2847 RETURN(ll_iocontrol(inode, file, cmd, arg));
2848 case FSFILT_IOC_GETVERSION_OLD:
2849 case FSFILT_IOC_GETVERSION:
2850 RETURN(put_user(inode->i_generation, (int __user *)arg));
2851 case LL_IOC_GROUP_LOCK:
2852 RETURN(ll_get_grouplock(inode, file, arg));
2853 case LL_IOC_GROUP_UNLOCK:
2854 RETURN(ll_put_grouplock(inode, file, arg));
2855 case IOC_OBD_STATFS:
2856 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2858 /* We need to special case any other ioctls we want to handle,
2859 * to send them to the MDS/OST as appropriate and to properly
2860 * network encode the arg field.
2861 case FSFILT_IOC_SETVERSION_OLD:
2862 case FSFILT_IOC_SETVERSION:
2864 case LL_IOC_FLUSHCTX:
2865 RETURN(ll_flush_ctx(inode));
2866 case LL_IOC_PATH2FID: {
2867 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2868 sizeof(struct lu_fid)))
2873 case LL_IOC_GETPARENT:
2874 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2876 case OBD_IOC_FID2PATH:
2877 RETURN(ll_fid2path(inode, (void __user *)arg));
2878 case LL_IOC_DATA_VERSION: {
2879 struct ioc_data_version idv;
2882 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask out everything except the supported flush flags. */
2885 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2886 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2889 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2895 case LL_IOC_GET_MDTIDX: {
2898 mdtidx = ll_get_mdt_idx(inode);
2902 if (put_user((int)mdtidx, (int __user *)arg))
2907 case OBD_IOC_GETDTNAME:
2908 case OBD_IOC_GETMDNAME:
2909 RETURN(ll_get_obd_name(inode, cmd, arg));
2910 case LL_IOC_HSM_STATE_GET: {
2911 struct md_op_data *op_data;
2912 struct hsm_user_state *hus;
2919 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2920 LUSTRE_OPC_ANY, hus);
2921 if (IS_ERR(op_data)) {
2923 RETURN(PTR_ERR(op_data));
2926 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2929 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2932 ll_finish_md_op_data(op_data);
2936 case LL_IOC_HSM_STATE_SET: {
2937 struct hsm_state_set *hss;
2944 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2949 rc = ll_hsm_state_set(inode, hss);
2954 case LL_IOC_HSM_ACTION: {
2955 struct md_op_data *op_data;
2956 struct hsm_current_action *hca;
2963 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2964 LUSTRE_OPC_ANY, hca);
2965 if (IS_ERR(op_data)) {
2967 RETURN(PTR_ERR(op_data));
2970 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2973 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2976 ll_finish_md_op_data(op_data);
2980 case LL_IOC_SET_LEASE: {
2981 struct ll_inode_info *lli = ll_i2info(inode);
2982 struct obd_client_handle *och = NULL;
/* Requested lease mode must match the file's open mode. */
2987 case LL_LEASE_WRLCK:
2988 if (!(file->f_mode & FMODE_WRITE))
2990 fmode = FMODE_WRITE;
2992 case LL_LEASE_RDLCK:
2993 if (!(file->f_mode & FMODE_READ))
2997 case LL_LEASE_UNLCK:
2998 mutex_lock(&lli->lli_och_mutex);
2999 if (fd->fd_lease_och != NULL) {
3000 och = fd->fd_lease_och;
3001 fd->fd_lease_och = NULL;
3003 mutex_unlock(&lli->lli_och_mutex);
3008 fmode = och->och_flags;
3009 rc = ll_lease_close(och, inode, &lease_broken);
3013 rc = ll_lease_och_release(inode, file);
3020 RETURN(ll_lease_type_from_fmode(fmode));
3025 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3027 /* apply for lease */
3028 och = ll_lease_open(inode, file, fmode, 0);
3030 RETURN(PTR_ERR(och));
/* Install the new lease handle unless one already exists. */
3033 mutex_lock(&lli->lli_och_mutex);
3034 if (fd->fd_lease_och == NULL) {
3035 fd->fd_lease_och = och;
3038 mutex_unlock(&lli->lli_och_mutex);
3040 /* impossible now that only excl is supported for now */
3041 ll_lease_close(och, inode, &lease_broken);
3046 case LL_IOC_GET_LEASE: {
3047 struct ll_inode_info *lli = ll_i2info(inode);
3048 struct ldlm_lock *lock = NULL;
3051 mutex_lock(&lli->lli_och_mutex);
3052 if (fd->fd_lease_och != NULL) {
3053 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while the lock is not cancelled. */
3055 lock = ldlm_handle2lock(&och->och_lease_handle);
3057 lock_res_and_lock(lock);
3058 if (!ldlm_is_cancel(lock))
3059 fmode = och->och_flags;
3061 unlock_res_and_lock(lock);
3062 LDLM_LOCK_PUT(lock);
3065 mutex_unlock(&lli->lli_och_mutex);
3067 RETURN(ll_lease_type_from_fmode(fmode));
3069 case LL_IOC_HSM_IMPORT: {
3070 struct hsm_user_import *hui;
3076 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3081 rc = ll_hsm_import(inode, file, hui);
3086 case LL_IOC_FUTIMES_3: {
3087 struct ll_futimes_3 lfu;
3089 if (copy_from_user(&lfu,
3090 (const struct ll_futimes_3 __user *)arg,
3094 RETURN(ll_file_futimes_3(file, &lfu));
3096 case LL_IOC_LADVISE: {
3097 struct llapi_ladvise_hdr *k_ladvise_hdr;
3098 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3101 int alloc_size = sizeof(*k_ladvise_hdr);
/* First copy just the fixed-size header to learn lah_count ... */
3104 u_ladvise_hdr = (void __user *)arg;
3105 OBD_ALLOC_PTR(k_ladvise_hdr);
3106 if (k_ladvise_hdr == NULL)
3109 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3110 GOTO(out_ladvise, rc = -EFAULT);
3112 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3113 k_ladvise_hdr->lah_count < 1)
3114 GOTO(out_ladvise, rc = -EINVAL);
3116 num_advise = k_ladvise_hdr->lah_count;
3117 if (num_advise >= LAH_COUNT_MAX)
3118 GOTO(out_ladvise, rc = -EFBIG);
/* ... then reallocate sized for all entries and copy the whole thing. */
3120 OBD_FREE_PTR(k_ladvise_hdr);
3121 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3122 lah_advise[num_advise]);
3123 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3124 if (k_ladvise_hdr == NULL)
3128 * TODO: submit multiple advices to one server in a single RPC
3130 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3131 GOTO(out_ladvise, rc = -EFAULT);
3133 for (i = 0; i < num_advise; i++) {
3134 struct llapi_lu_ladvise *k_ladvise =
3135 &k_ladvise_hdr->lah_advise[i];
3136 struct llapi_lu_ladvise __user *u_ladvise =
3137 &u_ladvise_hdr->lah_advise[i];
3139 rc = ll_ladvise_sanity(inode, k_ladvise);
3141 GOTO(out_ladvise, rc);
3143 switch (k_ladvise->lla_advice) {
3144 case LU_LADVISE_LOCKNOEXPAND:
3145 rc = ll_lock_noexpand(file,
3146 k_ladvise->lla_peradvice_flags);
3147 GOTO(out_ladvise, rc);
3148 case LU_LADVISE_LOCKAHEAD:
3150 rc = ll_file_lock_ahead(file, k_ladvise);
3153 GOTO(out_ladvise, rc);
/* Lockahead result is written back into the user's entry. */
3156 &u_ladvise->lla_lockahead_result))
3157 GOTO(out_ladvise, rc = -EFAULT);
3160 rc = ll_ladvise(inode, file,
3161 k_ladvise_hdr->lah_flags,
3164 GOTO(out_ladvise, rc);
3171 OBD_FREE(k_ladvise_hdr, alloc_size);
3174 case LL_IOC_FSGETXATTR:
3175 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3176 case LL_IOC_FSSETXATTR:
3177 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3179 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Unrecognized commands are forwarded to the data export (OST). */
3181 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3182 (void __user *)arg));
3186 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Apply a validated seek: reject negative offsets (unless the file
 * allows unsigned offsets) and offsets beyond @maxsize, then update
 * f_pos/f_version only when the position actually changes.
 * (Error/return lines are elided from this listing.)
 */
3187 static inline loff_t
3188 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3190 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3192 if (offset > maxsize)
3195 if (offset != file->f_pos) {
3196 file->f_pos = offset;
3197 file->f_version = 0;
/*
 * Local fallback implementation of generic_file_llseek_size() for
 * kernels lacking it (compiled under #ifndef HAVE_FILE_LLSEEK_SIZE).
 * Handles SEEK_CUR position-query fast path, SEEK_DATA/SEEK_HOLE
 * semantics against @eof, and clamps results to @maxsize via
 * llseek_execute().  NOTE(review): listing elided — the switch head
 * and several branches are on hidden lines.
 */
3203 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3204 loff_t maxsize, loff_t eof)
3206 struct inode *inode = file_inode(file);
3214 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3215 * position-querying operation. Avoid rewriting the "same"
3216 * f_pos value back to the file because a concurrent read(),
3217 * write() or lseek() might have altered it
3222 * f_lock protects against read/modify/write race with other
3223 * SEEK_CURs. Note that parallel writes and reads behave
3227 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3228 inode_unlock(inode);
3232 * In the generic case the entire file is data, so as long as
3233 * offset isn't at the end of the file then the offset is data.
3240 * There is a virtual hole at the end of the file, so as long as
3241 * offset isn't i_size or larger, return i_size.
3249 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA
 * the inode size must first be refreshed from the OSTs via
 * ll_glimpse_size(); the actual seek is then delegated to
 * ll_generic_file_llseek_size() bounded by the filesystem max bytes.
 */
3253 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3255 struct inode *inode = file_inode(file);
3256 loff_t retval, eof = 0;
/* Precompute the target for the trace message only. */
3259 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3260 (origin == SEEK_CUR) ? file->f_pos : 0);
3261 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3262 PFID(ll_inode2fid(inode)), inode, retval, retval,
3264 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3266 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3267 retval = ll_glimpse_size(inode);
3270 eof = i_size_read(inode);
3273 retval = ll_generic_file_llseek_size(file, offset, origin,
3274 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation: report any async writeback error recorded on
 * the inode (and its cl object) back to the application on close(),
 * unless the failure was already reported via fd_write_failed.
 * Returns 0 or -EIO.
 */
3278 static int ll_flush(struct file *file, fl_owner_t id)
3280 struct inode *inode = file_inode(file);
3281 struct ll_inode_info *lli = ll_i2info(inode);
3282 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3285 LASSERT(!S_ISDIR(inode->i_mode));
3287 /* catch async errors that were recorded back when async writeback
3288 * failed for pages in this mapping. */
3289 rc = lli->lli_async_rc;
3290 lli->lli_async_rc = 0;
3291 if (lli->lli_clob != NULL) {
3292 err = lov_read_and_clear_async_rc(lli->lli_clob);
3297 /* The application has been told write failure already.
3298 * Do not report failure again. */
3299 if (fd->fd_write_failed)
3301 return rc ? -EIO : 0;
3305 * Called to make sure a portion of file has been written out.
3306 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3308 * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] on @inode.
 * @mode selects NONE/LOCAL/DISCARD/ALL semantics; anything else is
 * rejected.  On success returns the number of pages written
 * (fi_nr_written), otherwise a negative errno.
 */
3310 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3311 enum cl_fsync_mode mode, int ignore_layout)
3315 struct cl_fsync_io *fio;
3320 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3321 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3324 env = cl_env_get(&refcheck);
3326 RETURN(PTR_ERR(env));
3328 io = vvp_env_thread_io(env);
3329 io->ci_obj = ll_i2info(inode)->lli_clob;
3330 io->ci_ignore_layout = ignore_layout;
3332 /* initialize parameters for sync */
3333 fio = &io->u.ci_fsync;
3334 fio->fi_start = start;
3336 fio->fi_fid = ll_inode2fid(inode);
3337 fio->fi_mode = mode;
3338 fio->fi_nr_written = 0;
3340 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3341 result = cl_io_loop(env, io);
3343 result = io->ci_result;
/* Success: report how many pages were written out. */
3345 result = fio->fi_nr_written;
3346 cl_io_fini(env, io);
3347 cl_env_put(env, &refcheck);
3353 * When dentry is provided (the 'else' case), file_dentry() may be
3354 * null and dentry must be used directly rather than pulled from
3355 * file_dentry() as is done otherwise.
/*
 * fsync() file operation, with three kernel-ABI variants selected at
 * compile time (4-arg range fsync, 2-arg, or legacy 3-arg with an
 * explicit dentry).  Flushes dirty pages, propagates recorded async
 * writeback errors, issues an MDS fsync RPC, and for regular files a
 * CL_FSYNC_ALL range sync; updates fd_write_failed accordingly.
 */
3358 #ifdef HAVE_FILE_FSYNC_4ARGS
3359 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3361 struct dentry *dentry = file_dentry(file);
3363 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3364 int ll_fsync(struct file *file, int datasync)
3366 struct dentry *dentry = file_dentry(file);
3368 loff_t end = LLONG_MAX;
3370 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3373 loff_t end = LLONG_MAX;
3375 struct inode *inode = dentry->d_inode;
3376 struct ll_inode_info *lli = ll_i2info(inode);
3377 struct ptlrpc_request *req;
3381 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3382 PFID(ll_inode2fid(inode)), inode);
3383 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3385 #ifdef HAVE_FILE_FSYNC_4ARGS
3386 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Avoid double-locking if the caller already holds the inode lock. */
3387 lock_inode = !lli->lli_inode_locked;
3391 /* fsync's caller has already called _fdata{sync,write}, we want
3392 * that IO to finish before calling the osc and mdc sync methods */
3393 rc = filemap_fdatawait(inode->i_mapping);
3396 /* catch async errors that were recorded back when async writeback
3397 * failed for pages in this mapping. */
3398 if (!S_ISDIR(inode->i_mode)) {
3399 err = lli->lli_async_rc;
3400 lli->lli_async_rc = 0;
3403 if (lli->lli_clob != NULL) {
3404 err = lov_read_and_clear_async_rc(lli->lli_clob);
3410 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3414 ptlrpc_req_finished(req);
3416 if (S_ISREG(inode->i_mode)) {
3417 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3419 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3420 if (rc == 0 && err < 0)
3423 fd->fd_write_failed = true;
3425 fd->fd_write_failed = false;
3428 #ifdef HAVE_FILE_FSYNC_4ARGS
3430 inode_unlock(inode);
/*
 * flock()/fcntl() byte-range lock handler.  Translates a kernel
 * file_lock (FL_FLOCK whole-file or FL_POSIX range) into an LDLM
 * FLOCK enqueue against the MDS, then mirrors the result into the
 * local VFS lock state; on local failure the remote lock is undone
 * with an LCK_NL (unlock) enqueue.
 * NOTE(review): listing elided — switch heads, some F_* cases and
 * the final RETURN are on hidden lines.
 */
3436 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3438 struct inode *inode = file_inode(file);
3439 struct ll_sb_info *sbi = ll_i2sbi(inode);
3440 struct ldlm_enqueue_info einfo = {
3441 .ei_type = LDLM_FLOCK,
3442 .ei_cb_cp = ldlm_flock_completion_ast,
3443 .ei_cbdata = file_lock,
3445 struct md_op_data *op_data;
3446 struct lustre_handle lockh = { 0 };
3447 union ldlm_policy_data flock = { { 0 } };
3448 int fl_type = file_lock->fl_type;
3454 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3455 PFID(ll_inode2fid(inode)), file_lock);
3457 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3459 if (file_lock->fl_flags & FL_FLOCK) {
3460 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3461 /* flocks are whole-file locks */
3462 flock.l_flock.end = OFFSET_MAX;
3463 /* For flocks owner is determined by the local file desctiptor*/
3464 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3465 } else if (file_lock->fl_flags & FL_POSIX) {
3466 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3467 flock.l_flock.start = file_lock->fl_start;
3468 flock.l_flock.end = file_lock->fl_end;
3472 flock.l_flock.pid = file_lock->fl_pid;
3474 /* Somewhat ugly workaround for svc lockd.
3475 * lockd installs custom fl_lmops->lm_compare_owner that checks
3476 * for the fl_owner to be the same (which it always is on local node
3477 * I guess between lockd processes) and then compares pid.
3478 * As such we assign pid to the owner field to make it all work,
3479 * conflict with normal locks is unlikely since pid space and
3480 * pointer space for current->files are not intersecting */
3481 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3482 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock types onto LDLM modes: read->PR, write->PW,
 * unlock->NL (see comment below). */
3486 einfo.ei_mode = LCK_PR;
3489 /* An unlock request may or may not have any relation to
3490 * existing locks so we may not be able to pass a lock handle
3491 * via a normal ldlm_lock_cancel() request. The request may even
3492 * unlock a byte range in the middle of an existing lock. In
3493 * order to process an unlock request we need all of the same
3494 * information that is given with a normal read or write record
3495 * lock request. To avoid creating another ldlm unlock (cancel)
3496 * message we'll treat a LCK_NL flock request as an unlock. */
3497 einfo.ei_mode = LCK_NL;
3500 einfo.ei_mode = LCK_PW;
3503 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3518 flags = LDLM_FL_BLOCK_NOWAIT;
3524 flags = LDLM_FL_TEST_LOCK;
3527 CERROR("unknown fcntl lock command: %d\n", cmd);
3531 /* Save the old mode so that if the mode in the lock changes we
3532 * can decrement the appropriate reader or writer refcount. */
3533 file_lock->fl_type = einfo.ei_mode;
3535 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3536 LUSTRE_OPC_ANY, NULL);
3537 if (IS_ERR(op_data))
3538 RETURN(PTR_ERR(op_data));
3540 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3541 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3542 flock.l_flock.pid, flags, einfo.ei_mode,
3543 flock.l_flock.start, flock.l_flock.end);
3545 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3548 /* Restore the file lock type if not TEST lock. */
3549 if (!(flags & LDLM_FL_TEST_LOCK))
3550 file_lock->fl_type = fl_type;
/* Mirror the server-side result into the local VFS lock tables. */
3552 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3553 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3554 !(flags & LDLM_FL_TEST_LOCK))
3555 rc2 = locks_lock_file_wait(file, file_lock);
3557 if ((file_lock->fl_flags & FL_FLOCK) &&
3558 (rc == 0 || file_lock->fl_type == F_UNLCK))
3559 rc2 = flock_lock_file_wait(file, file_lock);
3560 if ((file_lock->fl_flags & FL_POSIX) &&
3561 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3562 !(flags & LDLM_FL_TEST_LOCK))
3563 rc2 = posix_lock_file_wait(file, file_lock);
3564 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed after a successful remote enqueue:
 * release the remote lock to stay consistent. */
3566 if (rc2 && file_lock->fl_type != F_UNLCK) {
3567 einfo.ei_mode = LCK_NL;
3568 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3573 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name
 * RPC.  On success stores the FID in *fid and, when @inode is
 * non-NULL, also instantiates the inode from the reply.
 * Returns 0 or a negative errno.
 */
3578 int ll_get_fid_by_name(struct inode *parent, const char *name,
3579 int namelen, struct lu_fid *fid,
3580 struct inode **inode)
3582 struct md_op_data *op_data = NULL;
3583 struct mdt_body *body;
3584 struct ptlrpc_request *req;
3588 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3589 LUSTRE_OPC_ANY, NULL);
3590 if (IS_ERR(op_data))
3591 RETURN(PTR_ERR(op_data));
3593 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3594 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3595 ll_finish_md_op_data(op_data);
3599 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3601 GOTO(out_req, rc = -EFAULT);
3603 *fid = body->mbo_fid1;
3606 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3608 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx.
 *
 * Resolves the child inode (dcache first, then by-name RPC), refuses
 * to migrate the filesystem root, skips the work if the child already
 * lives on the target MDT, takes a write lease (plus data version) for
 * regular files so data can be validated, and performs the migration
 * as a special MDS_RENAME_MIGRATE rename RPC.  Retries on -EAGAIN when
 * the layout changed mid-flight.  Returns 0 or a negative errno.
 */
3612 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3613 const char *name, int namelen)
3615 struct dentry *dchild = NULL;
3616 struct inode *child_inode = NULL;
3617 struct md_op_data *op_data;
3618 struct ptlrpc_request *request = NULL;
3619 struct obd_client_handle *och = NULL;
3621 struct mdt_body *body;
3623 __u64 data_version = 0;
3626 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3627 name, PFID(ll_inode2fid(parent)), mdtidx);
3629 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3630 0, LUSTRE_OPC_ANY, NULL);
3631 if (IS_ERR(op_data))
3632 RETURN(PTR_ERR(op_data));
3634 /* Get child FID first */
3635 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3638 dchild = d_lookup(file_dentry(file), &qstr);
3639 if (dchild != NULL) {
3640 if (dchild->d_inode != NULL)
3641 child_inode = igrab(dchild->d_inode);
/* Not in the dcache: resolve the FID (and inode) via the MDS. */
3645 if (child_inode == NULL) {
3646 rc = ll_get_fid_by_name(parent, name, namelen,
3647 &op_data->op_fid3, &child_inode);
3652 if (child_inode == NULL)
3653 GOTO(out_free, rc = -EINVAL);
3656 * lfs migrate command needs to be blocked on the client
3657 * by checking the migrate FID against the FID of the
3660 if (child_inode == parent->i_sb->s_root->d_inode)
3661 GOTO(out_iput, rc = -EINVAL);
3663 inode_lock(child_inode);
3664 op_data->op_fid3 = *ll_inode2fid(child_inode);
3665 if (!fid_is_sane(&op_data->op_fid3)) {
3666 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3667 ll_get_fsname(parent->i_sb, NULL, 0), name,
3668 PFID(&op_data->op_fid3));
3669 GOTO(out_unlock, rc = -EINVAL);
3672 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3674 GOTO(out_unlock, rc);
/* Already on the requested MDT: nothing to do. */
3677 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3678 PFID(&op_data->op_fid3), mdtidx);
3679 GOTO(out_unlock, rc = 0);
3682 if (S_ISREG(child_inode->i_mode)) {
/* Take a write lease so concurrent writers invalidate the
 * migration, and snapshot the data version for the server. */
3683 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3687 GOTO(out_unlock, rc);
3690 rc = ll_data_version(child_inode, &data_version,
3693 GOTO(out_close, rc);
3695 op_data->op_handle = och->och_fh;
3696 op_data->op_data = och->och_mod;
3697 op_data->op_data_version = data_version;
3698 op_data->op_lease_handle = och->och_lease_handle;
3699 op_data->op_bias |= MDS_RENAME_MIGRATE;
3702 op_data->op_mds = mdtidx;
3703 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a rename of @name onto itself. */
3704 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3705 namelen, name, namelen, &request);
3707 LASSERT(request != NULL);
3708 ll_update_times(request, parent);
3710 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3711 LASSERT(body != NULL);
3713 /* If the server does release layout lock, then we cleanup
3714 * the client och here, otherwise release it in out_close: */
3716 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3717 obd_mod_put(och->och_mod);
3718 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3720 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3726 if (request != NULL) {
3727 ptlrpc_req_finished(request);
3731 /* Try again if the file layout has changed. */
3732 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3736 if (och != NULL) /* close the file */
3737 ll_lease_close(och, child_inode, NULL);
3739 clear_nlink(child_inode);
3741 inode_unlock(child_inode);
3745 ll_finish_md_op_data(op_data);
/* Lock handler for mounts without flock support; body elided from this
 * listing — presumably returns an error to the caller (TODO confirm). */
3750 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3758 * test if some locks matching bits and l_req_mode are acquired
3759 * - bits can be in different locks
3760 * - if found clear the common lock bits in *bits
3761 * - the bits not found, are kept in *bits
3763 * \param bits [IN] searched lock bits [IN]
3764 * \param l_req_mode [IN] searched lock mode
3765 * \retval boolean, true iff all bits are found
/*
 * Test whether MD locks covering the given inodebits are already held.
 * Iterates the individual bits, matching each against granted (or
 * CBPENDING) locks without taking references (LDLM_FL_TEST_LOCK);
 * matched bits are cleared from *bits.  LCK_MINMODE means "any of
 * CR|CW|PR|PW".  Returns true iff all requested bits were found.
 */
3767 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3769 struct lustre_handle lockh;
3770 union ldlm_policy_data policy;
3771 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3772 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3781 fid = &ll_i2info(inode)->lli_fid;
3782 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3783 ldlm_lockname[mode]);
3785 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3786 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3787 policy.l_inodebits.bits = *bits & (1 << i);
3788 if (policy.l_inodebits.bits == 0)
3791 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3792 &policy, mode, &lockh)) {
3793 struct ldlm_lock *lock;
3795 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock actually grants ... */
3798 ~(lock->l_policy_data.l_inodebits.bits);
3799 LDLM_LOCK_PUT(lock);
/* ... or at least the bit we searched for. */
3801 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) an existing granted MD lock
 * covering @bits on @inode.  On success *lockh holds the lock handle;
 * returns the matched mode, or 0 if no lock matched.
 */
3808 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3809 struct lustre_handle *lockh, __u64 flags,
3810 enum ldlm_mode mode)
3812 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3817 fid = &ll_i2info(inode)->lli_fid;
3818 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3820 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3821 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC.
 * -ENOENT from the server means the object was unlinked: treated as
 * success after nlink handling (except for striped directories with a
 * bad stripe, and non-regular/non-directory inodes, which revalidate
 * again — see comments below).  Other errors are logged and returned.
 */
3826 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3828 /* Already unlinked. Just update nlink and return success */
3829 if (rc == -ENOENT) {
3831 /* If it is striped directory, and there is bad stripe
3832 * Let's revalidate the dentry again, instead of returning
3834 if (S_ISDIR(inode->i_mode) &&
3835 ll_i2info(inode)->lli_lsm_md != NULL)
3838 /* This path cannot be hit for regular files unless in
3839 * case of obscure races, so no need to to validate
3841 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3843 } else if (rc != 0) {
/* EACCES/EIDRM are expected in normal operation; log quietly. */
3844 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3845 "%s: revalidate FID "DFID" error: rc = %d\n",
3846 ll_get_fsname(inode->i_sb, NULL, 0),
3847 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh inode metadata from the MDS if the needed ibits locks are
 * not already held.
 *
 * Two paths: servers with OBD_CONNECT_ATTRFID get an intent
 * getattr/lookup by FID (which also revalidates the dentry and may
 * unhash it if the file was unlinked); otherwise a plain md_getattr
 * is issued only when ll_have_md_lock() shows the bits are missing.
 * Returns 0 or a negative errno.
 */
3853 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3855 struct inode *inode = dentry->d_inode;
3856 struct ptlrpc_request *req = NULL;
3857 struct obd_export *exp;
3861 LASSERT(inode != NULL);
3863 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3864 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3866 exp = ll_i2mdexp(inode);
3868 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3869 * But under CMD case, it caused some lock issues, should be fixed
3870 * with new CMD ibits lock. See bug 12718 */
3871 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3872 struct lookup_intent oit = { .it_op = IT_GETATTR };
3873 struct md_op_data *op_data;
3875 if (ibits == MDS_INODELOCK_LOOKUP)
3876 oit.it_op = IT_LOOKUP;
3878 /* Call getattr by fid, so do not provide name at all. */
3879 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3880 dentry->d_inode, NULL, 0, 0,
3881 LUSTRE_OPC_ANY, NULL);
3882 if (IS_ERR(op_data))
3883 RETURN(PTR_ERR(op_data));
3885 rc = md_intent_lock(exp, op_data, &oit, &req,
3886 &ll_md_blocking_ast, 0);
3887 ll_finish_md_op_data(op_data);
3889 rc = ll_inode_revalidate_fini(inode, rc);
3893 rc = ll_revalidate_it_finish(req, &oit, dentry);
3895 ll_intent_release(&oit);
3899 /* Unlinked? Unhash dentry, so it is not picked up later by
3900 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3901 here to preserve get_cwd functionality on 2.6.
3903 if (!dentry->d_inode->i_nlink) {
3904 ll_lock_dcache(inode);
3905 d_lustre_invalidate(dentry, 0);
3906 ll_unlock_dcache(inode);
3909 ll_lookup_finish_locks(&oit, dentry);
3910 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3911 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3912 u64 valid = OBD_MD_FLGETATTR;
3913 struct md_op_data *op_data;
/* Regular files also need the striping EA; size the reply
 * buffer for the default MD size. */
3916 if (S_ISREG(inode->i_mode)) {
3917 rc = ll_get_default_mdsize(sbi, &ealen);
3920 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3923 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3924 0, ealen, LUSTRE_OPC_ANY,
3926 if (IS_ERR(op_data))
3927 RETURN(PTR_ERR(op_data));
3929 op_data->op_valid = valid;
3930 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3931 ll_finish_md_op_data(op_data);
3933 rc = ll_inode_revalidate_fini(inode, rc);
3937 rc = ll_prep_inode(&inode, req, NULL, NULL);
3940 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) into the master inode: nlink, blocks, size and the
 * a/m/ctime cached in ll_inode_info.  Returns 0 or a negative errno.
 */
3944 static int ll_merge_md_attr(struct inode *inode)
3946 struct cl_attr attr = { 0 };
3949 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3950 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3951 &attr, ll_md_blocking_ast);
3955 set_nlink(inode, attr.cat_nlink);
3956 inode->i_blocks = attr.cat_blocks;
3957 i_size_write(inode, attr.cat_size);
3959 ll_i2info(inode)->lli_atime = attr.cat_atime;
3960 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3961 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh metadata via __ll_inode_revalidate(),
 * then for striped directories merge stripe attributes, copy cached
 * timestamps into the VFS inode, and for regular files glimpse the
 * size from the OSTs (skipped while an HSM restore is running, since
 * the MDT already supplied the size and holds the layout lock).
 */
3967 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3969 struct inode *inode = dentry->d_inode;
3973 rc = __ll_inode_revalidate(dentry, ibits);
3977 /* if object isn't regular file, don't validate size */
3978 if (!S_ISREG(inode->i_mode)) {
3979 if (S_ISDIR(inode->i_mode) &&
3980 ll_i2info(inode)->lli_lsm_md != NULL) {
3981 rc = ll_merge_md_attr(inode);
3986 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3987 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3988 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3990 /* In case of restore, the MDT has the right size and has
3991 * already send it back without granting the layout lock,
3992 * inode is up-to-date so glimpse is useless.
3993 * Also to glimpse we need the layout, in case of a running
3994 * restore the MDT holds the layout lock so the glimpse will
3995 * block up to the end of restore (getattr will block)
3997 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3998 rc = ll_glimpse_size(inode);
/*
 * Squash a device number so it survives the 32-bit compat stat path.
 * Truncates major and minor to 8 bits each; see comment below.
 */
4003 static inline dev_t ll_compat_encode_dev(dev_t dev)
4005 /* The compat_sys_*stat*() syscalls will fail unless the
4006 * device majors and minors are both less than 256. Note that
4007 * the value returned here will be passed through
4008 * old_encode_dev() in cp_compat_stat(). And so we are not
4009 * trying to return a valid compat (u16) device number, just
4010 * one that will pass the old_valid_dev() check. */
4012 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * VFS ->getattr(): revalidate attributes from the MDS, then fill @stat
 * from the now up-to-date inode.  The prototype varies with kernel
 * version (HAVE_INODEOPS_ENHANCED_GETATTR).
 */
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
int ll_getattr(const struct path *path, struct kstat *stat,
	       u32 request_mask, unsigned int flags)
	struct dentry *de = path->dentry;
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
	struct inode *inode = de->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);

	/* refresh UPDATE|LOOKUP ibits before reporting attributes */
	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
				      MDS_INODELOCK_LOOKUP);
	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);

	/* fault-injection hook: optional 30s delay for testing */
	OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);

	if (ll_need_32bit_api(sbi)) {
		/* 32-bit API: build ino from the FID and squash device
		 * numbers so old_valid_dev() accepts them */
		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
		stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
		stat->rdev = ll_compat_encode_dev(inode->i_rdev);
		/* native API: report the inode's own numbers */
		stat->ino = inode->i_ino;
		stat->dev = inode->i_sb->s_dev;
		stat->rdev = inode->i_rdev;

	stat->mode = inode->i_mode;
	stat->uid = inode->i_uid;
	stat->gid = inode->i_gid;
	stat->atime = inode->i_atime;
	stat->mtime = inode->i_mtime;
	stat->ctime = inode->i_ctime;
	/* prefer the per-fs tunable block size when configured */
	stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;

	stat->nlink = inode->i_nlink;
	stat->size = i_size_read(inode);
	stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap handler: translate the kernel's fiemap_extent_info into
 * Lustre's struct fiemap, run the mapping via ll_do_fiemap(), and copy
 * the mapped extents back to userspace.
 */
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		     __u64 start, __u64 len)
	struct fiemap *fiemap;
	unsigned int extent_count = fieinfo->fi_extents_max;

	/* single allocation: header plus all requested extent slots */
	num_bytes = sizeof(*fiemap) + (extent_count *
				       sizeof(struct fiemap_extent));
	OBD_ALLOC_LARGE(fiemap, num_bytes);

	fiemap->fm_flags = fieinfo->fi_flags;
	fiemap->fm_extent_count = fieinfo->fi_extents_max;
	fiemap->fm_start = start;
	fiemap->fm_length = len;
	/* copy in the first extent from userspace as request input */
	if (extent_count > 0 &&
	    copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
			   sizeof(struct fiemap_extent)) != 0)
		GOTO(out, rc = -EFAULT);

	rc = ll_do_fiemap(inode, fiemap, num_bytes);

	/* propagate result flags and all mapped extents to the caller */
	fieinfo->fi_flags = fiemap->fm_flags;
	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
	if (extent_count > 0 &&
	    copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
			 fiemap->fm_mapped_extents *
			 sizeof(struct fiemap_extent)) != 0)
		GOTO(out, rc = -EFAULT);

	OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * The reference taken here is dropped by the VFS (see comment below).
 */
struct posix_acl *ll_get_acl(struct inode *inode, int type)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct posix_acl *acl = NULL;

	/* lli_lock guards lli_posix_acl; take a ref while holding it */
	spin_lock(&lli->lli_lock);
	/* VFS' acl_permission_check->check_acl will release the refcount */
	acl = posix_acl_dup(lli->lli_posix_acl);
	spin_unlock(&lli->lli_lock);
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/*
 * VFS ->set_acl(): serialize a POSIX ACL into xattr form, store it via
 * __vfs_setxattr(), and update the kernel's cached ACL accordingly.
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
	const char *name = NULL;

	case ACL_TYPE_ACCESS:
		/* an access ACL may also rewrite the file mode bits */
		rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
		name = XATTR_NAME_POSIX_ACL_ACCESS;

	case ACL_TYPE_DEFAULT:
		/* default ACLs are only valid on directories */
		if (!S_ISDIR(inode->i_mode))
			GOTO(out, rc = acl ? -EACCES : 0);
		name = XATTR_NAME_POSIX_ACL_DEFAULT;

		GOTO(out, rc = -EINVAL);

	/* serialize the ACL to on-wire xattr format */
	size = posix_acl_xattr_size(acl->a_count);
	value = kmalloc(size, GFP_NOFS);
		GOTO(out, rc = -ENOMEM);

	rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);

	/* dentry is only used for *.lov attributes so it's safe to be NULL */
	rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);

	/* keep the VFS ACL cache coherent with what was stored */
		set_cached_acl(inode, type, acl);
		forget_cached_acl(inode, type);
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
#ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL-check callback passed to generic_permission(); the signature
 * varies with kernel version.  Checks @mask against the cached access
 * ACL when CONFIG_FS_POSIX_ACL is enabled.
 */
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
ll_check_acl(struct inode *inode, int mask, unsigned int flags)
ll_check_acl(struct inode *inode, int mask)
# ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;

# ifdef HAVE_GENERIC_PERMISSION_4ARGS
	/* cannot take locks / block during an RCU path walk */
	if (flags & IPERM_FLAG_RCU)

	acl = ll_get_acl(inode, ACL_TYPE_ACCESS);

	rc = posix_acl_permission(inode, acl, mask);
	/* drop the reference taken by ll_get_acl() */
	posix_acl_release(acl);
# else /* !CONFIG_FS_POSIX_ACL */
# endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission(): revalidate the root inode when needed, apply root
 * squash to the current credentials if configured, then run the generic
 * permission check with ll_check_acl as the ACL callback.
 */
#ifdef HAVE_GENERIC_PERMISSION_4ARGS
int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
# ifdef HAVE_INODE_PERMISION_2ARGS
int ll_inode_permission(struct inode *inode, int mask)
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
	struct ll_sb_info *sbi;
	struct root_squash_info *squash;
	struct cred *cred = NULL;
	const struct cred *old_cred = NULL;
	bool squash_id = false;

#ifdef MAY_NOT_BLOCK
	/* cannot issue RPCs from a non-blocking (RCU walk) context */
	if (mask & MAY_NOT_BLOCK)
#elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
	if (flags & IPERM_FLAG_RCU)

	/* as root inode are NOT getting validated in lookup operation,
	 * need to do it before permission check. */
	if (inode == inode->i_sb->s_root->d_inode) {
		rc = __ll_inode_revalidate(inode->i_sb->s_root,
					   MDS_INODELOCK_LOOKUP);

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
	       PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);

	/* squash fsuid/fsgid if needed */
	sbi = ll_i2sbi(inode);
	squash = &sbi->ll_squash;
	if (unlikely(squash->rsi_uid != 0 &&
		     uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
		     !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {

		CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
		       __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
		       squash->rsi_uid, squash->rsi_gid);

		/* update current process's credentials
		 * and FS capability */
		cred = prepare_creds();

		cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
		cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
		/* drop every filesystem-related capability bit from the
		 * squashed credentials */
		for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
			if ((1 << cap) & CFS_CAP_FS_MASK)
				cap_lower(cred->cap_effective, cap);

		old_cred = override_creds(cred);

	ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
	rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
	/* restore current process's credentials and FS capability */
		revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/* default file operations: no ->flock/->lock methods are installed, so
 * lock requests are handled locally by the kernel */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	/* sync wrappers around the *_iter methods for older callers */
	.read = new_sync_read,
	.write = new_sync_write,
	.read_iter = ll_file_read_iter,
	.write_iter = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read = ll_file_read,
	.aio_read = ll_file_aio_read,
	.write = ll_file_write,
	.aio_write = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
/* file operations for -o flock mounts: identical to the default table
 * but with ->flock/->lock wired to ll_file_flock for cluster-wide
 * consistent locking */
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read = new_sync_read,
	.write = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter = ll_file_read_iter,
	.write_iter = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read = ll_file_read,
	.aio_read = ll_file_aio_read,
	.write = ll_file_write,
	.aio_write = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	/* both BSD flock and POSIX locks go through ll_file_flock */
	.flock = ll_file_flock,
	.lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read = new_sync_read,
	.write = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter = ll_file_read_iter,
	.write_iter = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read = ll_file_read,
	.aio_read = ll_file_aio_read,
	.write = ll_file_write,
	.aio_write = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	/* reject all lock requests via ll_file_noflock */
	.flock = ll_file_noflock,
	.lock = ll_file_noflock
/* inode operations for regular files */
struct inode_operations ll_file_inode_operations = {
	.setattr = ll_setattr,
	.getattr = ll_getattr,
	.permission = ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	/* per-method xattr ops on kernels that still expose them */
	.setxattr = ll_setxattr,
	.getxattr = ll_getxattr,
	.removexattr = ll_removexattr,
	.listxattr = ll_listxattr,
	.fiemap = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl = ll_get_acl,
#ifdef HAVE_IOP_SET_ACL
	.set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET, the layout lock is only allowed
 * to match after the new layout is applied, and the cached layout
 * generation is refreshed from the object.
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;

	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {

		/* a SET configuration must carry a granted layout lock */
		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* it can only be allowed to match after layout is
		 * applied to inode otherwise false layout would be
		 * seen. Applying layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		/* read back the generation now attached to the object */
		rc = cl_object_layout_get(env, obj, &cl);
			DFID": layout version change: %u -> %u\n",
			PFID(&lli->lli_fid), ll_layout_version_get(lli),
		ll_layout_version_set(lli, cl.cl_layout_gen);

	cl_env_put(env, &refcheck);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	struct mdt_body *body;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* the lock already carries a layout LVB: nothing to fetch */
	if (lock->l_lvb_data != NULL)

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	rc = ll_get_default_mdsize(sbi, &lmmsize);

	/* fetch the LOV EA (the layout) from the MDT by FID */
	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
			 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		GOTO(out, rc = -EPROTO);

	lmmsize = body->mbo_eadatasize;
	if (lmmsize == 0) /* empty layout */

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
		GOTO(out, rc = -EFAULT);

	/* copy the layout out of the reply buffer; the copy outlives
	 * the request and becomes the lock's LVB */
	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		/* first installer wins: attach our copy to the lock */
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

	/* lost the race: someone else installed an LVB meanwhile */
		OBD_FREE_LARGE(lvbdata, lmmsize);

	ptlrpc_req_finished(req);
/*
 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns (decref'd near the end regardless of rc).
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multi processes may configure the file on the same time. */
	rc = ll_layout_fetch(inode, lock);

	/* for layout lock, lmm is stored in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

	/* release the lock before (possibly) waiting for layout users */
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		/* OBJECT_CONF_WAIT: ask the cl stack to wait out users of
		 * the old layout (follows the -EBUSY from CONF_SET above) */
		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
/*
 * Issue layout intent RPC to MDS.
 * \param inode  [in] file inode
 * \param intent [in] layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	/* ship the layout intent as the op_data payload */
	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	/* write and truncate intents need a write-mode layout lock */
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	/* only the lock in "it" is needed; drop the reply itself */
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set lock data in case this is a new lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);
/*
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations that depend on layout should be
 * redone in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	enum ldlm_mode mode;

	*gen = ll_layout_version_get(lli);
	/* fast path: layout locks disabled, or a version already cached */
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	/* mostly layout lock is caching on the local side, so try to
	 * match it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		rc = ll_layout_lock_set(&lockh, mode, inode);

	/* no cached lock: enqueue one via a layout intent to the MDT */
	rc = ll_layout_intent(inode, &intent);

	*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);
/*
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 *
 * \param[in] inode	file inode.
 * \param[in] start	start offset of file in bytes where an IO is about to
 *			write.
 * \param[in] end	exclusive end offset in bytes of the write range.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_WRITE,

	rc = ll_layout_intent(inode, &intent);
 * This function sends a restore request to the MDT
4722 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4724 struct hsm_user_request *hur;
4728 len = sizeof(struct hsm_user_request) +
4729 sizeof(struct hsm_user_item);
4730 OBD_ALLOC(hur, len);
4734 hur->hur_request.hr_action = HUA_RESTORE;
4735 hur->hur_request.hr_archive_id = 0;
4736 hur->hur_request.hr_flags = 0;
4737 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4738 sizeof(hur->hur_user_item[0].hui_fid));
4739 hur->hur_user_item[0].hui_extent.offset = offset;
4740 hur->hur_user_item[0].hui_extent.length = length;
4741 hur->hur_request.hr_itemcount = 1;
4742 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,