4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open ll_file_data from the ll_file_data_slab cache
 * (GFP_NOFS to avoid recursing into the filesystem under memory pressure).
 * NOTE(review): this excerpt elides lines — the allocation-failure check
 * and the return statement are not visible here; verify against full file.
 */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache it was allocated from. */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82 * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the in-core inode attributes into the close request. */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark which attributes above are valid so the MDT applies them. */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* The MDS open handle being closed. */
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * NOTE(review): this excerpt elides lines — the switch statement head,
 * several braces/returns and the ENTRY/EXIT macros are not visible here.
 */
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* Sanity: cannot send the close without a live MDC connection. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* Bias-specific payload: layout swap carries the peer FID ... */
147 case MDS_CLOSE_LAYOUT_SWAP:
148 LASSERT(data != NULL);
149 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
150 op_data->op_data_version = 0;
151 op_data->op_lease_handle = och->och_lease_handle;
152 op_data->op_fid2 = *ll_inode2fid(data);
/* ... while HSM release carries the data version to be archived. */
155 case MDS_HSM_RELEASE:
156 LASSERT(data != NULL);
157 op_data->op_bias |= MDS_HSM_RELEASE;
158 op_data->op_data_version = *(__u64 *)data;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
164 LASSERT(data == NULL);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the caller is interrupted; don't log it. */
169 if (rc != 0 && rc != -EINTR)
170 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
171 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * intent (HSM release / layout swap) and not just a plain close. */
174 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
175 struct mdt_body *body;
177 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
178 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
182 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
186 md_clear_open_replay_data(md_exp, och);
187 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
190 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the MDS open handle for the given open mode (read/write/exec)
 * once the last user of that handle is gone.  Selects the per-mode
 * handle/usecount pair on the inode, and only sends the close RPC when
 * the usecount has dropped to zero.
 * NOTE(review): lines are elided in this excerpt (else-branch braces,
 * handle detach, ENTRY/RETURN) — consult the full file.
 */
194 int ll_md_real_close(struct inode *inode, fmode_t fmode)
196 struct ll_inode_info *lli = ll_i2info(inode);
197 struct obd_client_handle **och_p;
198 struct obd_client_handle *och;
/* Pick the open handle slot matching the open mode. */
203 if (fmode & FMODE_WRITE) {
204 och_p = &lli->lli_mds_write_och;
205 och_usecount = &lli->lli_open_fd_write_count;
206 } else if (fmode & FMODE_EXEC) {
207 och_p = &lli->lli_mds_exec_och;
208 och_usecount = &lli->lli_open_fd_exec_count;
210 LASSERT(fmode & FMODE_READ);
211 och_p = &lli->lli_mds_read_och;
212 och_usecount = &lli->lli_open_fd_read_count;
215 mutex_lock(&lli->lli_och_mutex);
216 if (*och_usecount > 0) {
217 /* There are still users of this handle, so skip
219 mutex_unlock(&lli->lli_och_mutex);
225 mutex_unlock(&lli->lli_och_mutex);
228 /* There might be a race and this handle may already
230 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and lease if held, release
 * the fd's private open handle, decrement the per-mode open count, and
 * only talk to the MDS if we no longer hold a matching OPEN lock.
 * NOTE(review): some lines are elided in this excerpt.
 */
236 static int ll_md_close(struct inode *inode, struct file *file)
238 union ldlm_policy_data policy = {
239 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: probe only, do not take a reference on the match. */
241 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
242 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
243 struct ll_inode_info *lli = ll_i2info(inode);
244 struct lustre_handle lockh;
245 enum ldlm_mode lockmode;
249 /* clear group lock, if present */
250 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
251 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
253 if (fd->fd_lease_och != NULL) {
256 /* Usually the lease is not released when the
257 * application crashed, we need to release here. */
258 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
259 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
260 PFID(&lli->lli_fid), rc, lease_broken);
262 fd->fd_lease_och = NULL;
/* Close the openhandle that lease acquisition stashed on this fd. */
265 if (fd->fd_och != NULL) {
266 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
271 /* Let's see if we have good enough OPEN lock on the file and if
272 we can skip talking to MDS */
273 mutex_lock(&lli->lli_och_mutex);
274 if (fd->fd_omode & FMODE_WRITE) {
276 LASSERT(lli->lli_open_fd_write_count);
277 lli->lli_open_fd_write_count--;
278 } else if (fd->fd_omode & FMODE_EXEC) {
280 LASSERT(lli->lli_open_fd_exec_count);
281 lli->lli_open_fd_exec_count--;
284 LASSERT(lli->lli_open_fd_read_count);
285 lli->lli_open_fd_read_count--;
287 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close with the MDS. */
289 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
290 LDLM_IBITS, &policy, lockmode, &lockh))
291 rc = ll_md_real_close(inode, fd->fd_omode);
/* Detach and free the private data; file is fully closed now. */
294 LUSTRE_FPRIVATE(file) = NULL;
295 ll_file_data_put(fd);
300 /* While this returns an error code, fput() the caller does not, so we need
301 * to make every effort to clean up all of our state here. Also, applications
302 * rarely check close errors and even if an error is returned they will not
303 * re-try the close call.
305 int ll_file_release(struct inode *inode, struct file *file)
307 struct ll_file_data *fd;
308 struct ll_sb_info *sbi = ll_i2sbi(inode);
309 struct ll_inode_info *lli = ll_i2info(inode);
313 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
314 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
316 if (inode->i_sb->s_root != file_dentry(file))
317 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
318 fd = LUSTRE_FPRIVATE(file);
321 /* The last ref on @file, maybe not the owner pid of statahead,
322 * because parent and child process can share the same file handle. */
323 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
324 ll_deauthorize_statahead(inode, fd);
/* Root dentry: no MDS close needed, just free the private data. */
326 if (inode->i_sb->s_root == file_dentry(file)) {
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
/* Regular files: surface any deferred async write errors at close. */
332 if (!S_ISDIR(inode->i_mode)) {
333 if (lli->lli_clob != NULL)
334 lov_read_and_clear_async_rc(lli->lli_clob);
335 lli->lli_async_rc = 0;
338 rc = ll_md_close(inode, file);
/* Fault-injection hook: dump the debug log on demand. */
340 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
341 libcfs_debug_dumplog();
/*
 * Send an intent OPEN to the MDS for @de, optionally packing the striping
 * data @lmm/@lmmsize, and install the returned inode/lock state.
 * Caller must have set MDS_OPEN_BY_FID in @itp->it_flags (asserted below).
 * NOTE(review): lines are elided in this excerpt (IS_ERR checks, labels).
 */
346 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
347 struct lookup_intent *itp)
349 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
350 struct dentry *parent = de->d_parent;
351 const char *name = NULL;
353 struct md_op_data *op_data;
354 struct ptlrpc_request *req = NULL;
358 LASSERT(parent != NULL);
359 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
361 /* if server supports open-by-fid, or file name is invalid, don't pack
362 * name in open request */
363 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
364 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
365 name = de->d_name.name;
366 len = de->d_name.len;
369 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
370 name, len, 0, LUSTRE_OPC_ANY, NULL);
372 RETURN(PTR_ERR(op_data));
373 op_data->op_data = lmm;
374 op_data->op_data_size = lmmsize;
376 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
377 &ll_md_blocking_ast, 0);
378 ll_finish_md_op_data(op_data);
380 /* reason to keep our own exit path - don't flood the log
381 * with messages with -ESTALE errors.
/* If the open half of the intent failed, drop any handle we got. */
383 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
384 it_open_error(DISP_OPEN_OPEN, itp))
386 ll_release_openhandle(de, itp);
390 if (it_disposition(itp, DISP_LOOKUP_NEG))
391 GOTO(out, rc = -ENOENT);
393 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
394 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
395 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Instantiate/refresh the inode from the server reply and attach the
 * returned lock data if a lock was granted. */
399 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
400 if (!rc && itp->it_lock_mode)
401 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
404 ptlrpc_req_finished(req);
405 ll_intent_drop_lock(itp);
407 /* We did open by fid, but by the time we got to the server,
408 * the object disappeared. If this is a create, we cannot really
409 * tell the userspace that the file it was trying to create
410 * does not exist. Instead let's return -ESTALE, and the VFS will
411 * retry the create with LOOKUP_REVAL that we are going to catch
412 * in ll_revalidate_dentry() and use lookup then.
414 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Fill an obd_client_handle from the MDT reply body carried by @it:
 * open handle, FID, lease lock cookie, magic and open flags.  Finally
 * registers the handle for open replay.  Returns md_set_open_replay_data()
 * result.
 */
420 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
421 struct obd_client_handle *och)
423 struct mdt_body *body;
425 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
426 och->och_fh = body->mbo_handle;
427 och->och_fid = body->mbo_fid1;
428 och->och_lease_handle.cookie = it->it_lock_handle;
429 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430 och->och_flags = it->it_flags;
432 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: fill @och from the intent
 * (when provided), attach @fd as the file's private data, and initialize
 * the per-fd readahead, open mode, lock and cl-context list.
 * NOTE(review): lines are elided in this excerpt (e.g. the och != NULL
 * guard around ll_och_fill and the RETURN).
 */
435 static int ll_local_open(struct file *file, struct lookup_intent *it,
436 struct ll_file_data *fd, struct obd_client_handle *och)
438 struct inode *inode = file_inode(file);
/* ll_file_open() cleared private_data; assert no fd attached yet. */
441 LASSERT(!LUSTRE_FPRIVATE(file));
448 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
453 LUSTRE_FPRIVATE(file) = fd;
454 ll_readahead_init(inode, &fd->fd_ras);
455 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
457 /* ll_cl_context initialize */
458 rwlock_init(&fd->fd_lock);
459 INIT_LIST_HEAD(&fd->fd_lccs);
464 /* Open a file, and (for the very first open) create objects on the OSTs at
465 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
466 * creation or open until ll_lov_setstripe() ioctl is called.
468 * If we already have the stripe MD locally then we don't request it in
469 * md_open(), by passing a lmm_size = 0.
471 * It is up to the application to ensure no other processes open this file
472 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
473 * used. We might be able to avoid races of that sort by getting lli_open_sem
474 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
475 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * NOTE(review): many lines are elided in this excerpt (restart label,
 * NULL checks, else branches, out labels); read alongside the full file.
 */
477 int ll_file_open(struct inode *inode, struct file *file)
479 struct ll_inode_info *lli = ll_i2info(inode);
480 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
481 .it_flags = file->f_flags };
482 struct obd_client_handle **och_p = NULL;
483 __u64 *och_usecount = NULL;
484 struct ll_file_data *fd;
488 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
489 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* Intent (if any) was stashed in private_data by the lookup path. */
491 it = file->private_data; /* XXX: compat macro */
492 file->private_data = NULL; /* prevent ll_local_open assertion */
494 fd = ll_file_data_get();
496 GOTO(out_openerr, rc = -ENOMEM);
499 if (S_ISDIR(inode->i_mode))
500 ll_authorize_statahead(inode, fd);
/* Opening the fs root needs no MDS open; attach fd and we're done. */
502 if (inode->i_sb->s_root == file_dentry(file)) {
503 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent. */
507 if (!it || !it->it_disposition) {
508 /* Convert f_flags into access mode. We cannot use file->f_mode,
509 * because everything but O_ACCMODE mask was stripped from
511 if ((oit.it_flags + 1) & O_ACCMODE)
513 if (file->f_flags & O_TRUNC)
514 oit.it_flags |= FMODE_WRITE;
516 /* kernel only call f_op->open in dentry_open. filp_open calls
517 * dentry_open after call to open_namei that checks permissions.
518 * Only nfsd_open call dentry_open directly without checking
519 * permissions and because of that this code below is safe. */
520 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
521 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
523 /* We do not want O_EXCL here, presumably we opened the file
524 * already? XXX - NFS implications? */
525 oit.it_flags &= ~O_EXCL;
527 /* bug20584, if "it_flags" contains O_CREAT, the file will be
528 * created if necessary, then "IT_CREAT" should be set to keep
529 * consistent with it */
530 if (oit.it_flags & O_CREAT)
531 oit.it_op |= IT_CREAT;
537 /* Let's see if we have file open on MDS already. */
538 if (it->it_flags & FMODE_WRITE) {
539 och_p = &lli->lli_mds_write_och;
540 och_usecount = &lli->lli_open_fd_write_count;
541 } else if (it->it_flags & FMODE_EXEC) {
542 och_p = &lli->lli_mds_exec_och;
543 och_usecount = &lli->lli_open_fd_exec_count;
545 och_p = &lli->lli_mds_read_och;
546 och_usecount = &lli->lli_open_fd_read_count;
549 mutex_lock(&lli->lli_och_mutex);
550 if (*och_p) { /* Open handle is present */
551 if (it_disposition(it, DISP_OPEN_OPEN)) {
552 /* Well, there's extra open request that we do not need,
553 let's close it somehow. This will decref request. */
554 rc = it_open_error(DISP_OPEN_OPEN, it);
556 mutex_unlock(&lli->lli_och_mutex);
557 GOTO(out_openerr, rc);
560 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing handle: pass och == NULL to ll_local_open. */
564 rc = ll_local_open(file, it, fd, NULL);
567 mutex_unlock(&lli->lli_och_mutex);
568 GOTO(out_openerr, rc);
571 LASSERT(*och_usecount == 0);
572 if (!it->it_disposition) {
573 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
574 /* We cannot just request lock handle now, new ELC code
575 means that one of other OPEN locks for this file
576 could be cancelled, and since blocking ast handler
577 would attempt to grab och_mutex as well, that would
578 result in a deadlock */
579 mutex_unlock(&lli->lli_och_mutex);
581 * Normally called under two situations:
583 * 2. A race/condition on MDS resulting in no open
584 * handle to be returned from LOOKUP|OPEN request,
585 * for example if the target entry was a symlink.
587 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
588 * marked by a bit set in ll_iget_for_nfs. Clear the
589 * bit so that it's not confusing later callers.
591 * NB; when ldd is NULL, it must have come via normal
592 * lookup path only, since ll_iget_for_nfs always calls
595 if (ldd && ldd->lld_nfs_dentry) {
596 ldd->lld_nfs_dentry = 0;
597 it->it_flags |= MDS_OPEN_LOCK;
601 * Always specify MDS_OPEN_BY_FID because we don't want
602 * to get file with different fid.
604 it->it_flags |= MDS_OPEN_BY_FID;
605 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
608 GOTO(out_openerr, rc);
/* First opener in this mode: allocate the shared open handle. */
612 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
614 GOTO(out_och_free, rc = -ENOMEM);
618 /* md_intent_lock() didn't get a request ref if there was an
619 * open error, so don't do cleanup on the request here
621 /* XXX (green): Should not we bail out on any error here, not
622 * just open error? */
623 rc = it_open_error(DISP_OPEN_OPEN, it);
625 GOTO(out_och_free, rc);
627 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
628 "inode %p: disposition %x, status %d\n", inode,
629 it_disposition(it, ~0), it->it_status);
631 rc = ll_local_open(file, it, fd, *och_p);
633 GOTO(out_och_free, rc);
635 mutex_unlock(&lli->lli_och_mutex);
638 /* Must do this outside lli_och_mutex lock to prevent deadlock where
639 different kind of OPEN lock for this same inode gets cancelled
640 by ldlm_cancel_lru */
641 if (!S_ISREG(inode->i_mode))
642 GOTO(out_och_free, rc);
644 cl_lov_delay_create_clear(&file->f_flags);
645 GOTO(out_och_free, rc);
/* Error path: free a handle we allocated but never installed. */
649 if (och_p && *och_p) {
650 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
651 *och_p = NULL; /* OBD_FREE writes some magic there */
654 mutex_unlock(&lli->lli_och_mutex);
657 if (lli->lli_opendir_key == fd)
658 ll_deauthorize_statahead(inode, fd);
660 ll_file_data_put(fd);
662 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference held by the open intent. */
665 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
666 ptlrpc_req_finished(it->it_request);
667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on conflict (LDLM_CB_BLOCKING) cancel the
 * lease lock asynchronously; the CANCELING phase is visible below but its
 * body is elided in this excerpt.
 */
673 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *desc, void *data, int flag)
677 struct lustre_handle lockh;
681 case LDLM_CB_BLOCKING:
682 ldlm_lock2handle(lock, &lockh);
/* LCF_ASYNC: do not block this AST thread waiting for the cancel. */
683 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
685 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
689 case LDLM_CB_CANCELING:
697 * When setting a lease on a file, we take ownership of the lli_mds_*_och
698 * and save it as fd->fd_och so as to force client to reopen the file even
699 * if it has an open lock in cache already.
/*
 * NOTE(review): lines are elided here (the transfer of *och_p into
 * fd->fd_och, the else brace, RETURN) — verify against the full file.
 */
701 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
702 struct lustre_handle *old_handle)
704 struct ll_inode_info *lli = ll_i2info(inode);
705 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
706 struct obd_client_handle **och_p;
711 /* Get the openhandle of the file */
712 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd at a time. */
713 if (fd->fd_lease_och != NULL)
714 GOTO(out_unlock, rc = -EBUSY);
716 if (fd->fd_och == NULL) {
717 if (file->f_mode & FMODE_WRITE) {
718 LASSERT(lli->lli_mds_write_och != NULL);
719 och_p = &lli->lli_mds_write_och;
720 och_usecount = &lli->lli_open_fd_write_count;
722 LASSERT(lli->lli_mds_read_och != NULL);
723 och_p = &lli->lli_mds_read_och;
724 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take sole ownership while other openers share the handle. */
727 if (*och_usecount > 1)
728 GOTO(out_unlock, rc = -EBUSY);
/* Report the old handle so the MDT can match owners on re-open. */
735 *old_handle = fd->fd_och->och_fh;
739 mutex_unlock(&lli->lli_och_mutex);
744 * Release ownership on lli_mds_*_och when putting back a file lease.
/*
 * NOTE(review): lines are elided in this excerpt (the branch restoring
 * fd->fd_och into *och_p, usecount bump, RETURN).
 */
746 static int ll_lease_och_release(struct inode *inode, struct file *file)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
750 struct obd_client_handle **och_p;
751 struct obd_client_handle *old_och = NULL;
756 mutex_lock(&lli->lli_och_mutex);
/* Select the per-mode shared handle slot this fd's lease came from. */
757 if (file->f_mode & FMODE_WRITE) {
758 och_p = &lli->lli_mds_write_och;
759 och_usecount = &lli->lli_open_fd_write_count;
761 och_p = &lli->lli_mds_read_och;
762 och_usecount = &lli->lli_open_fd_read_count;
765 /* The file may have been open by another process (broken lease) so
766 * *och_p is not NULL. In this case we should simply increase usecount
769 if (*och_p != NULL) {
770 old_och = fd->fd_och;
777 mutex_unlock(&lli->lli_och_mutex);
/* Close the now-redundant handle outside the mutex. */
780 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
786 * Acquire a lease and open the file.
/*
 * Returns the obd_client_handle holding the lease on success, or an
 * ERR_PTR on failure.  NOTE(review): lines are elided in this excerpt
 * (och allocation, IS_ERR/rc checks, labels, RETURN statements).
 */
788 static struct obd_client_handle *
789 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
792 struct lookup_intent it = { .it_op = IT_OPEN };
793 struct ll_sb_info *sbi = ll_i2sbi(inode);
794 struct md_op_data *op_data;
795 struct ptlrpc_request *req = NULL;
796 struct lustre_handle old_handle = { 0 };
797 struct obd_client_handle *och = NULL;
/* Exactly one of read/write lease mode is accepted. */
802 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
803 RETURN(ERR_PTR(-EINVAL));
/* Lease mode must be a subset of the file's open mode; exec excluded. */
806 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
807 RETURN(ERR_PTR(-EPERM));
809 rc = ll_lease_och_acquire(inode, file, &old_handle);
816 RETURN(ERR_PTR(-ENOMEM));
818 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
819 LUSTRE_OPC_ANY, NULL);
821 GOTO(out, rc = PTR_ERR(op_data));
823 /* To tell the MDT this openhandle is from the same owner */
824 op_data->op_handle = old_handle;
826 it.it_flags = fmode | open_flags;
827 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
828 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
829 &ll_md_blocking_lease_ast,
830 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
831 * it can be cancelled which may mislead applications that the lease is
833 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
834 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
835 * doesn't deal with openhandle, so normal openhandle will be leaked. */
836 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
837 ll_finish_md_op_data(op_data);
838 ptlrpc_req_finished(req);
840 GOTO(out_release_it, rc);
842 if (it_disposition(&it, DISP_LOOKUP_NEG))
843 GOTO(out_release_it, rc = -ENOENT);
845 rc = it_open_error(DISP_OPEN_OPEN, &it);
847 GOTO(out_release_it, rc);
849 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
850 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server that doesn't understand leases just does a plain open. */
852 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
853 GOTO(out_close, rc = -EOPNOTSUPP);
855 /* already get lease, handle lease lock */
856 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
857 if (it.it_lock_mode == 0 ||
858 it.it_lock_bits != MDS_INODELOCK_OPEN) {
859 /* open lock must return for lease */
860 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
861 PFID(ll_inode2fid(inode)), it.it_lock_mode,
863 GOTO(out_close, rc = -EPROTO);
866 ll_intent_release(&it);
870 /* Cancel open lock */
871 if (it.it_lock_mode != 0) {
872 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
875 och->och_lease_handle.cookie = 0ULL;
877 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
879 CERROR("%s: error closing file "DFID": %d\n",
880 ll_get_fsname(inode->i_sb, NULL, 0),
881 PFID(&ll_i2info(inode)->lli_fid), rc2);
882 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
884 ll_intent_release(&it);
892 * Check whether a layout swap can be done between two inodes.
894 * \param[in] inode1 First inode to check
895 * \param[in] inode2 Second inode to check
897 * \retval 0 on success, layout swap can be performed between both inodes
898 * \retval negative error code if requirements are not met
/* NOTE(review): the returned error values are on lines elided from this
 * excerpt; only the conditions are visible here. */
900 static int ll_check_swap_layouts_validity(struct inode *inode1,
901 struct inode *inode2)
/* Both must be regular files. */
903 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both inodes. */
906 if (inode_permission(inode1, MAY_WRITE) ||
907 inode_permission(inode2, MAY_WRITE))
/* Both must live on the same filesystem. */
910 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with an MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically
 * swaps the layouts of @inode and @inode2 as part of the close.
 * Validates the pair first and rejects swapping an inode with itself.
 * NOTE(review): lines are elided in this excerpt (out_free_och label,
 * the final argument of the biased close call, RETURN).
 */
916 static int ll_swap_layouts_close(struct obd_client_handle *och,
917 struct inode *inode, struct inode *inode2)
919 const struct lu_fid *fid1 = ll_inode2fid(inode);
920 const struct lu_fid *fid2;
924 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
925 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
927 rc = ll_check_swap_layouts_validity(inode, inode2);
929 GOTO(out_free_och, rc);
931 /* We now know that inode2 is a lustre inode */
932 fid2 = ll_inode2fid(inode2);
/* Equal FIDs means swapping a file with itself — refuse. */
934 rc = lu_fid_cmp(fid1, fid2);
936 GOTO(out_free_och, rc = -EINVAL);
938 /* Close the file and swap layouts between inode & inode2.
939 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
940 * because we still need it to pack l_remote_handle to MDT. */
941 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
944 och = NULL; /* freed in ll_close_inode_openhandle() */
954 * Release lease and close the file.
955 * It will check if the lease has ever broken.
957 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
960 struct ldlm_lock *lock;
/* Default to "broken" when the lock can no longer be found. */
961 bool cancelled = true;
965 lock = ldlm_handle2lock(&och->och_lease_handle);
967 lock_res_and_lock(lock);
968 cancelled = ldlm_is_cancel(lock);
969 unlock_res_and_lock(lock);
973 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
974 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Lease still intact: cancel the lease lock ourselves before close. */
977 ldlm_cli_cancel(&och->och_lease_handle, 0);
979 if (lease_broken != NULL)
980 *lease_broken = cancelled;
982 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided inode attributes with size/blocks/timestamps
 * obtained from the OSTs via the cl_object layer, under the inode size
 * lock.  Each timestamp keeps whichever source is most recent.
 * NOTE(review): some lines are elided in this excerpt (local timestamp
 * declarations, obj == NULL early-out, RETURN).
 */
986 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
988 struct ll_inode_info *lli = ll_i2info(inode);
989 struct cl_object *obj = lli->lli_clob;
990 struct cl_attr *attr = vvp_env_thread_attr(env);
998 ll_inode_size_lock(inode);
1000 /* Merge timestamps the most recently obtained from MDS with
1001 * timestamps obtained from OSTs.
1003 * Do not overwrite atime of inode because it may be refreshed
1004 * by file_accessed() function. If the read was served by cache
1005 * data, there is no RPC to be sent so that atime may not be
1006 * transferred to OSTs at all. MDT only updates atime at close time
1007 * if it's at least 'mdd.*.atime_diff' older.
1008 * All in all, the atime in Lustre does not strictly comply with
1009 * POSIX. Solving this problem needs to send an RPC to MDT for each
1010 * read, this will hurt performance. */
1011 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1012 LTIME_S(inode->i_atime) = lli->lli_atime;
1013 lli->lli_update_atime = 0;
1015 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1016 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Snapshot MDS-side times before comparing with OST-side values. */
1018 atime = LTIME_S(inode->i_atime);
1019 mtime = LTIME_S(inode->i_mtime);
1020 ctime = LTIME_S(inode->i_ctime);
1022 cl_object_attr_lock(obj);
1023 rc = cl_object_attr_get(env, obj, attr);
1024 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects yet) is not an error for the merge. */
1027 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1029 if (atime < attr->cat_atime)
1030 atime = attr->cat_atime;
1032 if (ctime < attr->cat_ctime)
1033 ctime = attr->cat_ctime;
1035 if (mtime < attr->cat_mtime)
1036 mtime = attr->cat_mtime;
1038 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1039 PFID(&lli->lli_fid), attr->cat_size);
/* Size and blocks always come from the OSTs. */
1041 i_size_write(inode, attr->cat_size);
1042 inode->i_blocks = attr->cat_blocks;
1044 LTIME_S(inode->i_atime) = atime;
1045 LTIME_S(inode->i_mtime) = mtime;
1046 LTIME_S(inode->i_ctime) = ctime;
1049 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's own checks: O_NOATIME on the fd, S_NOATIME on
 * the inode, the sb-level IS_NOATIME() test, a noatime/read-only mount,
 * and nodiratime (mount or sb flag) for directories.
 * NOTE(review): the "return true/false" lines are elided in this excerpt.
 */
1054 static bool file_is_noatime(const struct file *file)
1056 const struct vfsmount *mnt = file->f_path.mnt;
1057 const struct inode *inode = file_inode((struct file *)file);
1059 /* Adapted from file_accessed() and touch_atime().*/
1060 if (file->f_flags & O_NOATIME)
1063 if (inode->i_flags & S_NOATIME)
1066 if (IS_NOATIME(inode))
1069 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1072 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1075 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1081 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: reset the iter, set up
 * the embedded kiocb, wire the parallel-task callback, and derive the
 * locking mode and flags (append, sync, noatime, parallel IO) from the
 * file's open flags and superblock settings.
 * NOTE(review): some lines are elided (rw_sync's third condition, braces).
 */
1083 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1085 struct inode *inode = file_inode(file);
1086 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1088 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1089 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1090 io->u.ci_rw.rw_file = file;
1091 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1092 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1093 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1095 if (iot == CIT_WRITE) {
1096 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1097 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1098 file->f_flags & O_DIRECT ||
1101 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Default: let the cl_io layer decide whether to take DLM locks. */
1102 io->ci_lockreq = CILR_MAYBE;
1103 if (ll_file_nolock(file)) {
1104 io->ci_lockreq = CILR_NEVER;
1105 io->ci_no_srvlock = 1;
1106 } else if (file->f_flags & O_APPEND) {
/* Appends must lock to EOF, so locking is mandatory. */
1107 io->ci_lockreq = CILR_MANDATORY;
1109 io->ci_noatime = file_is_noatime(file);
/* Parallel IO is enabled per-sb and never used for appends. */
1110 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1111 io->ci_pio = !io->u.ci_rw.rw_append;
/*
 * Worker body for one parallel-IO task: build a fresh cl_io for the task's
 * sub-range, run the cl_io loop, accumulate bytes transferred into
 * pt->cip_result, and restart on short IO when the cl_io layer asks for it.
 * Returns 0 if any bytes were transferred, else the last rc.
 * NOTE(review): some lines are elided in this excerpt (env/io decls,
 * restart label, fail-injection branch body, closing braces).
 */
1116 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1118 struct cl_io_pt *pt = ptask->pt_cbdata;
1119 struct file *file = pt->cip_file;
1122 loff_t pos = pt->cip_pos;
1127 env = cl_env_get(&refcheck);
1129 RETURN(PTR_ERR(env));
1131 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1132 file_dentry(file)->d_name.name,
1133 pt->cip_iot == CIT_READ ? "read" : "write",
1134 pos, pos + pt->cip_count);
1137 io = vvp_env_thread_io(env);
1138 ll_io_init(io, file, pt->cip_iot);
1139 io->u.ci_rw.rw_iter = pt->cip_iter;
1140 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1141 io->ci_pio = 0; /* It's already in parallel task */
/* Remaining byte count shrinks as cip_result accumulates on restart. */
1143 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1144 pt->cip_count - pt->cip_result);
1146 struct vvp_io *vio = vvp_env_io(env);
1148 vio->vui_io_subtype = IO_NORMAL;
1149 vio->vui_fd = LUSTRE_FPRIVATE(file);
1151 ll_cl_add(file, env, io, LCC_RW);
1152 rc = cl_io_loop(env, io);
1153 ll_cl_remove(file, env);
1155 /* cl_io_rw_init() handled IO */
/* Fault injection: force an artificial IO failure for testing. */
1159 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account for progress and advance the iterator/kiocb position. */
1165 if (io->ci_nob > 0) {
1166 pt->cip_result += io->ci_nob;
1167 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1169 pt->cip_iocb.ki_pos = pos;
1170 #ifdef HAVE_KIOCB_KI_LEFT
1171 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1172 #elif defined(HAVE_KI_NBYTES)
1173 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1177 cl_io_fini(env, io);
/* Short IO with a restart request: loop again on the remainder. */
1179 if ((rc == 0 || rc == -ENODATA) &&
1180 pt->cip_result < pt->cip_count &&
1181 io->ci_need_restart) {
1183 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1184 file_dentry(file)->d_name.name,
1185 pt->cip_iot == CIT_READ ? "read" : "write",
1186 pos, pos + pt->cip_count - pt->cip_result,
1187 pt->cip_result, rc);
1191 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1192 file_dentry(file)->d_name.name,
1193 pt->cip_iot == CIT_READ ? "read" : "write",
1194 pt->cip_result, rc);
1196 cl_env_put(env, &refcheck);
/* Partial success counts as success; caller sees bytes via cip_result. */
1197 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for all buffered/direct/splice reads and writes.
 *
 * Builds a cl_io from @args, takes the per-inode range lock when needed
 * (writes, and O_DIRECT reads — see LU-6227), drives the IO through
 * cl_io_loop(), and restarts the whole IO when the layout changed
 * underneath it (io->ci_need_restart).
 *
 * Returns bytes transferred if any data moved, otherwise the error code.
 */
1201 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1202 struct file *file, enum cl_io_type iot,
1203 loff_t *ppos, size_t count)
1205 struct range_lock range;
1206 struct vvp_io *vio = vvp_env_io(env);
1207 struct inode *inode = file_inode(file);
1208 struct ll_inode_info *lli = ll_i2info(inode);
1209 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1217 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1218 file_dentry(file)->d_name.name,
1219 iot == CIT_READ ? "read" : "write", pos, pos + count);
1222 io = vvp_env_thread_io(env);
1223 ll_io_init(io, file, iot);
/* Snapshot the caller's iterator/iocb; the copies inside the cl_io are
 * what the lower layers advance, the originals are updated on success. */
1224 if (args->via_io_subtype == IO_NORMAL) {
1225 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1226 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1231 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1232 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final extent is unknown. */
1234 if (file->f_flags & O_APPEND)
1235 range_lock_init(&range, 0, LUSTRE_EOF);
1237 range_lock_init(&range, pos, pos + count - 1);
1239 vio->vui_fd = LUSTRE_FPRIVATE(file);
1240 vio->vui_io_subtype = args->via_io_subtype;
1242 switch (vio->vui_io_subtype) {
1244 /* Direct IO reads must also take range lock,
1245 * or multiple reads will try to work on the same pages
1246 * See LU-6227 for details. */
1247 if (((iot == CIT_WRITE) ||
1248 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1249 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1250 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1252 rc = range_lock(&lli->lli_write_tree, &range);
1256 range_locked = true;
1260 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1261 vio->u.splice.vui_flags = args->u.splice.via_flags;
1264 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1268 ll_cl_add(file, env, io, LCC_RW);
/* For parallel (ci_pio) writes the inode lock is taken here once,
 * instead of per-chunk; lli_inode_locked records that we own it. */
1269 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1270 !lli->lli_inode_locked) {
1272 lli->lli_inode_locked = 1;
1274 rc = cl_io_loop(env, io);
1275 if (lli->lli_inode_locked) {
1276 lli->lli_inode_locked = 0;
1277 inode_unlock(inode);
1279 ll_cl_remove(file, env);
1282 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1284 range_unlock(&lli->lli_write_tree, &range);
1287 /* cl_io_rw_init() handled IO */
/* Account partial progress before a possible restart so a restarted
 * IO continues from where the previous pass stopped. */
1291 if (io->ci_nob > 0) {
1292 result += io->ci_nob;
1293 count -= io->ci_nob;
1295 if (args->via_io_subtype == IO_NORMAL) {
1296 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1298 args->u.normal.via_iocb->ki_pos = pos;
1299 #ifdef HAVE_KIOCB_KI_LEFT
1300 args->u.normal.via_iocb->ki_left = count;
1301 #elif defined(HAVE_KI_NBYTES)
1302 args->u.normal.via_iocb->ki_nbytes = count;
1306 pos = io->u.ci_rw.rw_range.cir_pos;
1310 cl_io_fini(env, io);
/* Layout change mid-IO: loop again with the remaining count. */
1312 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1314 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1315 file_dentry(file)->d_name.name,
1316 iot == CIT_READ ? "read" : "write",
1317 pos, pos + count, result, rc);
1321 if (iot == CIT_READ) {
1323 ll_stats_ops_tally(ll_i2sbi(inode),
1324 LPROC_LL_READ_BYTES, result);
1325 } else if (iot == CIT_WRITE) {
1327 ll_stats_ops_tally(ll_i2sbi(inode),
1328 LPROC_LL_WRITE_BYTES, result);
/* fd_write_failed feeds fsync()'s decision to return an error later;
 * -ERESTARTSYS is not a real failure, the syscall will be retried. */
1329 fd->fd_write_failed = false;
1330 } else if (result == 0 && rc == 0) {
1333 fd->fd_write_failed = true;
1335 fd->fd_write_failed = false;
1336 } else if (rc != -ERESTARTSYS) {
1337 fd->fd_write_failed = true;
1341 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1342 file_dentry(file)->d_name.name,
1343 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1347 RETURN(result > 0 ? result : rc);
1351 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1352 * especially for small I/O.
1354 * To serve a read request, CLIO has to create and initialize a cl_io and
1355 * then request DLM lock. This has turned out to have significant overhead
1356 * and affects the performance of small I/O dramatically.
1358 * It's not necessary to create a cl_io for each I/O. Under the help of read
1359 * ahead, most of the pages being read are already in memory cache and we can
1360 * read those pages directly because if the pages exist, the corresponding DLM
1361 * lock must exist so that page content must be valid.
1363 * In fast read implementation, the llite speculatively finds and reads pages
1364 * in memory cache. There are three scenarios for fast read:
1365 * - If the page exists and is uptodate, kernel VM will provide the data and
1366 * CLIO won't be intervened;
1367 * - If the page was brought into memory by read ahead, it will be exported
1368 * and read ahead parameters will be updated;
1369 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1370 * it will go back and invoke normal read, i.e., a cl_io will be created
1371 * and DLM lock will be requested.
1373 * POSIX compliance: posix standard states that read is intended to be atomic.
1374 * Lustre read implementation is in line with Linux kernel read implementation
1375 * and neither of them complies with POSIX standard in this matter. Fast read
1376 * doesn't make the situation worse on single node but it may interleave write
1377 * results from multiple nodes due to short read handling in ll_file_aio_read().
1379 * \param env - lu_env
1380 * \param iocb - kiocb from kernel
1381 * \param iter - user space buffers where the data will be copied
1383 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve the read straight from the page cache via
 * generic_file_read_iter() without creating a cl_io.  Falls back to the
 * normal path (caller handles it) when fast read is disabled, the file
 * is opened O_DIRECT, or the first page is not cached (-ENODATA from
 * ll_readpage()).
 */
1386 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1390 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1393 /* NB: we can't do direct IO for fast read because it will need a lock
1394 * to make IO engine happy. */
1395 if (iocb->ki_filp->f_flags & O_DIRECT)
1398 result = generic_file_read_iter(iocb, iter);
1400 /* If the first page is not in cache, generic_file_aio_read() will be
1401 * returned with -ENODATA.
1402 * See corresponding code in ll_readpage(). */
1403 if (result == -ENODATA)
/* Fast reads still count toward the read-bytes stats. */
1407 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1408 LPROC_LL_READ_BYTES, result);
1414 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the fast (page-cache only) path first;
 * if it read less than requested, finish the remainder through the
 * generic cl_io path.  Results of the two passes are combined.
 */
1416 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1419 struct vvp_io_args *args;
1424 result = ll_do_fast_read(iocb, to);
/* Fast read errored or fully satisfied the request — done. */
1425 if (result < 0 || iov_iter_count(to) == 0)
1428 env = cl_env_get(&refcheck);
1430 return PTR_ERR(env);
1432 args = ll_env_args(env, IO_NORMAL);
1433 args->u.normal.via_iter = to;
1434 args->u.normal.via_iocb = iocb;
1436 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1437 &iocb->ki_pos, iov_iter_count(to));
1440 else if (result == 0)
1443 cl_env_put(env, &refcheck);
1449 * Write to a file (through the page cache).
/*
 * write_iter file operation: hand the whole write to the generic
 * cl_io engine (there is no fast path for writes).
 */
1451 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1453 struct vvp_io_args *args;
1458 env = cl_env_get(&refcheck);
1460 return PTR_ERR(env);
1462 args = ll_env_args(env, IO_NORMAL);
1463 args->u.normal.via_iter = from;
1464 args->u.normal.via_iocb = iocb;
1466 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1467 &iocb->ki_pos, iov_iter_count(from));
1468 cl_env_put(env, &refcheck);
1472 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1474 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, mirroring
 * the kernel's __generic_file_aio_write_nolock().  Segments past an
 * inaccessible one are dropped (count shortened, *nr_segs reduced);
 * a negative or wrapping length yields -EINVAL.
 */
1476 static int ll_file_get_iov_count(const struct iovec *iov,
1477 unsigned long *nr_segs, size_t *count)
1482 for (seg = 0; seg < *nr_segs; seg++) {
1483 const struct iovec *iv = &iov[seg];
1486 * If any segment has a negative length, or the cumulative
1487 * length ever wraps negative then return -EINVAL.
1490 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1492 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1497 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read compatibility wrapper for kernels without read_iter:
 * validates the iovec, builds an iov_iter and delegates to
 * ll_file_read_iter().
 */
1504 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1505 unsigned long nr_segs, loff_t pos)
1512 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature differs across kernels; both forms build
 * a READ-direction iterator over the validated iovec. */
1516 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1517 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1518 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1519 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1520 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1522 result = ll_file_read_iter(iocb, &to);
/*
 * Synchronous read() compatibility wrapper for kernels without
 * read_iter: wraps the user buffer in a single iovec, builds a sync
 * kiocb and delegates to ll_file_aio_read(); *ppos is updated from the
 * kiocb afterwards.
 */
1527 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1530 struct iovec iov = { .iov_base = buf, .iov_len = count };
1535 init_sync_kiocb(&kiocb, file);
1536 kiocb.ki_pos = *ppos;
/* Older kernels track the remaining byte count in the kiocb itself;
 * the member name changed over time, hence the two branches. */
1537 #ifdef HAVE_KIOCB_KI_LEFT
1538 kiocb.ki_left = count;
1539 #elif defined(HAVE_KI_NBYTES)
/* Fixed: was "kiocb.i_nbytes" — struct kiocb has no such member; the
 * HAVE_KI_NBYTES field is ki_nbytes (cf. ll_file_write() below), so
 * this branch could not compile on affected kernels. */
1540 kiocb.ki_nbytes = count;
1543 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1544 *ppos = kiocb.ki_pos;
1550 * Write to a file (through the page cache).
/*
 * aio_write compatibility wrapper for kernels without write_iter:
 * validates the iovec, builds a WRITE-direction iov_iter and delegates
 * to ll_file_write_iter().
 */
1553 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1554 unsigned long nr_segs, loff_t pos)
1556 struct iov_iter from;
1561 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1565 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1566 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1567 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1568 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1569 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1571 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write() compatibility wrapper for kernels without
 * write_iter.  Unlike ll_file_read() this borrows the kiocb embedded
 * in the lu_env thread info rather than using a stack variable.
 */
1576 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1577 size_t count, loff_t *ppos)
1580 struct iovec iov = { .iov_base = (void __user *)buf,
1582 struct kiocb *kiocb;
1587 env = cl_env_get(&refcheck);
1589 RETURN(PTR_ERR(env));
1591 kiocb = &ll_env_info(env)->lti_kiocb;
1592 init_sync_kiocb(kiocb, file);
1593 kiocb->ki_pos = *ppos;
/* Remaining-count field name varies by kernel version. */
1594 #ifdef HAVE_KIOCB_KI_LEFT
1595 kiocb->ki_left = count;
1596 #elif defined(HAVE_KI_NBYTES)
1597 kiocb->ki_nbytes = count;
1600 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1601 *ppos = kiocb->ki_pos;
1603 cl_env_put(env, &refcheck);
1606 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1609 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: run a CIT_READ cl_io with IO_SPLICE
 * subtype so the data lands in the given pipe instead of an iov_iter.
 */
1611 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1612 struct pipe_inode_info *pipe, size_t count,
1616 struct vvp_io_args *args;
1621 env = cl_env_get(&refcheck);
1623 RETURN(PTR_ERR(env));
1625 args = ll_env_args(env, IO_SPLICE);
1626 args->u.splice.via_pipe = pipe;
1627 args->u.splice.via_flags = flags;
1629 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1630 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it by FID with the
 * layout attached to the open intent; the transient open handle is
 * released immediately afterwards.  Runs under the inode size lock.
 */
1634 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1635 __u64 flags, struct lov_user_md *lum, int lum_size)
1637 struct lookup_intent oit = {
1639 .it_flags = flags | MDS_OPEN_BY_FID,
1644 ll_inode_size_lock(inode);
1645 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1647 GOTO(out_unlock, rc);
/* The open was only a vehicle for the layout; close it right away. */
1649 ll_release_openhandle(dentry, &oit);
1652 ll_inode_size_unlock(inode);
1653 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping layout) of @filename, a child of @inode,
 * from the MDS.  On success *lmmp points into the reply buffer of
 * *request, which the caller must release; *lmm_size is set to the EA
 * size.  On big-endian hosts the EA is swabbed to host order before it
 * is handed to userspace.
 */
1658 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1659 struct lov_mds_md **lmmp, int *lmm_size,
1660 struct ptlrpc_request **request)
1662 struct ll_sb_info *sbi = ll_i2sbi(inode);
1663 struct mdt_body *body;
1664 struct lov_mds_md *lmm = NULL;
1665 struct ptlrpc_request *req = NULL;
1666 struct md_op_data *op_data;
1669 rc = ll_get_default_mdsize(sbi, &lmmsize);
1673 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1674 strlen(filename), lmmsize,
1675 LUSTRE_OPC_ANY, NULL);
1676 if (IS_ERR(op_data))
1677 RETURN(PTR_ERR(op_data));
1679 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1680 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1681 ll_finish_md_op_data(op_data);
1683 CDEBUG(D_INFO, "md_getattr_name failed "
1684 "on %s: rc %d\n", filename, rc);
1688 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1689 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1691 lmmsize = body->mbo_eadatasize;
/* No layout EA present (e.g. never striped) — report -ENODATA. */
1693 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1695 GOTO(out, rc = -ENODATA);
1698 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1699 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite (PFL) layouts are understood here. */
1701 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1702 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1703 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1704 GOTO(out, rc = -EPROTO);
1707 * This is coming from the MDS, so is probably in
1708 * little endian. We convert it to host endian before
1709 * passing it to userspace.
1711 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1714 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1715 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1716 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1717 if (le32_to_cpu(lmm->lmm_pattern) &
1718 LOV_PATTERN_F_RELEASED)
1722 /* if function called for directory - we should
1723 * avoid swab not existent lsm objects */
1724 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1725 lustre_swab_lov_user_md_v1(
1726 (struct lov_user_md_v1 *)lmm);
1727 if (S_ISREG(body->mbo_mode))
1728 lustre_swab_lov_user_md_objects(
1729 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1731 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1732 lustre_swab_lov_user_md_v3(
1733 (struct lov_user_md_v3 *)lmm);
1734 if (S_ISREG(body->mbo_mode))
1735 lustre_swab_lov_user_md_objects(
1736 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1738 } else if (lmm->lmm_magic ==
1739 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1740 lustre_swab_lov_comp_md_v1(
1741 (struct lov_comp_md_v1 *)lmm);
1747 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it as the file's layout.  Requires
 * CAP_SYS_ADMIN because MDS_OPEN_HAS_OBJS supplies raw objects.
 */
1752 static int ll_lov_setea(struct inode *inode, struct file *file,
1755 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1756 struct lov_user_md *lump;
1757 int lum_size = sizeof(struct lov_user_md) +
1758 sizeof(struct lov_user_ost_data);
1762 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1765 OBD_ALLOC_LARGE(lump, lum_size);
1769 if (copy_from_user(lump, arg, lum_size))
1770 GOTO(out_lump, rc = -EFAULT);
1772 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Clear the delay-create flag regardless of the outcome. */
1774 cl_lov_delay_create_clear(&file->f_flags);
1777 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information into the userspace buffer @lum
 * (at most @size bytes) via the cl_object layer.
 */
1781 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1788 env = cl_env_get(&refcheck);
1790 RETURN(PTR_ERR(env));
1792 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1793 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's layout request, apply
 * it, refresh the layout generation, and echo the instantiated layout
 * back to the user buffer.
 */
1797 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1800 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1801 struct lov_user_md *klum;
1803 __u64 flags = FMODE_WRITE;
1806 rc = ll_copy_user_md(lum, &klum);
1811 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* NOTE(review): stripe_count is zeroed in the user buffer before the
 * layout is read back — presumably so old tools see a sane value even
 * if the getstripe below fails; confirm against callers. */
1816 rc = put_user(0, &lum->lmm_stripe_count);
1820 rc = ll_layout_refresh(inode, &gen);
1824 rc = ll_file_getstripe(inode, arg, lum_size);
1826 cl_lov_delay_create_clear(&file->f_flags);
1829 OBD_FREE(klum, lum_size);
/*
 * Take a Lustre group lock (gid = @arg) on the file.  Only one group
 * lock per file descriptor is allowed; the grouplock state is stored
 * in the ll_file_data under lli_lock.  For composite (PFL) layouts all
 * OST objects are instantiated first so the lock covers them all.
 */
1834 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1836 struct ll_inode_info *lli = ll_i2info(inode);
1837 struct cl_object *obj = lli->lli_clob;
1838 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1839 struct ll_grouplock grouplock;
1844 CWARN("group id for group lock must not be 0\n");
1848 if (ll_file_nolock(file))
1849 RETURN(-EOPNOTSUPP);
1851 spin_lock(&lli->lli_lock);
1852 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1853 CWARN("group lock already existed with gid %lu\n",
1854 fd->fd_grouplock.lg_gid);
1855 spin_unlock(&lli->lli_lock);
1858 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1859 spin_unlock(&lli->lli_lock);
1862 * XXX: group lock needs to protect all OST objects while PFL
1863 * can add new OST objects during the IO, so we'd instantiate
1864 * all OST objects before getting its group lock.
1869 struct cl_layout cl = {
1870 .cl_is_composite = false,
1873 env = cl_env_get(&refcheck);
1875 RETURN(PTR_ERR(env));
1877 rc = cl_object_layout_get(env, obj, &cl);
/* Composite layout: force full instantiation via a write intent. */
1878 if (!rc && cl.cl_is_composite)
1879 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1881 cl_env_put(env, &refcheck);
1886 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1887 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the spinlock: another thread may have raced us to
 * install a group lock while we were enqueuing ours. */
1891 spin_lock(&lli->lli_lock);
1892 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1893 spin_unlock(&lli->lli_lock);
1894 CERROR("another thread just won the race\n");
1895 cl_put_grouplock(&grouplock);
1899 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1900 fd->fd_grouplock = grouplock;
1901 spin_unlock(&lli->lli_lock);
1903 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with gid @arg previously taken on this file
 * descriptor.  The fd state is detached under lli_lock before the DLM
 * lock itself is dropped (cl_put_grouplock may block).
 */
1907 static int ll_put_grouplock(struct inode *inode, struct file *file,
1910 struct ll_inode_info *lli = ll_i2info(inode);
1911 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1912 struct ll_grouplock grouplock;
1915 spin_lock(&lli->lli_lock);
1916 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1917 spin_unlock(&lli->lli_lock);
1918 CWARN("no group lock held\n");
1922 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* Caller must pass the same gid it locked with. */
1924 if (fd->fd_grouplock.lg_gid != arg) {
1925 CWARN("group lock %lu doesn't match current id %lu\n",
1926 arg, fd->fd_grouplock.lg_gid);
1927 spin_unlock(&lli->lli_lock);
1931 grouplock = fd->fd_grouplock;
1932 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1933 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1934 spin_unlock(&lli->lli_lock);
1936 cl_put_grouplock(&grouplock);
1937 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1942 * Close inode open handle
1944 * \param dentry [in] dentry which contains the inode
1945 * \param it [in,out] intent which contains open info and result
1948 * \retval <0 failure
/*
 * Close the MDS open handle carried in @it (an open intent) without
 * having gone through ll_file_open().  No-op for the filesystem root
 * or when the intent holds no open disposition.
 */
1950 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1952 struct inode *inode = dentry->d_inode;
1953 struct obd_client_handle *och;
1959 /* Root ? Do nothing. */
1960 if (dentry->d_inode->i_sb->s_root == dentry)
1963 /* No open handle to close? Move away */
1964 if (!it_disposition(it, DISP_OPEN_OPEN))
1967 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1969 OBD_ALLOC(och, sizeof(*och));
1971 GOTO(out, rc = -ENOMEM);
1973 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1975 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1977 /* this one is in place of ll_file_open */
1978 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1979 ptlrpc_req_finished(it->it_request);
1980 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1986 * Get size for inode for which FIEMAP mapping is requested.
1987 * Make the FIEMAP get_info call and returns the result.
1988 * \param fiemap kernel buffer to hold extens
1989 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request: validate the flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), glimpse the size if unknown, and forward
 * the request to the OSTs through cl_object_fiemap().  @fiemap is both
 * input (requested range/flags) and output (extents).
 */
1991 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1997 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2000 /* Checks for fiemap flags */
2001 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support. */
2002 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2006 /* Check for FIEMAP_FLAG_SYNC */
2007 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2008 rc = filemap_fdatawrite(inode->i_mapping);
2013 env = cl_env_get(&refcheck);
2015 RETURN(PTR_ERR(env));
2017 if (i_size_read(inode) == 0) {
2018 rc = ll_glimpse_size(inode);
2023 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2024 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2025 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2027 /* If filesize is 0, then there would be no objects for mapping */
2028 if (fmkey.lfik_oa.o_size == 0) {
2029 fiemap->fm_mapped_extents = 0;
2033 fmkey.lfik_fiemap = *fiemap;
2035 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2036 &fmkey, fiemap, &num_bytes);
2038 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH: resolve a FID to a pathname by asking the MDT.
 * Permission: CAP_DAC_READ_SEARCH or the LL_SBI_USER_FID2PATH mount
 * flag.  The user-provided pathlen bounds the reply buffer.
 */
2042 int ll_fid2path(struct inode *inode, void __user *arg)
2044 struct obd_export *exp = ll_i2mdexp(inode);
2045 const struct getinfo_fid2path __user *gfin = arg;
2047 struct getinfo_fid2path *gfout;
2053 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2054 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2057 /* Only need to get the buflen */
2058 if (get_user(pathlen, &gfin->gf_pathlen))
/* Cap the allocation to something sane before sizing the buffer. */
2061 if (pathlen > PATH_MAX)
2064 outsize = sizeof(*gfout) + pathlen;
2065 OBD_ALLOC(gfout, outsize);
2069 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2070 GOTO(gf_free, rc = -EFAULT);
2071 /* append root FID after gfout to let MDT know the root FID so that it
2072 * can lookup the correct path, this is mainly for fileset.
2073 * old server without fileset mount support will ignore this. */
2074 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2076 /* Call mdc_iocontrol */
2077 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2081 if (copy_to_user(arg, gfout, outsize))
2085 OBD_FREE(gfout, outsize);
2090 * Read the data_version for inode.
2092 * This value is computed using stripe object version on OST.
2093 * Version is computed using server side locking.
2095 * @param flags if do sync on the OST side;
2097 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2098 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Read the inode's data version (computed from stripe object versions
 * on the OSTs) via a CIT_DATA_VERSION cl_io.  @flags selects the flush
 * semantics (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).  A layout change during
 * the IO causes the whole operation to be retried.
 */
2100 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2102 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2110 /* If no file object initialized, we consider its version is 0. */
2116 env = cl_env_get(&refcheck);
2118 RETURN(PTR_ERR(env));
2120 io = vvp_env_thread_io(env);
2122 io->u.ci_data_version.dv_data_version = 0;
2123 io->u.ci_data_version.dv_flags = flags;
2126 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2127 result = cl_io_loop(env, io);
2129 result = io->ci_result;
2131 *data_version = io->u.ci_data_version.dv_data_version;
2133 cl_io_fini(env, io);
/* Restart the whole request if the layout changed under us. */
2135 if (unlikely(io->ci_need_restart))
2138 cl_env_put(env, &refcheck);
2144 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: drop the file's OST data after it has been archived.
 * Takes a write lease, flushes and records the latest data version so
 * the MDT can verify nothing changed, then closes the lease handle
 * with MDS_HSM_RELEASE (the close itself performs the release).
 */
2146 int ll_hsm_release(struct inode *inode)
2149 struct obd_client_handle *och = NULL;
2150 __u64 data_version = 0;
2155 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2156 ll_get_fsname(inode->i_sb, NULL, 0),
2157 PFID(&ll_i2info(inode)->lli_fid));
2159 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2161 GOTO(out, rc = PTR_ERR(och));
2163 /* Grab latest data_version and [am]time values */
2164 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2168 env = cl_env_get(&refcheck);
2170 GOTO(out, rc = PTR_ERR(env));
2172 ll_merge_attr(env, inode);
2173 cl_env_put(env, &refcheck);
2175 /* Release the file.
2176 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2177 * we still need it to pack l_remote_handle to MDT. */
2178 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
/* Error path: the lease (and its open handle) must still be closed. */
2184 if (och != NULL && !IS_ERR(och)) /* close the file */
2185 ll_lease_close(och, inode, NULL);
2190 struct ll_swap_stack {
2193 struct inode *inode1;
2194 struct inode *inode2;
/*
 * Swap the layouts of two files (LL_IOC_LOV_SWAP_LAYOUTS).  Inodes are
 * ordered by FID to avoid lock inversions; with a non-zero gid both
 * files are group-locked to flush dirty cache, and optional data
 * version checks abort the swap with -EAGAIN if either file changed.
 */
2199 static int ll_swap_layouts(struct file *file1, struct file *file2,
2200 struct lustre_swap_layouts *lsl)
2202 struct mdc_swap_layouts msl;
2203 struct md_op_data *op_data;
2206 struct ll_swap_stack *llss = NULL;
2209 OBD_ALLOC_PTR(llss);
2213 llss->inode1 = file_inode(file1);
2214 llss->inode2 = file_inode(file2);
2216 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2220 /* we use 2 bool because it is easier to swap than 2 bits */
2221 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2222 llss->check_dv1 = true;
2224 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2225 llss->check_dv2 = true;
2227 /* we cannot use lsl->sl_dvX directly because we may swap them */
2228 llss->dv1 = lsl->sl_dv1;
2229 llss->dv2 = lsl->sl_dv2;
2231 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2232 if (rc == 0) /* same file, done! */
/* Order the pair by FID so concurrent swaps lock in the same order. */
2235 if (rc < 0) { /* sequentialize it */
2236 swap(llss->inode1, llss->inode2);
2238 swap(llss->dv1, llss->dv2);
2239 swap(llss->check_dv1, llss->check_dv2);
2243 if (gid != 0) { /* application asks to flush dirty cache */
2244 rc = ll_get_grouplock(llss->inode1, file1, gid);
2248 rc = ll_get_grouplock(llss->inode2, file2, gid);
2250 ll_put_grouplock(llss->inode1, file1, gid);
2255 /* ultimate check, before swaping the layouts we check if
2256 * dataversion has changed (if requested) */
2257 if (llss->check_dv1) {
2258 rc = ll_data_version(llss->inode1, &dv, 0);
2261 if (dv != llss->dv1)
2262 GOTO(putgl, rc = -EAGAIN);
2265 if (llss->check_dv2) {
2266 rc = ll_data_version(llss->inode2, &dv, 0);
2269 if (dv != llss->dv2)
2270 GOTO(putgl, rc = -EAGAIN);
2273 /* struct md_op_data is used to send the swap args to the mdt
2274 * only flags is missing, so we use struct mdc_swap_layouts
2275 * through the md_op_data->op_data */
2276 /* flags from user space have to be converted before they are send to
2277 * server, no flag is sent today, they are only used on the client */
2280 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2281 0, LUSTRE_OPC_ANY, &msl);
2282 if (IS_ERR(op_data))
2283 GOTO(free, rc = PTR_ERR(op_data));
2285 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2286 sizeof(*op_data), op_data, NULL);
2287 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
2294 ll_put_grouplock(llss->inode2, file2, gid);
2295 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on a file via the MDT.  Non-root callers
 * may only touch flags in HSM_USER_MASK; the archive id, if supplied,
 * must be within LL_HSM_MAX_ARCHIVE.
 */
2305 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2307 struct md_op_data *op_data;
2311 /* Detect out-of range masks */
2312 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2315 /* Non-root users are forbidden to set or clear flags which are
2316 * NOT defined in HSM_USER_MASK. */
2317 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2318 !cfs_capable(CFS_CAP_SYS_ADMIN))
2321 /* Detect out-of range archive id */
2322 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2323 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2326 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2327 LUSTRE_OPC_ANY, hss);
2328 if (IS_ERR(op_data))
2329 RETURN(PTR_ERR(op_data));
2331 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2332 sizeof(*op_data), op_data, NULL);
2334 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+released and restore its
 * metadata (mode, owner, size, times) from @hui so it looks like the
 * archived copy without bringing the data back.
 */
2339 static int ll_hsm_import(struct inode *inode, struct file *file,
2340 struct hsm_user_import *hui)
2342 struct hsm_state_set *hss = NULL;
2343 struct iattr *attr = NULL;
2347 if (!S_ISREG(inode->i_mode))
2353 GOTO(out, rc = -ENOMEM);
2355 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2356 hss->hss_archive_id = hui->hui_archive_id;
2357 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2358 rc = ll_hsm_state_set(inode, hss);
2362 OBD_ALLOC_PTR(attr);
2364 GOTO(out, rc = -ENOMEM);
/* Rebuild the inode attributes from the import description. */
2366 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2367 attr->ia_mode |= S_IFREG;
2368 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2369 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2370 attr->ia_size = hui->hui_size;
2371 attr->ia_mtime.tv_sec = hui->hui_mtime;
2372 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2373 attr->ia_atime.tv_sec = hui->hui_atime;
2374 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2376 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2377 ATTR_UID | ATTR_GID |
2378 ATTR_MTIME | ATTR_MTIME_SET |
2379 ATTR_ATIME | ATTR_ATIME_SET;
2383 rc = ll_setattr_raw(file_dentry(file), attr, true);
2387 inode_unlock(inode);
2399 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2401 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2402 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: set atime/mtime/ctime of a regular file from
 * userspace-supplied values.  Requires CAP_SYS_ADMIN since ctime is
 * normally not settable.
 */
2405 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2407 struct inode *inode = file_inode(file);
2409 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2410 ATTR_MTIME | ATTR_MTIME_SET |
2411 ATTR_CTIME | ATTR_CTIME_SET,
2413 .tv_sec = lfu->lfu_atime_sec,
2414 .tv_nsec = lfu->lfu_atime_nsec,
2417 .tv_sec = lfu->lfu_mtime_sec,
2418 .tv_nsec = lfu->lfu_mtime_nsec,
2421 .tv_sec = lfu->lfu_ctime_sec,
2422 .tv_nsec = lfu->lfu_ctime_nsec,
2428 if (!capable(CAP_SYS_ADMIN))
2431 if (!S_ISREG(inode->i_mode))
2435 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2436 inode_unlock(inode);
2441 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2444 case MODE_READ_USER:
2446 case MODE_WRITE_USER:
2453 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2455 /* Used to allow the upper layers of the client to request an LDLM lock
2456 * without doing an actual read or write.
2458 * Used for ladvise lockahead to manually request specific locks.
2460 * \param[in] file file this ladvise lock request is on
2461 * \param[in] ladvise ladvise struct describing this lock request
2463 * \retval 0 success, no detailed result available (sync requests
2464 * and requests sent to the server [not handled locally]
2465 * cannot return detailed results)
2466 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2467 * see definitions for details.
2468 * \retval negative negative errno on error
/*
 * Lockahead: request a specific DLM extent lock without performing any
 * actual read or write (used by the ladvise LOCKAHEAD advice).  On
 * async requests, -ECANCELED/-EEXIST from the enqueue are translated
 * into the positive LLA_RESULT_{DIFFERENT,SAME} codes for userspace.
 */
2470 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2472 struct lu_env *env = NULL;
2473 struct cl_io *io = NULL;
2474 struct cl_lock *lock = NULL;
2475 struct cl_lock_descr *descr = NULL;
2476 struct dentry *dentry = file->f_path.dentry;
2477 struct inode *inode = dentry->d_inode;
2478 enum cl_lock_mode cl_mode;
2479 off_t start = ladvise->lla_start;
2480 off_t end = ladvise->lla_end;
2486 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2487 "start=%llu, end=%llu\n", dentry->d_name.len,
2488 dentry->d_name.name, dentry->d_inode,
2489 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2492 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2494 GOTO(out, result = cl_mode);
2496 /* Get IO environment */
2497 result = cl_io_get(inode, &env, &io, &refcheck);
2501 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2504 * nothing to do for this io. This currently happens when
2505 * stripe sub-object's are not yet created.
2507 result = io->ci_result;
2508 } else if (result == 0) {
2509 lock = vvp_env_lock(env);
2510 descr = &lock->cll_descr;
2512 descr->cld_obj = io->ci_obj;
2513 /* Convert byte offsets to pages */
2514 descr->cld_start = cl_index(io->ci_obj, start);
2515 descr->cld_end = cl_index(io->ci_obj, end);
2516 descr->cld_mode = cl_mode;
2517 /* CEF_MUST is used because we do not want to convert a
2518 * lockahead request to a lockless lock */
2519 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2522 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2523 descr->cld_enq_flags |= CEF_SPECULATIVE;
2525 result = cl_lock_request(env, io, lock);
2527 /* On success, we need to release the lock */
2529 cl_lock_release(env, lock);
2531 cl_io_fini(env, io);
2532 cl_env_put(env, &refcheck);
2534 /* -ECANCELED indicates a matching lock with a different extent
2535 * was already present, and -EEXIST indicates a matching lock
2536 * on exactly the same extent was already present.
2537 * We convert them to positive values for userspace to make
2538 * recognizing true errors easier.
2539 * Note we can only return these detailed results on async requests,
2540 * as sync requests look the same as i/o requests for locking. */
2541 if (result == -ECANCELED)
2542 result = LLA_RESULT_DIFFERENT;
2543 else if (result == -EEXIST)
2544 result = LLA_RESULT_SAME;
2549 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single ladvise entry: recognized advice, per-advice flag
 * mask, and (for range-based advices) a well-formed [start, end)
 * extent.  Returns 0 or a negative errno; every rejection is logged.
 */
2551 static int ll_ladvise_sanity(struct inode *inode,
2552 struct llapi_lu_ladvise *ladvise)
2554 enum lu_ladvise_type advice = ladvise->lla_advice;
2555 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2556 * be in the first 32 bits of enum ladvise_flags */
2557 __u32 flags = ladvise->lla_peradvice_flags;
2558 /* 3 lines at 80 characters per line, should be plenty */
2561 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2563 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2564 "last supported advice is %s (value '%d'): rc = %d\n",
2565 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2566 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2570 /* Per-advice checks */
2572 case LU_LADVISE_LOCKNOEXPAND:
2573 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2575 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2577 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2578 ladvise_names[advice], rc);
2582 case LU_LADVISE_LOCKAHEAD:
2583 /* Currently only READ and WRITE modes can be requested */
2584 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2585 ladvise->lla_lockahead_mode == 0) {
2587 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2589 ll_get_fsname(inode->i_sb, NULL, 0),
2590 ladvise->lla_lockahead_mode,
2591 ladvise_names[advice], rc);
2594 case LU_LADVISE_WILLREAD:
2595 case LU_LADVISE_DONTNEED:
2597 /* Note fall through above - These checks apply to all advices
2598 * except LOCKNOEXPAND */
2599 if (flags & ~LF_DEFAULT_MASK) {
2601 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2603 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2604 ladvise_names[advice], rc);
2607 if (ladvise->lla_start >= ladvise->lla_end) {
2609 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2610 "for %s: rc = %d\n",
2611 ll_get_fsname(inode->i_sb, NULL, 0),
2612 ladvise->lla_start, ladvise->lla_end,
2613 ladvise_names[advice], rc);
2625 * Give file access advices
2627 * The ladvise interface is similar to the Linux fadvise() system call,
2628 * except it forwards the advice directly from the Lustre client to the
2629 * server. The server-side code applies appropriate read-ahead and caching
2630 * techniques for the corresponding files.
2632 * A typical workload for ladvise is e.g. a bunch of different clients are
2633 * doing small random reads of a file, so prefetching pages into OSS cache
2634 * with big linear reads before the random IO is a net benefit. Fetching
2635 * all that data into each client cache with fadvise() may not be, due to
2636 * much more data being sent to the client.
/*
 * Forward one access advice (ladvise hint) for this file to the server
 * by running a CIT_LADVISE cl_io through the client I/O stack.
 *
 * \param inode    inode the advice applies to
 * \param file     open file the request arrived on
 * \param flags    ladvise header flags, copied into lio->li_flags
 * \param ladvise  a single, already-sanity-checked advice entry
 *
 * \retval PTR_ERR(env) if no cl environment could be obtained; otherwise
 *         the result of the cl_io loop (elided lines presumably set rc).
 */
2638 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2639 			struct llapi_lu_ladvise *ladvise)
2643 	struct cl_ladvise_io *lio;
2648 	env = cl_env_get(&refcheck);
2650 		RETURN(PTR_ERR(env));
2652 	io = vvp_env_thread_io(env);
2653 	io->ci_obj = ll_i2info(inode)->lli_clob;
2655 	/* initialize parameters for ladvise */
2656 	lio = &io->u.ci_ladvise;
2657 	lio->li_start = ladvise->lla_start;
2658 	lio->li_end = ladvise->lla_end;
2659 	lio->li_fid = ll_inode2fid(inode);
2660 	lio->li_advice = ladvise->lla_advice;
2661 	lio->li_flags = flags;
2663 	if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2664 		rc = cl_io_loop(env, io);
	/* always release the io and the environment, success or not */
2668 	cl_io_fini(env, io);
2669 	cl_env_put(env, &refcheck);
/*
 * Enable or disable DLM lock expansion for this file descriptor.
 * LF_UNSET in @flags clears the no-expand state; otherwise it is set.
 */
2673 static int ll_lock_noexpand(struct file *file, int flags)
2675 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2677 	fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: copy the user's struct fsxattr in,
 * fill in the Lustre project ID from the inode, and copy it back out.
 * Copy failures in either direction return -EFAULT (elided lines).
 */
2682 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2685 	struct fsxattr fsxattr;
2687 	if (copy_from_user(&fsxattr,
2688 			   (const struct fsxattr __user *)arg,
2692 	fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2693 	if (copy_to_user((struct fsxattr __user *)arg,
2694 			 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set the file's project ID on the MDT.
 * Restricted to CFS_CAP_SYS_ADMIN; issues an md_setattr with
 * MDS_ATTR_PROJID and the project ID copied from the user's fsxattr.
 */
2700 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2704 	struct md_op_data *op_data;
2705 	struct ptlrpc_request *req = NULL;
2707 	struct fsxattr fsxattr;
2709 	/* only root could change project ID */
2710 	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2713 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2714 				     LUSTRE_OPC_ANY, NULL);
2715 	if (IS_ERR(op_data))
2716 		RETURN(PTR_ERR(op_data));
2718 	if (copy_from_user(&fsxattr,
2719 			   (const struct fsxattr __user *)arg,
2721 		GOTO(out_fsxattr1, rc = -EFAULT);
2723 	op_data->op_projid = fsxattr.fsx_projid;
2724 	op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2725 	rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2727 	ptlrpc_req_finished(req);
	/* common exit: release op_data whether the setattr succeeded or not */
2730 	ll_finish_md_op_data(op_data);
/*
 * Main ioctl dispatcher for regular Lustre files.  Each case either
 * handles the command inline or forwards to a dedicated helper; commands
 * not recognized here fall through to obd_iocontrol() at the bottom.
 */
2737 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2739 	struct inode *inode = file_inode(file);
2740 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2744 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2745 	       PFID(ll_inode2fid(inode)), inode, cmd);
2746 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2748 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2749 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2753 	case LL_IOC_GETFLAGS:
2754 		/* Get the current value of the file flags */
2755 		return put_user(fd->fd_flags, (int __user *)arg);
2756 	case LL_IOC_SETFLAGS:
2757 	case LL_IOC_CLRFLAGS:
2758 		/* Set or clear specific file flags */
2759 		/* XXX This probably needs checks to ensure the flags are
2760 		 * not abused, and to handle any flag side effects.
2762 		if (get_user(flags, (int __user *) arg))
2765 		if (cmd == LL_IOC_SETFLAGS) {
			/* IGNORE_LOCK only makes sense for O_DIRECT I/O */
2766 			if ((flags & LL_FILE_IGNORE_LOCK) &&
2767 			    !(file->f_flags & O_DIRECT)) {
2768 				CERROR("%s: unable to disable locking on "
2769 				       "non-O_DIRECT file\n", current->comm);
2773 			fd->fd_flags |= flags;
2775 			fd->fd_flags &= ~flags;
2778 	case LL_IOC_LOV_SETSTRIPE:
2779 	case LL_IOC_LOV_SETSTRIPE_NEW:
2780 		RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2781 	case LL_IOC_LOV_SETEA:
2782 		RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2783 	case LL_IOC_LOV_SWAP_LAYOUTS: {
	/* atomically swap the layouts of this file and lsl.sl_fd */
2785 		struct lustre_swap_layouts lsl;
2787 		if (copy_from_user(&lsl, (char __user *)arg,
2788 				   sizeof(struct lustre_swap_layouts)))
	/* both files must be writable for a layout swap */
2791 		if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2794 		file2 = fget(lsl.sl_fd);
2798 		/* O_WRONLY or O_RDWR */
2799 		if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2800 			GOTO(out, rc = -EPERM);
2802 		if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2803 			struct inode *inode2;
2804 			struct ll_inode_info *lli;
2805 			struct obd_client_handle *och = NULL;
			/* CLOSE must be the only flag set in this mode */
2807 			if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2808 				GOTO(out, rc = -EINVAL);
			/* swap-on-close requires a lease held on this fd;
			 * take ownership of it under lli_och_mutex */
2810 			lli = ll_i2info(inode);
2811 			mutex_lock(&lli->lli_och_mutex);
2812 			if (fd->fd_lease_och != NULL) {
2813 				och = fd->fd_lease_och;
2814 				fd->fd_lease_och = NULL;
2816 			mutex_unlock(&lli->lli_och_mutex);
2818 				GOTO(out, rc = -ENOLCK);
2819 			inode2 = file_inode(file2);
2820 			rc = ll_swap_layouts_close(och, inode, inode2);
2822 			rc = ll_swap_layouts(file, file2, &lsl);
2828 	case LL_IOC_LOV_GETSTRIPE:
2829 	case LL_IOC_LOV_GETSTRIPE_NEW:
2830 		RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2831 	case FSFILT_IOC_GETFLAGS:
2832 	case FSFILT_IOC_SETFLAGS:
2833 		RETURN(ll_iocontrol(inode, file, cmd, arg));
2834 	case FSFILT_IOC_GETVERSION_OLD:
2835 	case FSFILT_IOC_GETVERSION:
2836 		RETURN(put_user(inode->i_generation, (int __user *)arg));
2837 	case LL_IOC_GROUP_LOCK:
2838 		RETURN(ll_get_grouplock(inode, file, arg));
2839 	case LL_IOC_GROUP_UNLOCK:
2840 		RETURN(ll_put_grouplock(inode, file, arg));
2841 	case IOC_OBD_STATFS:
2842 		RETURN(ll_obd_statfs(inode, (void __user *)arg));
2844 	/* We need to special case any other ioctls we want to handle,
2845 	 * to send them to the MDS/OST as appropriate and to properly
2846 	 * network encode the arg field.
2847 	case FSFILT_IOC_SETVERSION_OLD:
2848 	case FSFILT_IOC_SETVERSION:
2850 	case LL_IOC_FLUSHCTX:
2851 		RETURN(ll_flush_ctx(inode));
2852 	case LL_IOC_PATH2FID: {
2853 		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2854 				 sizeof(struct lu_fid)))
2859 	case LL_IOC_GETPARENT:
2860 		RETURN(ll_getparent(file, (struct getparent __user *)arg));
2862 	case OBD_IOC_FID2PATH:
2863 		RETURN(ll_fid2path(inode, (void __user *)arg));
2864 	case LL_IOC_DATA_VERSION: {
2865 		struct ioc_data_version idv;
2868 		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
		/* only the flush flags are honored from userspace */
2871 		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2872 		rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2875 		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2881 	case LL_IOC_GET_MDTIDX: {
2884 		mdtidx = ll_get_mdt_idx(inode);
2888 		if (put_user((int)mdtidx, (int __user *)arg))
2893 	case OBD_IOC_GETDTNAME:
2894 	case OBD_IOC_GETMDNAME:
2895 		RETURN(ll_get_obd_name(inode, cmd, arg));
2896 	case LL_IOC_HSM_STATE_GET: {
	/* query HSM state from the MDT via obd_iocontrol */
2897 		struct md_op_data *op_data;
2898 		struct hsm_user_state *hus;
2905 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2906 					     LUSTRE_OPC_ANY, hus);
2907 		if (IS_ERR(op_data)) {
2909 			RETURN(PTR_ERR(op_data));
2912 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2915 		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2918 		ll_finish_md_op_data(op_data);
2922 	case LL_IOC_HSM_STATE_SET: {
2923 		struct hsm_state_set *hss;
2930 		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2935 		rc = ll_hsm_state_set(inode, hss);
2940 	case LL_IOC_HSM_ACTION: {
	/* query the HSM action currently in progress on this file */
2941 		struct md_op_data *op_data;
2942 		struct hsm_current_action *hca;
2949 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2950 					     LUSTRE_OPC_ANY, hca);
2951 		if (IS_ERR(op_data)) {
2953 			RETURN(PTR_ERR(op_data));
2956 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2959 		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2962 		ll_finish_md_op_data(op_data);
2966 	case LL_IOC_SET_LEASE: {
	/* take, or on UNLCK release, a file lease on this descriptor */
2967 		struct ll_inode_info *lli = ll_i2info(inode);
2968 		struct obd_client_handle *och = NULL;
2973 		case LL_LEASE_WRLCK:
2974 			if (!(file->f_mode & FMODE_WRITE))
2976 			fmode = FMODE_WRITE;
2978 		case LL_LEASE_RDLCK:
2979 			if (!(file->f_mode & FMODE_READ))
2983 		case LL_LEASE_UNLCK:
			/* detach the lease handle from the fd under the
			 * och mutex before closing it */
2984 			mutex_lock(&lli->lli_och_mutex);
2985 			if (fd->fd_lease_och != NULL) {
2986 				och = fd->fd_lease_och;
2987 				fd->fd_lease_och = NULL;
2989 			mutex_unlock(&lli->lli_och_mutex);
2994 			fmode = och->och_flags;
2995 			rc = ll_lease_close(och, inode, &lease_broken);
2999 			rc = ll_lease_och_release(inode, file);
3006 			RETURN(ll_lease_type_from_fmode(fmode));
3011 		CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3013 		/* apply for lease */
3014 		och = ll_lease_open(inode, file, fmode, 0);
3016 			RETURN(PTR_ERR(och));
		/* publish the new lease handle unless one raced in */
3019 		mutex_lock(&lli->lli_och_mutex);
3020 		if (fd->fd_lease_och == NULL) {
3021 			fd->fd_lease_och = och;
3024 		mutex_unlock(&lli->lli_och_mutex);
3026 		/* impossible now that only excl is supported for now */
3027 		ll_lease_close(och, inode, &lease_broken);
3032 	case LL_IOC_GET_LEASE: {
	/* report the mode of any lease still held (and not cancelled) */
3033 		struct ll_inode_info *lli = ll_i2info(inode);
3034 		struct ldlm_lock *lock = NULL;
3037 		mutex_lock(&lli->lli_och_mutex);
3038 		if (fd->fd_lease_och != NULL) {
3039 			struct obd_client_handle *och = fd->fd_lease_och;
3041 			lock = ldlm_handle2lock(&och->och_lease_handle);
3043 				lock_res_and_lock(lock);
3044 				if (!ldlm_is_cancel(lock))
3045 					fmode = och->och_flags;
3047 				unlock_res_and_lock(lock);
3048 				LDLM_LOCK_PUT(lock);
3051 		mutex_unlock(&lli->lli_och_mutex);
3053 		RETURN(ll_lease_type_from_fmode(fmode));
3055 	case LL_IOC_HSM_IMPORT: {
3056 		struct hsm_user_import *hui;
3062 		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3067 		rc = ll_hsm_import(inode, file, hui);
3072 	case LL_IOC_FUTIMES_3: {
3073 		struct ll_futimes_3 lfu;
3075 		if (copy_from_user(&lfu,
3076 				   (const struct ll_futimes_3 __user *)arg,
3080 		RETURN(ll_file_futimes_3(file, &lfu));
3082 	case LL_IOC_LADVISE: {
	/* two-pass copy: read the fixed header first to learn lah_count,
	 * then reallocate and copy header plus all advice entries */
3083 		struct llapi_ladvise_hdr *k_ladvise_hdr;
3084 		struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3087 		int alloc_size = sizeof(*k_ladvise_hdr);
3090 		u_ladvise_hdr = (void __user *)arg;
3091 		OBD_ALLOC_PTR(k_ladvise_hdr);
3092 		if (k_ladvise_hdr == NULL)
3095 		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3096 			GOTO(out_ladvise, rc = -EFAULT);
3098 		if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3099 		    k_ladvise_hdr->lah_count < 1)
3100 			GOTO(out_ladvise, rc = -EINVAL);
3102 		num_advise = k_ladvise_hdr->lah_count;
3103 		if (num_advise >= LAH_COUNT_MAX)
3104 			GOTO(out_ladvise, rc = -EFBIG);
3106 		OBD_FREE_PTR(k_ladvise_hdr);
3107 		alloc_size = offsetof(typeof(*k_ladvise_hdr),
3108 				      lah_advise[num_advise]);
3109 		OBD_ALLOC(k_ladvise_hdr, alloc_size);
3110 		if (k_ladvise_hdr == NULL)
3114 		 * TODO: submit multiple advices to one server in a single RPC
3116 		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3117 			GOTO(out_ladvise, rc = -EFAULT);
3119 		for (i = 0; i < num_advise; i++) {
3120 			struct llapi_lu_ladvise *k_ladvise =
3121 					&k_ladvise_hdr->lah_advise[i];
3122 			struct llapi_lu_ladvise __user *u_ladvise =
3123 					&u_ladvise_hdr->lah_advise[i];
3125 			rc = ll_ladvise_sanity(inode, k_ladvise);
3127 				GOTO(out_ladvise, rc);
3129 			switch (k_ladvise->lla_advice) {
3130 			case LU_LADVISE_LOCKNOEXPAND:
3131 				rc = ll_lock_noexpand(file,
3132 					       k_ladvise->lla_peradvice_flags);
3133 				GOTO(out_ladvise, rc);
3134 			case LU_LADVISE_LOCKAHEAD:
3136 				rc = ll_file_lock_ahead(file, k_ladvise);
3139 					GOTO(out_ladvise, rc);
3142 					 &u_ladvise->lla_lockahead_result))
3143 					GOTO(out_ladvise, rc = -EFAULT);
			/* all remaining advices go to the server via cl_io */
3146 				rc = ll_ladvise(inode, file,
3147 						k_ladvise_hdr->lah_flags,
3150 					GOTO(out_ladvise, rc);
3157 		OBD_FREE(k_ladvise_hdr, alloc_size);
3160 	case LL_IOC_FSGETXATTR:
3161 		RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3162 	case LL_IOC_FSSETXATTR:
3163 		RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3165 		RETURN(put_user(PAGE_SIZE, (int __user *)arg));
	/* default: pass unrecognized commands straight to the OSC layer */
3167 		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3168 				     (void __user *)arg));
3172 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Helper for the fallback llseek implementation (kernels without
 * generic_file_llseek_size): validate @offset against [0, maxsize] and,
 * if it actually moves the file position, update f_pos and reset
 * f_version so cached readdir/seek state is invalidated.
 */
3173 static inline loff_t
3174 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3176 	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3178 	if (offset > maxsize)
3181 	if (offset != file->f_pos) {
3182 		file->f_pos = offset;
3183 		file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels that lack it
 * (guarded by #ifndef HAVE_FILE_LLSEEK_SIZE above).  Handles SEEK_CUR
 * without touching f_pos when the offset is unchanged, and treats
 * SEEK_DATA/SEEK_HOLE with the "whole file is data" model.
 */
3189 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3190 		loff_t maxsize, loff_t eof)
3192 	struct inode *inode = file_inode(file);
3200 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3201 		 * position-querying operation.  Avoid rewriting the "same"
3202 		 * f_pos value back to the file because a concurrent read(),
3203 		 * write() or lseek() might have altered it
3208 		 * f_lock protects against read/modify/write race with other
3209 		 * SEEK_CURs. Note that parallel writes and reads behave
3213 		offset = llseek_execute(file, file->f_pos + offset, maxsize);
3214 		inode_unlock(inode);
3218 		 * In the generic case the entire file is data, so as long as
3219 		 * offset isn't at the end of the file then the offset is data.
3226 		 * There is a virtual hole at the end of the file, so as long as
3227 		 * offset isn't i_size or larger, return i_size.
3235 	return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the real file
 * size is needed, so glimpse the size from the OSTs first and then
 * delegate to ll_generic_file_llseek_size() bounded by the filesystem
 * maximum byte count.
 */
3239 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3241 	struct inode *inode = file_inode(file);
3242 	loff_t retval, eof = 0;
	/* compute the absolute target purely for the trace message below */
3245 	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3246 			   (origin == SEEK_CUR) ? file->f_pos : 0);
3247 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3248 	       PFID(ll_inode2fid(inode)), inode, retval, retval,
3250 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3252 	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3253 		retval = ll_glimpse_size(inode);
3256 		eof = i_size_read(inode);
3259 	retval = ll_generic_file_llseek_size(file, offset, origin,
3260 					  ll_file_maxbytes(inode), eof);
/*
 * .flush handler, called on every close() of a file descriptor.
 * Collects asynchronous writeback errors recorded on the inode and on
 * the cl object, and reports -EIO once — unless this descriptor already
 * saw the write failure, in which case the error is not repeated.
 */
3264 static int ll_flush(struct file *file, fl_owner_t id)
3266 	struct inode *inode = file_inode(file);
3267 	struct ll_inode_info *lli = ll_i2info(inode);
3268 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3271 	LASSERT(!S_ISDIR(inode->i_mode));
3273 	/* catch async errors that were recorded back when async writeback
3274 	 * failed for pages in this mapping. */
3275 	rc = lli->lli_async_rc;
3276 	lli->lli_async_rc = 0;
3277 	if (lli->lli_clob != NULL) {
3278 		err = lov_read_and_clear_async_rc(lli->lli_clob);
3283 	/* The application has been told write failure already.
3284 	 * Do not report failure again. */
3285 	if (fd->fd_write_failed)
3287 	return rc ? -EIO : 0;
3291 * Called to make sure a portion of file has been written out.
3292 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3294 * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] of @inode with the given
 * fsync mode.  On success the number of pages written (fi_nr_written)
 * is returned; mode values outside the known set are rejected (the
 * elided line presumably returns -EINVAL).
 */
3296 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3297 		       enum cl_fsync_mode mode, int ignore_layout)
3301 	struct cl_fsync_io *fio;
3306 	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3307 	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3310 	env = cl_env_get(&refcheck);
3312 		RETURN(PTR_ERR(env));
3314 	io = vvp_env_thread_io(env);
3315 	io->ci_obj = ll_i2info(inode)->lli_clob;
3316 	io->ci_ignore_layout = ignore_layout;
3318 	/* initialize parameters for sync */
3319 	fio = &io->u.ci_fsync;
3320 	fio->fi_start = start;
3322 	fio->fi_fid = ll_inode2fid(inode);
3323 	fio->fi_mode = mode;
3324 	fio->fi_nr_written = 0;
3326 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3327 		result = cl_io_loop(env, io);
3329 		result = io->ci_result;
	/* report pages written when the sync itself succeeded */
3331 		result = fio->fi_nr_written;
3332 	cl_io_fini(env, io);
3333 	cl_env_put(env, &refcheck);
3339 * When dentry is provided (the 'else' case), file_dentry() may be
3340 * null and dentry must be used directly rather than pulled from
3341 * file_dentry() as is done otherwise.
/*
 * fsync entry point, with three signature variants selected by kernel
 * compatibility macros.  Flushes dirty pages, collects recorded async
 * writeback errors, syncs metadata via md_fsync(), and for regular
 * files forces data out with cl_sync_file_range(CL_FSYNC_ALL), tracking
 * success/failure in fd_write_failed for later ll_flush() reporting.
 */
3344 #ifdef HAVE_FILE_FSYNC_4ARGS
3345 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3347 	struct dentry *dentry = file_dentry(file);
3349 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3350 int ll_fsync(struct file *file, int datasync)
3352 	struct dentry *dentry = file_dentry(file);
3354 	loff_t end = LLONG_MAX;
3356 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3359 	loff_t end = LLONG_MAX;
3361 	struct inode *inode = dentry->d_inode;
3362 	struct ll_inode_info *lli = ll_i2info(inode);
3363 	struct ptlrpc_request *req;
3367 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3368 	       PFID(ll_inode2fid(inode)), inode);
3369 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3371 #ifdef HAVE_FILE_FSYNC_4ARGS
3372 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	/* avoid recursive locking if the caller already holds the inode */
3373 	lock_inode = !lli->lli_inode_locked;
3377 	/* fsync's caller has already called _fdata{sync,write}, we want
3378 	 * that IO to finish before calling the osc and mdc sync methods */
3379 	rc = filemap_fdatawait(inode->i_mapping);
3382 	/* catch async errors that were recorded back when async writeback
3383 	 * failed for pages in this mapping. */
3384 	if (!S_ISDIR(inode->i_mode)) {
3385 		err = lli->lli_async_rc;
3386 		lli->lli_async_rc = 0;
3389 		if (lli->lli_clob != NULL) {
3390 			err = lov_read_and_clear_async_rc(lli->lli_clob);
3396 	err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3400 		ptlrpc_req_finished(req);
3402 	if (S_ISREG(inode->i_mode)) {
3403 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3405 		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3406 		if (rc == 0 && err < 0)
		/* remember the outcome so ll_flush() does not double-report */
3409 			fd->fd_write_failed = true;
3411 			fd->fd_write_failed = false;
3414 #ifdef HAVE_FILE_FSYNC_4ARGS
3416 		inode_unlock(inode);
/*
 * flock()/fcntl() lock handler.  Translates the kernel file_lock into
 * an LDLM_FLOCK enqueue against the MDT, then mirrors the result into
 * the local lock bookkeeping (locks_lock_file_wait() or the older
 * flock/posix variants).  If the local step fails after the server
 * granted the lock, the server lock is rolled back with an LCK_NL
 * (unlock) enqueue.
 */
3422 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3424 	struct inode *inode = file_inode(file);
3425 	struct ll_sb_info *sbi = ll_i2sbi(inode);
3426 	struct ldlm_enqueue_info einfo = {
3427 		.ei_type	= LDLM_FLOCK,
3428 		.ei_cb_cp	= ldlm_flock_completion_ast,
3429 		.ei_cbdata	= file_lock,
3431 	struct md_op_data *op_data;
3432 	struct lustre_handle lockh = { 0 };
3433 	union ldlm_policy_data flock = { { 0 } };
3434 	int fl_type = file_lock->fl_type;
3440 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3441 	       PFID(ll_inode2fid(inode)), file_lock);
3443 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3445 	if (file_lock->fl_flags & FL_FLOCK) {
3446 		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3447 		/* flocks are whole-file locks */
3448 		flock.l_flock.end = OFFSET_MAX;
3449 		/* For flocks owner is determined by the local file desctiptor*/
3450 		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3451 	} else if (file_lock->fl_flags & FL_POSIX) {
3452 		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3453 		flock.l_flock.start = file_lock->fl_start;
3454 		flock.l_flock.end = file_lock->fl_end;
3458 	flock.l_flock.pid = file_lock->fl_pid;
3460 	/* Somewhat ugly workaround for svc lockd.
3461 	 * lockd installs custom fl_lmops->lm_compare_owner that checks
3462 	 * for the fl_owner to be the same (which it always is on local node
3463 	 * I guess between lockd processes) and then compares pid.
3464 	 * As such we assign pid to the owner field to make it all work,
3465 	 * conflict with normal locks is unlikely since pid space and
3466 	 * pointer space for current->files are not intersecting */
3467 	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3468 		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
	/* map the fcntl lock type onto an LDLM lock mode */
3472 		einfo.ei_mode = LCK_PR;
3475 		/* An unlock request may or may not have any relation to
3476 		 * existing locks so we may not be able to pass a lock handle
3477 		 * via a normal ldlm_lock_cancel() request. The request may even
3478 		 * unlock a byte range in the middle of an existing lock. In
3479 		 * order to process an unlock request we need all of the same
3480 		 * information that is given with a normal read or write record
3481 		 * lock request. To avoid creating another ldlm unlock (cancel)
3482 		 * message we'll treat a LCK_NL flock request as an unlock. */
3483 		einfo.ei_mode = LCK_NL;
3486 		einfo.ei_mode = LCK_PW;
3489 		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
	/* map the fcntl command onto LDLM enqueue flags */
3504 		flags = LDLM_FL_BLOCK_NOWAIT;
3510 		flags = LDLM_FL_TEST_LOCK;
3513 		CERROR("unknown fcntl lock command: %d\n", cmd);
3517 	/* Save the old mode so that if the mode in the lock changes we
3518 	 * can decrement the appropriate reader or writer refcount. */
3519 	file_lock->fl_type = einfo.ei_mode;
3521 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3522 				     LUSTRE_OPC_ANY, NULL);
3523 	if (IS_ERR(op_data))
3524 		RETURN(PTR_ERR(op_data));
3526 	CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3527 	       "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3528 	       flock.l_flock.pid, flags, einfo.ei_mode,
3529 	       flock.l_flock.start, flock.l_flock.end);
3531 	rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3534 	/* Restore the file lock type if not TEST lock. */
3535 	if (!(flags & LDLM_FL_TEST_LOCK))
3536 		file_lock->fl_type = fl_type;
3538 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3539 	if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3540 	    !(flags & LDLM_FL_TEST_LOCK))
3541 		rc2 = locks_lock_file_wait(file, file_lock);
3543 	if ((file_lock->fl_flags & FL_FLOCK) &&
3544 	    (rc == 0 || file_lock->fl_type == F_UNLCK))
3545 		rc2 = flock_lock_file_wait(file, file_lock);
3546 	if ((file_lock->fl_flags & FL_POSIX) &&
3547 	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3548 	    !(flags & LDLM_FL_TEST_LOCK))
3549 		rc2 = posix_lock_file_wait(file, file_lock);
3550 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
	/* local bookkeeping failed: drop the server-side lock again */
3552 	if (rc2 && file_lock->fl_type != F_UNLCK) {
3553 		einfo.ei_mode = LCK_NL;
3554 		md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3559 	ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via md_getattr_name().
 * On success *fid is filled from the reply body; when @inode is
 * non-NULL an inode is also instantiated from the reply (elided
 * condition around line 3591, presumably "if (inode != NULL)").
 */
3564 int ll_get_fid_by_name(struct inode *parent, const char *name,
3565 		       int namelen, struct lu_fid *fid,
3566 		       struct inode **inode)
3568 	struct md_op_data	*op_data = NULL;
3569 	struct mdt_body		*body;
3570 	struct ptlrpc_request	*req;
3574 	op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3575 				     LUSTRE_OPC_ANY, NULL);
3576 	if (IS_ERR(op_data))
3577 		RETURN(PTR_ERR(op_data));
3579 	op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3580 	rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3581 	ll_finish_md_op_data(op_data);
3585 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3587 		GOTO(out_req, rc = -EFAULT);
3589 	*fid = body->mbo_fid1;
3592 		rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3594 	ptlrpc_req_finished(req);
/*
 * Migrate the child @name of @parent to MDT @mdtidx.
 *
 * Resolves the child inode (dcache first, then by-name getattr), rejects
 * migrating the filesystem root, and for regular files takes a write
 * lease plus data version so the rename-based migration (MDS_RENAME_MIGRATE
 * via md_rename with CLI_MIGRATE) is safe against concurrent writers.
 * Retries on -EAGAIN when the layout changed mid-migration.
 */
3598 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3599 	       const char *name, int namelen)
3601 	struct dentry *dchild = NULL;
3602 	struct inode *child_inode = NULL;
3603 	struct md_op_data *op_data;
3604 	struct ptlrpc_request *request = NULL;
3605 	struct obd_client_handle *och = NULL;
3607 	struct mdt_body		*body;
3609 	__u64			data_version = 0;
3612 	CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3613 	       name, PFID(ll_inode2fid(parent)), mdtidx);
3615 	op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3616 				     0, LUSTRE_OPC_ANY, NULL);
3617 	if (IS_ERR(op_data))
3618 		RETURN(PTR_ERR(op_data));
3620 	/* Get child FID first */
3621 	qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
	/* prefer a cached dentry to avoid an extra getattr RPC */
3624 	dchild = d_lookup(file_dentry(file), &qstr);
3625 	if (dchild != NULL) {
3626 		if (dchild->d_inode != NULL)
3627 			child_inode = igrab(dchild->d_inode);
3631 	if (child_inode == NULL) {
3632 		rc = ll_get_fid_by_name(parent, name, namelen,
3633 					&op_data->op_fid3, &child_inode);
3638 	if (child_inode == NULL)
3639 		GOTO(out_free, rc = -EINVAL);
3642 	 * lfs migrate command needs to be blocked on the client
3643 	 * by checking the migrate FID against the FID of the
3646 	if (child_inode == parent->i_sb->s_root->d_inode)
3647 		GOTO(out_iput, rc = -EINVAL);
	/* serialize against other users of the child inode from here on */
3649 	inode_lock(child_inode);
3650 	op_data->op_fid3 = *ll_inode2fid(child_inode);
3651 	if (!fid_is_sane(&op_data->op_fid3)) {
3652 		CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3653 		       ll_get_fsname(parent->i_sb, NULL, 0), name,
3654 		       PFID(&op_data->op_fid3));
3655 		GOTO(out_unlock, rc = -EINVAL);
3658 	rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3660 		GOTO(out_unlock, rc);
	/* nothing to do if the child already lives on the target MDT */
3663 		CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3664 		       PFID(&op_data->op_fid3), mdtidx);
3665 		GOTO(out_unlock, rc = 0);
3668 	if (S_ISREG(child_inode->i_mode)) {
		/* take a write lease and pin the data version so racing
		 * writers invalidate the migration instead of corrupting it */
3669 		och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3673 			GOTO(out_unlock, rc);
3676 		rc = ll_data_version(child_inode, &data_version,
3679 			GOTO(out_close, rc);
3681 		op_data->op_handle = och->och_fh;
3682 		op_data->op_data = och->och_mod;
3683 		op_data->op_data_version = data_version;
3684 		op_data->op_lease_handle = och->och_lease_handle;
3685 		op_data->op_bias |= MDS_RENAME_MIGRATE;
3688 	op_data->op_mds = mdtidx;
3689 	op_data->op_cli_flags = CLI_MIGRATE;
3690 	rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3691 		       namelen, name, namelen, &request);
3693 		LASSERT(request != NULL);
3694 		ll_update_times(request, parent);
3696 		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3697 		LASSERT(body != NULL);
3699 		/* If the server does release layout lock, then we cleanup
3700 		 * the client och here, otherwise release it in out_close: */
3702 		    body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3703 			obd_mod_put(och->och_mod);
3704 			md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3706 			och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3712 	if (request != NULL) {
3713 		ptlrpc_req_finished(request);
3717 	/* Try again if the file layout has changed. */
3718 	if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3722 	if (och != NULL) /* close the file */
3723 		ll_lease_close(och, child_inode, NULL);
3725 		clear_nlink(child_inode);
3727 	inode_unlock(child_inode);
3731 	ll_finish_md_op_data(op_data);
3736 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3744 * test if some locks matching bits and l_req_mode are acquired
3745 * - bits can be in different locks
3746 * - if found clear the common lock bits in *bits
3747 * - the bits not found, are kept in *bits
3749 * \param bits [IN] searched lock bits [IN]
3750 * \param l_req_mode [IN] searched lock mode
3751 * \retval boolean, true iff all bits are found
/*
 * See the descriptive comment above: probe each requested inodebit with
 * a non-blocking LDLM_FL_TEST_LOCK match and clear the bits that are
 * covered by granted locks; bits left in *bits were not found.
 */
3753 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3755 	struct lustre_handle lockh;
3756 	union ldlm_policy_data policy;
	/* LCK_MINMODE means "any mode": expand to all shared/exclusive modes */
3757 	enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3758 			      (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3767 	fid = &ll_i2info(inode)->lli_fid;
3768 	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3769 	       ldlm_lockname[mode]);
3771 	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3772 	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3773 		policy.l_inodebits.bits = *bits & (1 << i);
3774 		if (policy.l_inodebits.bits == 0)
3777 		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3778 				  &policy, mode, &lockh)) {
3779 			struct ldlm_lock *lock;
3781 			lock = ldlm_handle2lock(&lockh);
				/* clear every bit the matched lock covers */
3784 					~(lock->l_policy_data.l_inodebits.bits);
3785 				LDLM_LOCK_PUT(lock);
3787 				*bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and, without LDLM_FL_TEST_LOCK in @flags, reference) a
 * granted MDT inodebits lock covering @bits in mode @mode; returns the
 * md_lock_match() result and fills *lockh on success.
 */
3794 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3795 			       struct lustre_handle *lockh, __u64 flags,
3796 			       enum ldlm_mode mode)
3798 	union ldlm_policy_data policy = { .l_inodebits = { bits } };
3803 	fid = &ll_i2info(inode)->lli_fid;
3804 	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3806 	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3807 			   fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidation RPC.  -ENOENT from the
 * server means the object was unlinked: that is tolerated (nlink is
 * presumably cleared in an elided line) except for striped directories
 * with bad stripes, which are re-validated.  Other errors are logged,
 * quietly for expected -EACCES/-EIDRM.
 */
3812 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3814 	/* Already unlinked. Just update nlink and return success */
3815 	if (rc == -ENOENT) {
3817 		/* If it is striped directory, and there is bad stripe
3818 		 * Let's revalidate the dentry again, instead of returning
3820 		if (S_ISDIR(inode->i_mode) &&
3821 		    ll_i2info(inode)->lli_lsm_md != NULL)
3824 		/* This path cannot be hit for regular files unless in
3825 		 * case of obscure races, so no need to to validate
3827 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3829 	} else if (rc != 0) {
3830 		CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3831 			     "%s: revalidate FID "DFID" error: rc = %d\n",
3832 			     ll_get_fsname(inode->i_sb, NULL, 0),
3833 			     PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the dentry's metadata against the MDT.
 *
 * With OBD_CONNECT_ATTRFID the server is asked via an intent lock
 * (IT_GETATTR, or IT_LOOKUP when only the LOOKUP bit is wanted);
 * otherwise, if no covering MD lock is already held locally, a plain
 * md_getattr() fetches the attributes and refreshes the inode.
 */
3839 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3841 	struct inode *inode = dentry->d_inode;
3842 	struct ptlrpc_request *req = NULL;
3843 	struct obd_export *exp;
3847 	LASSERT(inode != NULL);
3849 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3850 	       PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3852 	exp = ll_i2mdexp(inode);
3854 	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3855 	 * But under CMD case, it caused some lock issues, should be fixed
3856 	 * with new CMD ibits lock. See bug 12718 */
3857 	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3858 		struct lookup_intent oit = { .it_op = IT_GETATTR };
3859 		struct md_op_data *op_data;
3861 		if (ibits == MDS_INODELOCK_LOOKUP)
3862 			oit.it_op = IT_LOOKUP;
3864 		/* Call getattr by fid, so do not provide name at all. */
3865 		op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3866 					     dentry->d_inode, NULL, 0, 0,
3867 					     LUSTRE_OPC_ANY, NULL);
3868 		if (IS_ERR(op_data))
3869 			RETURN(PTR_ERR(op_data));
3871 		rc = md_intent_lock(exp, op_data, &oit, &req,
3872 				    &ll_md_blocking_ast, 0);
3873 		ll_finish_md_op_data(op_data);
3875 			rc = ll_inode_revalidate_fini(inode, rc);
3879 		rc = ll_revalidate_it_finish(req, &oit, dentry);
3881 			ll_intent_release(&oit);
3885 		/* Unlinked? Unhash dentry, so it is not picked up later by
3886 		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3887 		   here to preserve get_cwd functionality on 2.6.
3889 		if (!dentry->d_inode->i_nlink) {
3890 			ll_lock_dcache(inode);
3891 			d_lustre_invalidate(dentry, 0);
3892 			ll_unlock_dcache(inode);
3895 		ll_lookup_finish_locks(&oit, dentry);
3896 	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		/* no lock held locally: fall back to a plain getattr RPC */
3897 		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3898 		u64 valid = OBD_MD_FLGETATTR;
3899 		struct md_op_data *op_data;
		/* for regular files also fetch striping (EA) information */
3902 		if (S_ISREG(inode->i_mode)) {
3903 			rc = ll_get_default_mdsize(sbi, &ealen);
3906 			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3909 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3910 					     0, ealen, LUSTRE_OPC_ANY,
3912 		if (IS_ERR(op_data))
3913 			RETURN(PTR_ERR(op_data));
3915 		op_data->op_valid = valid;
3916 		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3917 		ll_finish_md_op_data(op_data);
3919 			rc = ll_inode_revalidate_fini(inode, rc);
3923 		rc = ll_prep_inode(&inode, req, NULL, NULL);
3926 	ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes via
 * md_merge_attr() and apply the aggregate nlink/blocks/size/times to
 * the master inode.
 */
3930 static int ll_merge_md_attr(struct inode *inode)
3932 	struct cl_attr attr = { 0 };
3935 	LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3936 	rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3937 			   &attr, ll_md_blocking_ast);
3941 	set_nlink(inode, attr.cat_nlink);
3942 	inode->i_blocks = attr.cat_blocks;
3943 	i_size_write(inode, attr.cat_size);
3945 	ll_i2info(inode)->lli_atime = attr.cat_atime;
3946 	ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3947 	ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh metadata via __ll_inode_revalidate(),
 * merge striped-directory attributes where applicable, copy the cached
 * lli times into the inode, and for regular files glimpse the size from
 * the OSTs — except while an HSM restore is running (see below).
 */
3953 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3955 	struct inode *inode = dentry->d_inode;
3959 	rc = __ll_inode_revalidate(dentry, ibits);
3963 	/* if object isn't regular file, don't validate size */
3964 	if (!S_ISREG(inode->i_mode)) {
3965 		if (S_ISDIR(inode->i_mode) &&
3966 		    ll_i2info(inode)->lli_lsm_md != NULL) {
3967 			rc = ll_merge_md_attr(inode);
3972 		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3973 		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3974 		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3976 		/* In case of restore, the MDT has the right size and has
3977 		 * already send it back without granting the layout lock,
3978 		 * inode is up-to-date so glimpse is useless.
3979 		 * Also to glimpse we need the layout, in case of a running
3980 		 * restore the MDT holds the layout lock so the glimpse will
3981 		 * block up to the end of restore (getattr will block)
3983 		if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3984 			rc = ll_glimpse_size(inode);
/*
 * Squash a device number so both major and minor fit in 8 bits each,
 * keeping 32-bit compat stat() syscalls from failing (see comment).
 */
3989 static inline dev_t ll_compat_encode_dev(dev_t dev)
3991 	/* The compat_sys_*stat*() syscalls will fail unless the
3992 	 * device majors and minors are both less than 256. Note that
3993 	 * the value returned here will be passed through
3994 	 * old_encode_dev() in cp_compat_stat(). And so we are not
3995 	 * trying to return a valid compat (u16) device number, just
3996 	 * one that will pass the old_valid_dev() check. */
3998 	return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * getattr entry point (two signature variants by kernel version).
 * Revalidates UPDATE|LOOKUP metadata from the MDT, then fills *stat
 * from the inode; 32-bit API callers get a synthesized ino and
 * compat-encoded device numbers.
 */
4001 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4002 int ll_getattr(const struct path *path, struct kstat *stat,
4003 	       u32 request_mask, unsigned int flags)
4006 	struct dentry *de = path->dentry;
4008 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4011 	struct inode *inode = de->d_inode;
4012 	struct ll_sb_info *sbi = ll_i2sbi(inode);
4013 	struct ll_inode_info *lli = ll_i2info(inode);
4016 	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4017 				      MDS_INODELOCK_LOOKUP);
4018 	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
	/* fault-injection point used by sanity tests */
4023 	OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4025 	if (ll_need_32bit_api(sbi)) {
4026 		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4027 		stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4028 		stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4030 		stat->ino = inode->i_ino;
4031 		stat->dev = inode->i_sb->s_dev;
4032 		stat->rdev = inode->i_rdev;
4035 	stat->mode = inode->i_mode;
4036 	stat->uid = inode->i_uid;
4037 	stat->gid = inode->i_gid;
4038 	stat->atime = inode->i_atime;
4039 	stat->mtime = inode->i_mtime;
4040 	stat->ctime = inode->i_ctime;
	/* prefer the tunable stat blocksize when the admin set one */
4041 	stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4043 	stat->nlink = inode->i_nlink;
4044 	stat->size = i_size_read(inode);
4045 	stat->blocks = inode->i_blocks;
/*
 * FIEMAP ioctl backend: translate the kernel's fiemap_extent_info into a
 * struct fiemap request, run it via ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer.
 * NOTE(review): only the first extent is copied in from userspace below;
 * this appears intentional (it seeds fm_extents[0]) — confirm against
 * ll_do_fiemap()'s contract before changing.
 */
4050 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4051 __u64 start, __u64 len)
4055 struct fiemap *fiemap;
4056 unsigned int extent_count = fieinfo->fi_extents_max;
/* request buffer: header plus one fiemap_extent per requested extent */
4058 num_bytes = sizeof(*fiemap) + (extent_count *
4059 sizeof(struct fiemap_extent));
4060 OBD_ALLOC_LARGE(fiemap, num_bytes);
4065 fiemap->fm_flags = fieinfo->fi_flags;
4066 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4067 fiemap->fm_start = start;
4068 fiemap->fm_length = len;
4069 if (extent_count > 0 &&
4070 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4071 sizeof(struct fiemap_extent)) != 0)
4072 GOTO(out, rc = -EFAULT);
4074 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* propagate result flags and mapped-extent count back to the caller */
4076 fieinfo->fi_flags = fiemap->fm_flags;
4077 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4078 if (extent_count > 0 &&
4079 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4080 fiemap->fm_mapped_extents *
4081 sizeof(struct fiemap_extent)) != 0)
4082 GOTO(out, rc = -EFAULT);
4084 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl() handler: return a referenced copy of the cached POSIX ACL.
 * The lli_lock only protects the duplication; the VFS drops the reference.
 */
4088 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4090 struct ll_inode_info *lli = ll_i2info(inode);
4091 struct posix_acl *acl = NULL;
4094 spin_lock(&lli->lli_lock);
4095 /* VFS' acl_permission_check->check_acl will release the refcount */
4096 acl = posix_acl_dup(lli->lli_posix_acl);
4097 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl() handler (only built when the kernel has the set_acl inode
 * op and POSIX ACL support). Serializes the ACL to its xattr form and
 * stores it as the access/default ACL xattr, then updates the VFS ACL
 * cache. An ACCESS ACL also updates the file mode via
 * posix_acl_update_mode(); a DEFAULT ACL is only valid on directories.
 */
4102 #ifdef HAVE_IOP_SET_ACL
4103 #ifdef CONFIG_FS_POSIX_ACL
4104 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4106 const char *name = NULL;
4113 case ACL_TYPE_ACCESS:
/* may clear the ACL entirely and fold it into i_mode */
4115 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4119 name = XATTR_NAME_POSIX_ACL_ACCESS;
4121 case ACL_TYPE_DEFAULT:
4122 if (!S_ISDIR(inode->i_mode))
4123 GOTO(out, rc = acl ? -EACCES : 0);
4124 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4127 GOTO(out, rc = -EINVAL);
4131 size = posix_acl_xattr_size(acl->a_count);
4132 value = kmalloc(size, GFP_NOFS);
4134 GOTO(out, rc = -ENOMEM);
/* serialize the in-memory ACL into xattr wire format */
4136 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4141 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4142 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* keep the VFS ACL cache coherent with the stored xattr */
4147 set_cached_acl(inode, type, acl);
4149 forget_cached_acl(inode, type);
4152 #endif /* CONFIG_FS_POSIX_ACL */
4153 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL permission callback used by generic_permission() on kernels without
 * the 2-arg variant. Returns the result of posix_acl_permission() on the
 * inode's access ACL; on RCU-walk (IPERM_FLAG_RCU) it must not block.
 * Without CONFIG_FS_POSIX_ACL this compiles to a stub (body elided here).
 */
4155 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4157 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4158 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4160 ll_check_acl(struct inode *inode, int mask)
4163 # ifdef CONFIG_FS_POSIX_ACL
4164 struct posix_acl *acl;
4168 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot take sleeping locks during RCU path walk */
4169 if (flags & IPERM_FLAG_RCU)
4172 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4177 rc = posix_acl_permission(inode, acl, mask);
4178 posix_acl_release(acl);
4181 # else /* !CONFIG_FS_POSIX_ACL */
4183 # endif /* CONFIG_FS_POSIX_ACL */
4185 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() handler. Revalidates the root inode (which lookup does
 * not), applies root squash by temporarily overriding the task creds
 * (fsuid/fsgid and FS capabilities), then defers the actual check to
 * generic_permission() with our ACL callback. Three signatures cover the
 * kernel API variants.
 */
4187 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4188 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4190 # ifdef HAVE_INODE_PERMISION_2ARGS
4191 int ll_inode_permission(struct inode *inode, int mask)
4193 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4198 struct ll_sb_info *sbi;
4199 struct root_squash_info *squash;
4200 struct cred *cred = NULL;
4201 const struct cred *old_cred = NULL;
4203 bool squash_id = false;
/* bail out of non-blocking (RCU-walk) checks: revalidation may sleep */
4206 #ifdef MAY_NOT_BLOCK
4207 if (mask & MAY_NOT_BLOCK)
4209 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4210 if (flags & IPERM_FLAG_RCU)
4214 /* as root inode are NOT getting validated in lookup operation,
4215 * need to do it before permission check. */
4217 if (inode == inode->i_sb->s_root->d_inode) {
4218 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4219 MDS_INODELOCK_LOOKUP);
4224 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4225 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4227 /* squash fsuid/fsgid if needed */
4228 sbi = ll_i2sbi(inode);
4229 squash = &sbi->ll_squash;
/* squash only when configured, caller is root, and not exempted */
4230 if (unlikely(squash->rsi_uid != 0 &&
4231 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4232 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4236 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4237 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4238 squash->rsi_uid, squash->rsi_gid);
4240 /* update current process's credentials
4241 * and FS capability */
4242 cred = prepare_creds();
4246 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4247 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities from the squashed creds */
4248 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4249 if ((1 << cap) & CFS_CAP_FS_MASK)
4250 cap_lower(cred->cap_effective, cap);
4252 old_cred = override_creds(cred);
4255 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4256 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4257 /* restore current process's credentials and FS capability */
4259 revert_creds(old_cred);
4266 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the VFS falls back to
 * local (node-only) flock semantics. Read/write entries depend on which
 * VFS read/write API the kernel provides. */
4267 struct file_operations ll_file_operations = {
4268 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4269 # ifdef HAVE_SYNC_READ_WRITE
4270 .read = new_sync_read,
4271 .write = new_sync_write,
4273 .read_iter = ll_file_read_iter,
4274 .write_iter = ll_file_write_iter,
4275 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4276 .read = ll_file_read,
4277 .aio_read = ll_file_aio_read,
4278 .write = ll_file_write,
4279 .aio_write = ll_file_aio_write,
4280 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4281 .unlocked_ioctl = ll_file_ioctl,
4282 .open = ll_file_open,
4283 .release = ll_file_release,
4284 .mmap = ll_file_mmap,
4285 .llseek = ll_file_seek,
4286 .splice_read = ll_file_splice_read,
/* file_operations for -o flock: identical to ll_file_operations except
 * .flock/.lock route through ll_file_flock for cluster-wide locking. */
4291 struct file_operations ll_file_operations_flock = {
4292 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4293 # ifdef HAVE_SYNC_READ_WRITE
4294 .read = new_sync_read,
4295 .write = new_sync_write,
4296 # endif /* HAVE_SYNC_READ_WRITE */
4297 .read_iter = ll_file_read_iter,
4298 .write_iter = ll_file_write_iter,
4299 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4300 .read = ll_file_read,
4301 .aio_read = ll_file_aio_read,
4302 .write = ll_file_write,
4303 .aio_write = ll_file_aio_write,
4304 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4305 .unlocked_ioctl = ll_file_ioctl,
4306 .open = ll_file_open,
4307 .release = ll_file_release,
4308 .mmap = ll_file_mmap,
4309 .llseek = ll_file_seek,
4310 .splice_read = ll_file_splice_read,
/* cluster-coherent flock and POSIX lock handlers */
4313 .flock = ll_file_flock,
4314 .lock = ll_file_flock
4317 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock: same table as the default, but
 * .flock/.lock point at ll_file_noflock so locking calls fail. */
4318 struct file_operations ll_file_operations_noflock = {
4319 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4320 # ifdef HAVE_SYNC_READ_WRITE
4321 .read = new_sync_read,
4322 .write = new_sync_write,
4323 # endif /* HAVE_SYNC_READ_WRITE */
4324 .read_iter = ll_file_read_iter,
4325 .write_iter = ll_file_write_iter,
4326 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4327 .read = ll_file_read,
4328 .aio_read = ll_file_aio_read,
4329 .write = ll_file_write,
4330 .aio_write = ll_file_aio_write,
4331 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4332 .unlocked_ioctl = ll_file_ioctl,
4333 .open = ll_file_open,
4334 .release = ll_file_release,
4335 .mmap = ll_file_mmap,
4336 .llseek = ll_file_seek,
4337 .splice_read = ll_file_splice_read,
4340 .flock = ll_file_noflock,
4341 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL entries are conditional
 * on the kernel providing the corresponding inode ops. */
4344 struct inode_operations ll_file_inode_operations = {
4345 .setattr = ll_setattr,
4346 .getattr = ll_getattr,
4347 .permission = ll_inode_permission,
4348 #ifdef HAVE_IOP_XATTR
4349 .setxattr = ll_setxattr,
4350 .getxattr = ll_getxattr,
4351 .removexattr = ll_removexattr,
4353 .listxattr = ll_listxattr,
4354 .fiemap = ll_fiemap,
4355 #ifdef HAVE_IOP_GET_ACL
4356 .get_acl = ll_get_acl,
4358 #ifdef HAVE_IOP_SET_ACL
4359 .set_acl = ll_set_acl,
/*
 * Push a layout configuration @conf down to the cl_object stack for
 * @inode. For OBJECT_CONF_SET, the layout must be applied before the
 * DLM layout lock is allowed to match, and the inode's cached layout
 * generation is refreshed from the object afterwards.
 */
4363 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4365 struct ll_inode_info *lli = ll_i2info(inode);
4366 struct cl_object *obj = lli->lli_clob;
4375 env = cl_env_get(&refcheck);
4377 RETURN(PTR_ERR(env));
4379 rc = cl_conf_set(env, lli->lli_clob, conf);
4383 if (conf->coc_opc == OBJECT_CONF_SET) {
4384 struct ldlm_lock *lock = conf->coc_lock;
4385 struct cl_layout cl = {
4389 LASSERT(lock != NULL);
4390 LASSERT(ldlm_has_layout(lock));
4392 /* it can only be allowed to match after layout is
4393 * applied to inode otherwise false layout would be
4394 * seen. Applying layout should happen before dropping
4395 * the intent lock. */
4396 ldlm_lock_allow_match(lock);
/* read back the freshly-applied layout to learn its generation */
4398 rc = cl_object_layout_get(env, obj, &cl);
4403 DFID": layout version change: %u -> %u\n",
4404 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4406 ll_layout_version_set(lli, cl.cl_layout_gen);
4410 cl_env_put(env, &refcheck);
4415 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * If @lock does not already carry layout LVB data, fetch the LOV layout
 * xattr from the MDT and attach it to the lock as LVB_T_LAYOUT data.
 * If another thread raced us and installed LVB data first, our copy is
 * freed. An empty layout (eadatasize == 0) is not an error.
 */
4416 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4419 struct ll_sb_info *sbi = ll_i2sbi(inode);
4420 struct ptlrpc_request *req;
4421 struct mdt_body *body;
4428 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4429 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4430 lock->l_lvb_data, lock->l_lvb_len);
/* already populated: nothing to fetch */
4432 if (lock->l_lvb_data != NULL)
4435 /* if layout lock was granted right away, the layout is returned
4436 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4437 * blocked and then granted via completion ast, we have to fetch
4438 * layout here. Please note that we can't use the LVB buffer in
4439 * completion AST because it doesn't have a large enough buffer */
4440 rc = ll_get_default_mdsize(sbi, &lmmsize);
4442 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4443 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4448 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4450 GOTO(out, rc = -EPROTO);
4452 lmmsize = body->mbo_eadatasize;
4453 if (lmmsize == 0) /* empty layout */
4456 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4458 GOTO(out, rc = -EFAULT);
/* copy the layout out of the RPC reply; it outlives the request */
4460 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4461 if (lvbdata == NULL)
4462 GOTO(out, rc = -ENOMEM);
4464 memcpy(lvbdata, lmm, lmmsize);
4465 lock_res_and_lock(lock);
/* install only if no one beat us to it under the lock */
4466 if (unlikely(lock->l_lvb_data == NULL)) {
4467 lock->l_lvb_type = LVB_T_LAYOUT;
4468 lock->l_lvb_data = lvbdata;
4469 lock->l_lvb_len = lmmsize;
4472 unlock_res_and_lock(lock);
/* lost the race: free our duplicate buffer */
4475 OBD_FREE_LARGE(lvbdata, lmmsize);
4480 ptlrpc_req_finished(req);
4485 * Apply the layout to the inode. Layout lock is held and will be released
4488 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4489 struct inode *inode)
4491 struct ll_inode_info *lli = ll_i2info(inode);
4492 struct ll_sb_info *sbi = ll_i2sbi(inode);
4493 struct ldlm_lock *lock;
4494 struct cl_object_conf conf;
4497 bool wait_layout = false;
4500 LASSERT(lustre_handle_is_used(lockh));
4502 lock = ldlm_handle2lock(lockh);
4503 LASSERT(lock != NULL);
4504 LASSERT(ldlm_has_layout(lock));
4506 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4507 PFID(&lli->lli_fid), inode);
4509 /* in case this is a caching lock and reinstate with new inode */
4510 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4512 lock_res_and_lock(lock);
4513 lvb_ready = ldlm_is_lvb_ready(lock);
4514 unlock_res_and_lock(lock);
4516 /* checking lvb_ready is racy but this is okay. The worst case is
4517 * that multi processes may configure the file on the same time. */
/* make sure the lock carries the layout blob before applying it */
4521 rc = ll_layout_fetch(inode, lock);
4525 /* for layout lock, lmm is stored in lock's lvb.
4526 * lvb_data is immutable if the lock is held so it's safe to access it
4529 * set layout to file. Unlikely this will fail as old layout was
4530 * surely eliminated */
4531 memset(&conf, 0, sizeof conf);
4532 conf.coc_opc = OBJECT_CONF_SET;
4533 conf.coc_inode = inode;
4534 conf.coc_lock = lock;
4535 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4536 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4537 rc = ll_layout_conf(inode, &conf);
4539 /* refresh layout failed, need to wait */
4540 wait_layout = rc == -EBUSY;
4543 LDLM_LOCK_PUT(lock);
4544 ldlm_lock_decref(lockh, mode);
4546 /* wait for IO to complete if it's still being used. */
4548 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4549 ll_get_fsname(inode->i_sb, NULL, 0),
4550 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO drops the old layout */
4552 memset(&conf, 0, sizeof conf);
4553 conf.coc_opc = OBJECT_CONF_WAIT;
4554 conf.coc_inode = inode;
4555 rc = ll_layout_conf(inode, &conf);
4559 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4560 ll_get_fsname(inode->i_sb, NULL, 0),
4561 PFID(&lli->lli_fid), rc);
4567 * Issue layout intent RPC to MDS.
4568 * \param inode [in] file inode
4569 * \param intent [in] layout intent
4571 * \retval 0 on success
4572 * \retval < 0 error code
4574 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4576 struct ll_inode_info *lli = ll_i2info(inode);
4577 struct ll_sb_info *sbi = ll_i2sbi(inode);
4578 struct md_op_data *op_data;
4579 struct lookup_intent it;
4580 struct ptlrpc_request *req;
4584 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4585 0, 0, LUSTRE_OPC_ANY, NULL);
4586 if (IS_ERR(op_data))
4587 RETURN(PTR_ERR(op_data));
/* ship the layout intent as opaque op_data in the intent RPC */
4589 op_data->op_data = intent;
4590 op_data->op_data_size = sizeof(*intent);
4592 memset(&it, 0, sizeof(it));
4593 it.it_op = IT_LAYOUT;
/* write/truncate intents need a write-mode layout lock */
4594 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4595 intent->li_opc == LAYOUT_INTENT_TRUNC)
4596 it.it_flags = FMODE_WRITE;
4598 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4599 ll_get_fsname(inode->i_sb, NULL, 0),
4600 PFID(&lli->lli_fid), inode);
4602 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4603 &ll_md_blocking_ast, 0);
4604 if (it.it_request != NULL)
4605 ptlrpc_req_finished(it.it_request);
4606 it.it_request = NULL;
4608 ll_finish_md_op_data(op_data);
4610 /* set lock data in case this is a new lock */
4612 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4614 ll_intent_drop_lock(&it);
4620 * This function checks if there exists a LAYOUT lock on the client side,
4621 * or enqueues it if it doesn't have one in cache.
4623 * This function will not hold layout lock so it may be revoked any time after
4624 * this function returns. Any operations depend on layout should be redone
4627 * This function should be called before lov_io_init() to get an uptodate
4628 * layout version, the caller should save the version number and after IO
4629 * is finished, this function should be called again to verify that layout
4630 * is not changed during IO time.
4632 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4634 struct ll_inode_info *lli = ll_i2info(inode);
4635 struct ll_sb_info *sbi = ll_i2sbi(inode);
4636 struct lustre_handle lockh;
4637 struct layout_intent intent = {
4638 .li_opc = LAYOUT_INTENT_ACCESS,
4640 enum ldlm_mode mode;
/* fast path: layout lock disabled, or a valid generation is cached */
4644 *gen = ll_layout_version_get(lli);
4645 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4649 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4650 LASSERT(S_ISREG(inode->i_mode));
4652 /* take layout lock mutex to enqueue layout lock exclusively. */
4653 mutex_lock(&lli->lli_layout_mutex);
4656 /* mostly layout lock is caching on the local side, so try to
4657 * match it before grabbing layout lock mutex. */
4658 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4659 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4660 if (mode != 0) { /* hit cached lock */
4661 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue one from the MDS via a layout intent */
4667 rc = ll_layout_intent(inode, &intent);
4673 *gen = ll_layout_version_get(lli);
4674 mutex_unlock(&lli->lli_layout_mutex);
4680 * Issue layout intent RPC indicating where in a file an IO is about to write.
4682 * \param[in] inode file inode.
4683 * \param[in] start start offset of fille in bytes where an IO is about to
4685 * \param[in] end exclusive end offset in bytes of the write range.
4687 * \retval 0 on success
4688 * \retval < 0 error code
4690 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4692 struct layout_intent intent = {
4693 .li_opc = LAYOUT_INTENT_WRITE,
/* delegate to the generic layout-intent RPC path */
4700 rc = ll_layout_intent(inode, &intent);
4706 * This function send a restore request to the MDT
4708 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4710 struct hsm_user_request *hur;
4714 len = sizeof(struct hsm_user_request) +
4715 sizeof(struct hsm_user_item);
4716 OBD_ALLOC(hur, len);
4720 hur->hur_request.hr_action = HUA_RESTORE;
4721 hur->hur_request.hr_archive_id = 0;
4722 hur->hur_request.hr_flags = 0;
4723 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4724 sizeof(hur->hur_user_item[0].hui_fid));
4725 hur->hur_user_item[0].hui_extent.offset = offset;
4726 hur->hur_user_item[0].hui_extent.length = length;
4727 hur->hur_request.hr_itemcount = 1;
4728 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,