4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate and minimally initialize a per-open-file ll_file_data from its
 * slab cache.  GFP_NOFS prevents reclaim from re-entering the filesystem.
 * NOTE(review): intermediate lines are elided here; allocation-failure
 * handling and the return are not visible. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
82  * Packs all the attributes into @op_data for the CLOSE rpc.
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot current inode attributes (mode, timestamps, size) so the MDT
 * sees the client's final view of the file at close time. */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* The *_SET flags tell the server to use the packed timestamps verbatim
 * rather than its own clock. */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
114  * Perform a close, possibly with a bias.
115  * The meaning of "data" depends on the value of "bias".
117  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* Sanity: the MDC export must still be connected, or there is nobody to
 * send the close RPC to. */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing; NOTE(review): the switch statement line itself is
 * elided from this view. */
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough: MERGE reuses the SWAP packing (lease handle + victim fid)
 * below */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_CLOSE_RESYNC_DONE: {
160 struct ll_ioc_lease *ioc = data;
162 LASSERT(data != NULL);
163 op_data->op_attr_blocks +=
164 ioc->lil_count * op_data->op_attr_blocks;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
168 op_data->op_lease_handle = och->och_lease_handle;
169 op_data->op_data = &ioc->lil_ids[0];
170 op_data->op_data_size =
171 ioc->lil_count * sizeof(ioc->lil_ids[0]);
175 case MDS_HSM_RELEASE:
176 LASSERT(data != NULL);
177 op_data->op_bias |= MDS_HSM_RELEASE;
178 op_data->op_data_version = *(__u64 *)data;
179 op_data->op_lease_handle = och->och_lease_handle;
180 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default case: a plain close carries no payload */
184 LASSERT(data == NULL);
188 rc = md_close(md_exp, op_data, och->och_mod, &req);
189 if (rc != 0 && rc != -EINTR)
190 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
191 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If a bias was requested, check the reply to see whether the server
 * actually executed the close intent. */
193 if (rc == 0 && op_data->op_bias & bias) {
194 struct mdt_body *body;
196 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
197 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
201 ll_finish_md_op_data(op_data);
/* Poison the handle so stale users are detectable. */
205 md_clear_open_replay_data(md_exp, och);
206 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
209 ptlrpc_req_finished(req); /* This is close request */
/* Really close the MDS open handle for the given open mode, unless other
 * users of the same handle remain.  Selects the per-mode handle/usecount
 * pair (write/exec/read) under lli_och_mutex. */
213 int ll_md_real_close(struct inode *inode, fmode_t fmode)
215 struct ll_inode_info *lli = ll_i2info(inode);
216 struct obd_client_handle **och_p;
217 struct obd_client_handle *och;
222 if (fmode & FMODE_WRITE) {
223 och_p = &lli->lli_mds_write_och;
224 och_usecount = &lli->lli_open_fd_write_count;
225 } else if (fmode & FMODE_EXEC) {
226 och_p = &lli->lli_mds_exec_och;
227 och_usecount = &lli->lli_open_fd_exec_count;
229 LASSERT(fmode & FMODE_READ);
230 och_p = &lli->lli_mds_read_och;
231 och_usecount = &lli->lli_open_fd_read_count;
234 mutex_lock(&lli->lli_och_mutex);
235 if (*och_usecount > 0) {
236 /* There are still users of this handle, so skip
238 mutex_unlock(&lli->lli_och_mutex);
244 mutex_unlock(&lli->lli_och_mutex);
247 /* There might be a race and this handle may already
249 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and lease if held, drop the
 * per-mode open refcount, and only talk to the MDS when we do not hold a
 * matching cached OPEN lock (md_lock_match with LDLM_FL_TEST_LOCK). */
255 static int ll_md_close(struct inode *inode, struct file *file)
257 union ldlm_policy_data policy = {
258 .l_inodebits = { MDS_INODELOCK_OPEN },
260 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
261 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
262 struct ll_inode_info *lli = ll_i2info(inode);
263 struct lustre_handle lockh;
264 enum ldlm_mode lockmode;
268 /* clear group lock, if present */
269 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
270 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
272 if (fd->fd_lease_och != NULL) {
275 /* Usually the lease is not released when the
276 * application crashed, we need to release here. */
277 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
278 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
279 PFID(&lli->lli_fid), rc, lease_broken);
281 fd->fd_lease_och = NULL;
284 if (fd->fd_och != NULL) {
285 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
290 /* Let's see if we have good enough OPEN lock on the file and if
291 we can skip talking to MDS */
292 mutex_lock(&lli->lli_och_mutex);
293 if (fd->fd_omode & FMODE_WRITE) {
295 LASSERT(lli->lli_open_fd_write_count);
296 lli->lli_open_fd_write_count--;
297 } else if (fd->fd_omode & FMODE_EXEC) {
299 LASSERT(lli->lli_open_fd_exec_count);
300 lli->lli_open_fd_exec_count--;
303 LASSERT(lli->lli_open_fd_read_count);
304 lli->lli_open_fd_read_count--;
306 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock for this mode -> must do the real MDS close. */
308 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
309 LDLM_IBITS, &policy, lockmode, &lockh))
310 rc = ll_md_real_close(inode, fd->fd_omode);
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
319 /* While this returns an error code, fput() the caller does not, so we need
320 * to make every effort to clean up all of our state here. Also, applications
321 * rarely check close errors and even if an error is returned they will not
322 * re-try the close call.
324 int ll_file_release(struct inode *inode, struct file *file)
326 struct ll_file_data *fd;
327 struct ll_sb_info *sbi = ll_i2sbi(inode);
328 struct ll_inode_info *lli = ll_i2info(inode);
332 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
333 PFID(ll_inode2fid(inode)), inode);
/* Do not account releases of the filesystem root in the stats. */
335 if (inode->i_sb->s_root != file_dentry(file))
336 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
337 fd = LUSTRE_FPRIVATE(file);
340 /* The last ref on @file, maybe not the the owner pid of statahead,
341 * because parent and child process can share the same file handle. */
342 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
343 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing was opened on the MDS, just free local state. */
345 if (inode->i_sb->s_root == file_dentry(file)) {
346 LUSTRE_FPRIVATE(file) = NULL;
347 ll_file_data_put(fd);
/* For regular files, fold any async write errors into the close rc. */
351 if (!S_ISDIR(inode->i_mode)) {
352 if (lli->lli_clob != NULL)
353 lov_read_and_clear_async_rc(lli->lli_clob);
354 lli->lli_async_rc = 0;
357 rc = ll_md_close(inode, file);
359 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
360 libcfs_debug_dumplog();
/* Send an intent-OPEN enqueue to the MDS for @de (open-by-fid path) and set
 * up the resulting inode/lock state.  @lmm/@lmmsize optionally carry a
 * striping layout to pack into the request. */
365 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
366 struct lookup_intent *itp)
368 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
369 struct dentry *parent = de->d_parent;
370 const char *name = NULL;
372 struct md_op_data *op_data;
373 struct ptlrpc_request *req = NULL;
377 LASSERT(parent != NULL);
378 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
380 /* if server supports open-by-fid, or file name is invalid, don't pack
381 * name in open request */
382 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
383 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
384 name = de->d_name.name;
385 len = de->d_name.len;
388 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
389 name, len, 0, LUSTRE_OPC_ANY, NULL);
391 RETURN(PTR_ERR(op_data));
392 op_data->op_data = lmm;
393 op_data->op_data_size = lmmsize;
395 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
396 &ll_md_blocking_ast, 0);
397 ll_finish_md_op_data(op_data);
399 /* reason for keep own exit path - don`t flood log
400 * with messages with -ESTALE errors.
/* NOTE(review): error-path lines are elided here; the openhandle is
 * released when the open was granted but unusable. */
402 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
403 it_open_error(DISP_OPEN_OPEN, itp))
405 ll_release_openhandle(de, itp);
409 if (it_disposition(itp, DISP_LOOKUP_NEG))
410 GOTO(out, rc = -ENOENT);
412 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
413 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
414 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
418 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
419 if (!rc && itp->it_lock_mode)
420 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
423 ptlrpc_req_finished(req);
424 ll_intent_drop_lock(itp);
426 /* We did open by fid, but by the time we got to the server,
427 * the object disappeared. If this is a create, we cannot really
428 * tell the userspace that the file it was trying to create
429 * does not exist. Instead let's return -ESTALE, and the VFS will
430 * retry the create with LOOKUP_REVAL that we are going to catch
431 * in ll_revalidate_dentry() and use lookup then.
433 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply body of a completed
 * intent (fh, fid, lease lock handle, flags) and register it for open
 * replay after MDS recovery. */
439 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
440 struct obd_client_handle *och)
442 struct mdt_body *body;
444 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
445 och->och_fh = body->mbo_handle;
446 och->och_fid = body->mbo_fid1;
447 och->och_lease_handle.cookie = it->it_lock_handle;
448 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
449 och->och_flags = it->it_flags;
451 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: attach @fd as the file's private
 * data, initialize readahead and cl_context state, and (if @och is given)
 * fill it from the intent reply. */
454 static int ll_local_open(struct file *file, struct lookup_intent *it,
455 struct ll_file_data *fd, struct obd_client_handle *och)
457 struct inode *inode = file_inode(file);
460 LASSERT(!LUSTRE_FPRIVATE(file));
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
472 LUSTRE_FPRIVATE(file) = fd;
473 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits; used later to pick the right
 * per-mode MDS open handle at close time. */
474 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
476 /* ll_cl_context initialize */
477 rwlock_init(&fd->fd_lock);
478 INIT_LIST_HEAD(&fd->fd_lccs);
483 /* Open a file, and (for the very first open) create objects on the OSTs at
484 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
485 * creation or open until ll_lov_setstripe() ioctl is called.
487 * If we already have the stripe MD locally then we don't request it in
488 * md_open(), by passing a lmm_size = 0.
490 * It is up to the application to ensure no other processes open this file
491 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
492 * used. We might be able to avoid races of that sort by getting lli_open_sem
493 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
494 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
496 int ll_file_open(struct inode *inode, struct file *file)
498 struct ll_inode_info *lli = ll_i2info(inode);
499 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
500 .it_flags = file->f_flags };
501 struct obd_client_handle **och_p = NULL;
502 __u64 *och_usecount = NULL;
503 struct ll_file_data *fd;
507 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
508 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path; consume it. */
510 it = file->private_data; /* XXX: compat macro */
511 file->private_data = NULL; /* prevent ll_local_open assertion */
513 fd = ll_file_data_get();
515 GOTO(out_openerr, rc = -ENOMEM);
518 if (S_ISDIR(inode->i_mode))
519 ll_authorize_statahead(inode, fd);
/* Opening the fs root needs no MDS open; attach fd and we are done. */
521 if (inode->i_sb->s_root == file_dentry(file)) {
522 LUSTRE_FPRIVATE(file) = fd;
526 if (!it || !it->it_disposition) {
527 /* Convert f_flags into access mode. We cannot use file->f_mode,
528 * because everything but O_ACCMODE mask was stripped from
530 if ((oit.it_flags + 1) & O_ACCMODE)
532 if (file->f_flags & O_TRUNC)
533 oit.it_flags |= FMODE_WRITE;
535 /* kernel only call f_op->open in dentry_open. filp_open calls
536 * dentry_open after call to open_namei that checks permissions.
537 * Only nfsd_open call dentry_open directly without checking
538 * permissions and because of that this code below is safe. */
539 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
540 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
542 /* We do not want O_EXCL here, presumably we opened the file
543 * already? XXX - NFS implications? */
544 oit.it_flags &= ~O_EXCL;
546 /* bug20584, if "it_flags" contains O_CREAT, the file will be
547 * created if necessary, then "IT_CREAT" should be set to keep
548 * consistent with it */
549 if (oit.it_flags & O_CREAT)
550 oit.it_op |= IT_CREAT;
556 /* Let's see if we have file open on MDS already. */
557 if (it->it_flags & FMODE_WRITE) {
558 och_p = &lli->lli_mds_write_och;
559 och_usecount = &lli->lli_open_fd_write_count;
560 } else if (it->it_flags & FMODE_EXEC) {
561 och_p = &lli->lli_mds_exec_och;
562 och_usecount = &lli->lli_open_fd_exec_count;
564 och_p = &lli->lli_mds_read_och;
565 och_usecount = &lli->lli_open_fd_read_count;
568 mutex_lock(&lli->lli_och_mutex);
569 if (*och_p) { /* Open handle is present */
570 if (it_disposition(it, DISP_OPEN_OPEN)) {
571 /* Well, there's extra open request that we do not need,
572 let's close it somehow. This will decref request. */
573 rc = it_open_error(DISP_OPEN_OPEN, it);
575 mutex_unlock(&lli->lli_och_mutex);
576 GOTO(out_openerr, rc);
579 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS open handle for this descriptor. */
583 rc = ll_local_open(file, it, fd, NULL);
586 mutex_unlock(&lli->lli_och_mutex);
587 GOTO(out_openerr, rc);
590 LASSERT(*och_usecount == 0);
591 if (!it->it_disposition) {
592 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
593 /* We cannot just request lock handle now, new ELC code
594 means that one of other OPEN locks for this file
595 could be cancelled, and since blocking ast handler
596 would attempt to grab och_mutex as well, that would
597 result in a deadlock */
598 mutex_unlock(&lli->lli_och_mutex);
600 * Normally called under two situations:
602 * 2. A race/condition on MDS resulting in no open
603 * handle to be returned from LOOKUP|OPEN request,
604 * for example if the target entry was a symlink.
606 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
607 * marked by a bit set in ll_iget_for_nfs. Clear the
608 * bit so that it's not confusing later callers.
610 * NB; when ldd is NULL, it must have come via normal
611 * lookup path only, since ll_iget_for_nfs always calls
614 if (ldd && ldd->lld_nfs_dentry) {
615 ldd->lld_nfs_dentry = 0;
616 it->it_flags |= MDS_OPEN_LOCK;
620 * Always specify MDS_OPEN_BY_FID because we don't want
621 * to get file with different fid.
623 it->it_flags |= MDS_OPEN_BY_FID;
624 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
627 GOTO(out_openerr, rc);
631 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
633 GOTO(out_och_free, rc = -ENOMEM);
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 GOTO(out_och_free, rc);
646 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
647 "inode %p: disposition %x, status %d\n", inode,
648 it_disposition(it, ~0), it->it_status);
650 rc = ll_local_open(file, it, fd, *och_p);
652 GOTO(out_och_free, rc);
654 mutex_unlock(&lli->lli_och_mutex);
657 /* Must do this outside lli_och_mutex lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
661 GOTO(out_och_free, rc);
663 cl_lov_delay_create_clear(&file->f_flags);
664 GOTO(out_och_free, rc);
/* Error/cleanup paths below; NOTE(review): the out_och_free/out_openerr
 * labels themselves are elided from this view. */
668 if (och_p && *och_p) {
669 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
670 *och_p = NULL; /* OBD_FREE writes some magic there */
673 mutex_unlock(&lli->lli_och_mutex);
676 if (lli->lli_opendir_key == fd)
677 ll_deauthorize_statahead(inode, fd);
679 ll_file_data_put(fd);
681 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
684 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
685 ptlrpc_req_finished(it->it_request);
686 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, asynchronously cancel the
 * lease lock.  Unlike ll_md_blocking_ast() it does not touch the
 * openhandle (see comment at the ll_lease_open() enqueue site). */
692 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
693 struct ldlm_lock_desc *desc, void *data, int flag)
696 struct lustre_handle lockh;
700 case LDLM_CB_BLOCKING:
701 ldlm_lock2handle(lock, &lockh);
702 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
704 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
708 case LDLM_CB_CANCELING:
716  * When setting a lease on a file, we take ownership of the lli_mds_*_och
717  * and save it as fd->fd_och so as to force client to reopen the file even
718  * if it has an open lock in cache already.
720 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
721 struct lustre_handle *old_handle)
723 struct ll_inode_info *lli = ll_i2info(inode);
724 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
725 struct obd_client_handle **och_p;
730 /* Get the openhandle of the file */
731 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
732 if (fd->fd_lease_och != NULL)
733 GOTO(out_unlock, rc = -EBUSY);
735 if (fd->fd_och == NULL) {
736 if (file->f_mode & FMODE_WRITE) {
737 LASSERT(lli->lli_mds_write_och != NULL);
738 och_p = &lli->lli_mds_write_och;
739 och_usecount = &lli->lli_open_fd_write_count;
741 LASSERT(lli->lli_mds_read_och != NULL);
742 och_p = &lli->lli_mds_read_och;
743 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot steal the handle while other descriptors still use it. */
746 if (*och_usecount > 1)
747 GOTO(out_unlock, rc = -EBUSY);
754 *old_handle = fd->fd_och->och_fh;
758 mutex_unlock(&lli->lli_och_mutex);
763  * Release ownership on lli_mds_*_och when putting back a file lease.
765 static int ll_lease_och_release(struct inode *inode, struct file *file)
767 struct ll_inode_info *lli = ll_i2info(inode);
768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
769 struct obd_client_handle **och_p;
770 struct obd_client_handle *old_och = NULL;
775 mutex_lock(&lli->lli_och_mutex);
776 if (file->f_mode & FMODE_WRITE) {
777 och_p = &lli->lli_mds_write_och;
778 och_usecount = &lli->lli_open_fd_write_count;
780 och_p = &lli->lli_mds_read_och;
781 och_usecount = &lli->lli_open_fd_read_count;
784 /* The file may have been open by another process (broken lease) so
785 * *och_p is not NULL. In this case we should simply increase usecount
788 if (*och_p != NULL) {
789 old_och = fd->fd_och;
796 mutex_unlock(&lli->lli_och_mutex);
/* Close the duplicate handle outside the mutex to avoid lock inversion. */
799 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
805  * Acquire a lease and open the file.
807 static struct obd_client_handle *
808 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
811 struct lookup_intent it = { .it_op = IT_OPEN };
812 struct ll_sb_info *sbi = ll_i2sbi(inode);
813 struct md_op_data *op_data;
814 struct ptlrpc_request *req = NULL;
815 struct lustre_handle old_handle = { 0 };
816 struct obd_client_handle *och = NULL;
/* Leases are exclusive per mode: exactly one of read/write. */
821 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
822 RETURN(ERR_PTR(-EINVAL));
/* The descriptor must already be open in the requested mode and must not
 * be an exec open. */
825 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
826 RETURN(ERR_PTR(-EPERM));
828 rc = ll_lease_och_acquire(inode, file, &old_handle);
835 RETURN(ERR_PTR(-ENOMEM));
837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
838 LUSTRE_OPC_ANY, NULL);
840 GOTO(out, rc = PTR_ERR(op_data));
842 /* To tell the MDT this openhandle is from the same owner */
843 op_data->op_handle = old_handle;
845 it.it_flags = fmode | open_flags;
846 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
847 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
848 &ll_md_blocking_lease_ast,
849 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
850 * it can be cancelled which may mislead applications that the lease is
852 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
853 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
854 * doesn't deal with openhandle, so normal openhandle will be leaked. */
855 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
856 ll_finish_md_op_data(op_data);
857 ptlrpc_req_finished(req);
859 GOTO(out_release_it, rc);
861 if (it_disposition(&it, DISP_LOOKUP_NEG))
862 GOTO(out_release_it, rc = -ENOENT);
864 rc = it_open_error(DISP_OPEN_OPEN, &it);
866 GOTO(out_release_it, rc);
868 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
869 ll_och_fill(sbi->ll_md_exp, &it, och);
871 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
872 GOTO(out_close, rc = -EOPNOTSUPP);
874 /* already get lease, handle lease lock */
875 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
876 if (it.it_lock_mode == 0 ||
877 it.it_lock_bits != MDS_INODELOCK_OPEN) {
878 /* open lock must return for lease */
879 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
880 PFID(ll_inode2fid(inode)), it.it_lock_mode,
882 GOTO(out_close, rc = -EPROTO);
885 ll_intent_release(&it);
/* Error paths; NOTE(review): the out_close/out_release_it/out labels
 * themselves are elided from this view. */
889 /* Cancel open lock */
890 if (it.it_lock_mode != 0) {
891 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
894 och->och_lease_handle.cookie = 0ULL;
896 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
898 CERROR("%s: error closing file "DFID": %d\n",
899 ll_get_fsname(inode->i_sb, NULL, 0),
900 PFID(&ll_i2info(inode)->lli_fid), rc2);
901 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
903 ll_intent_release(&it);
911  * Check whether a layout swap can be done between two inodes.
913  * \param[in] inode1	First inode to check
914  * \param[in] inode2	Second inode to check
916  * \retval 0 on success, layout swap can be performed between both inodes
917  * \retval negative error code if requirements are not met
919 static int ll_check_swap_layouts_validity(struct inode *inode1,
920 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the same
 * mount (same superblock). */
922 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
925 if (inode_permission(inode1, MAY_WRITE) ||
926 inode_permission(inode2, MAY_WRITE))
929 if (inode1->i_sb != inode2->i_sb)
/* Close @inode's lease openhandle with a LAYOUT_SWAP bias so the MDT swaps
 * layouts between @inode and @inode2 atomically with the close. */
935 static int ll_swap_layouts_close(struct obd_client_handle *och,
936 struct inode *inode, struct inode *inode2)
938 const struct lu_fid *fid1 = ll_inode2fid(inode);
939 const struct lu_fid *fid2;
943 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
944 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
946 rc = ll_check_swap_layouts_validity(inode, inode2);
948 GOTO(out_free_och, rc);
950 /* We now know that inode2 is a lustre inode */
951 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
953 rc = lu_fid_cmp(fid1, fid2);
955 GOTO(out_free_och, rc = -EINVAL);
957 /* Close the file and {swap,merge} layouts between inode & inode2.
958 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
959 * because we still need it to pack l_remote_handle to MDT. */
960 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
963 och = NULL; /* freed in ll_close_inode_openhandle() */
973  * Release lease and close the file.
974  * It will check if the lease has ever broken.
976 static int ll_lease_close_intent(struct obd_client_handle *och,
978 bool *lease_broken, enum mds_op_bias bias,
981 struct ldlm_lock *lock;
982 bool cancelled = true;
/* Inspect the lease lock (if still around) to learn whether it was
 * already cancelled, i.e. the lease was broken. */
986 lock = ldlm_handle2lock(&och->och_lease_handle);
988 lock_res_and_lock(lock);
989 cancelled = ldlm_is_cancel(lock);
990 unlock_res_and_lock(lock);
994 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
995 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
997 if (lease_broken != NULL)
998 *lease_broken = cancelled;
/* An intact lease with no intent must be cancelled explicitly before
 * the close. */
1000 if (!cancelled && !bias)
1001 ldlm_cli_cancel(&och->och_lease_handle, 0);
1003 if (cancelled) { /* no need to excute intent */
1008 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no intent bias or payload. */
1012 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1015 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1019  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1021 static int ll_lease_file_resync(struct obd_client_handle *och,
1022 struct inode *inode)
1024 struct ll_sb_info *sbi = ll_i2sbi(inode);
1025 struct md_op_data *op_data;
1026 __u64 data_version_unused;
1030 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1031 LUSTRE_OPC_ANY, NULL);
1032 if (IS_ERR(op_data))
1033 RETURN(PTR_ERR(op_data));
1035 /* before starting file resync, it's necessary to clean up page cache
1036 * in client memory, otherwise once the layout version is increased,
1037 * writing back cached data will be denied the OSTs. */
1038 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* Identify this resync to the MDT via the lease lock handle. */
1042 op_data->op_handle = och->och_lease_handle;
1043 rc = md_file_resync(sbi->ll_md_exp, op_data);
1049 ll_finish_md_op_data(op_data);
/* Merge MDS-provided inode attributes with the size/blocks/timestamps
 * obtained from the OSTs (cl_object_attr_get), taking the newest timestamp
 * from each source.  Runs under the inode size lock. */
1053 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1055 struct ll_inode_info *lli = ll_i2info(inode);
1056 struct cl_object *obj = lli->lli_clob;
1057 struct cl_attr *attr = vvp_env_thread_attr(env);
1065 ll_inode_size_lock(inode);
1067 /* Merge timestamps the most recently obtained from MDS with
1068 * timestamps obtained from OSTs.
1070 * Do not overwrite atime of inode because it may be refreshed
1071 * by file_accessed() function. If the read was served by cache
1072 * data, there is no RPC to be sent so that atime may not be
1073 * transferred to OSTs at all. MDT only updates atime at close time
1074 * if it's at least 'mdd.*.atime_diff' older.
1075 * All in all, the atime in Lustre does not strictly comply with
1076 * POSIX. Solving this problem needs to send an RPC to MDT for each
1077 * read, this will hurt performance. */
1078 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1079 LTIME_S(inode->i_atime) = lli->lli_atime;
1080 lli->lli_update_atime = 0;
1082 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1083 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1085 atime = LTIME_S(inode->i_atime);
1086 mtime = LTIME_S(inode->i_mtime);
1087 ctime = LTIME_S(inode->i_ctime);
1089 cl_object_attr_lock(obj);
1090 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1093 rc = cl_object_attr_get(env, obj, attr);
1094 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for the merge. */
1097 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep the newer of client vs. OST timestamps. */
1099 if (atime < attr->cat_atime)
1100 atime = attr->cat_atime;
1102 if (ctime < attr->cat_ctime)
1103 ctime = attr->cat_ctime;
1105 if (mtime < attr->cat_mtime)
1106 mtime = attr->cat_mtime;
1108 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1109 PFID(&lli->lli_fid), attr->cat_size);
1111 i_size_write(inode, attr->cat_size);
1112 inode->i_blocks = attr->cat_blocks;
1114 LTIME_S(inode->i_atime) = atime;
1115 LTIME_S(inode->i_mtime) = mtime;
1116 LTIME_S(inode->i_ctime) = ctime;
1119 ll_inode_size_unlock(inode);
1125  * Set designated mirror for I/O.
1127  * So far only read, write, and truncated can support to issue I/O to
1128  * designated mirror.
1130 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1132 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1134 /* clear layout version for generic(non-resync) I/O in case it carries
1135 * stale layout version due to I/O restart */
1136 io->ci_layout_version = 0;
1138 /* FLR: disable non-delay for designated mirror I/O because obviously
1139 * only one mirror is available */
1140 if (fd->fd_designated_mirror > 0) {
1142 io->ci_designated_mirror = fd->fd_designated_mirror;
1143 io->ci_layout_version = fd->fd_layout_version;
1144 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1148 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1149 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates should be skipped for this open file,
 * mirroring the kernel's file_accessed()/touch_atime() checks (O_NOATIME,
 * S_NOATIME, mount flags, nodiratime on directories).
 * NOTE(review): the per-check return statements are elided from this view. */
1152 static bool file_is_noatime(const struct file *file)
1154 const struct vfsmount *mnt = file->f_path.mnt;
1155 const struct inode *inode = file_inode((struct file *)file);
1157 /* Adapted from file_accessed() and touch_atime().*/
1158 if (file->f_flags & O_NOATIME)
1161 if (inode->i_flags & S_NOATIME)
1164 if (IS_NOATIME(inode))
1167 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1170 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1173 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1179 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: kiocb, lock policy
 * (never/mandatory/maybe), noatime, parallel-I/O (PIO) eligibility, and
 * FLR mirror selection. */
1181 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1183 struct inode *inode = file_inode(file);
1184 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1186 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1187 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1188 io->u.ci_rw.rw_file = file;
1189 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1190 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1191 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1193 if (iot == CIT_WRITE) {
1194 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1195 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1196 file->f_flags & O_DIRECT ||
1199 io->ci_obj = ll_i2info(inode)->lli_clob;
1200 io->ci_lockreq = CILR_MAYBE;
/* lockless mount option: never take DLM locks; O_APPEND writes must
 * lock to serialize the end-of-file position. */
1201 if (ll_file_nolock(file)) {
1202 io->ci_lockreq = CILR_NEVER;
1203 io->ci_no_srvlock = 1;
1204 } else if (file->f_flags & O_APPEND) {
1205 io->ci_lockreq = CILR_MANDATORY;
1207 io->ci_noatime = file_is_noatime(file);
1208 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1209 io->ci_pio = !io->u.ci_rw.rw_append;
1213 /* FLR: only use non-delay I/O for read as there is only one
1214 * avaliable mirror for write. */
1215 io->ci_ndelay = !(iot == CIT_WRITE);
1217 ll_io_set_mirror(io, file);
/* Worker body for one parallel-I/O subtask: re-creates a cl_io for the
 * subtask's range from its cl_io_pt descriptor, runs the cl_io loop, and
 * records bytes transferred / restart state back into the descriptor. */
1220 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1222 struct cl_io_pt *pt = ptask->pt_cbdata;
1223 struct file *file = pt->cip_file;
1226 loff_t pos = pt->cip_pos;
1231 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1232 file_dentry(file)->d_name.name,
1233 pt->cip_iot == CIT_READ ? "read" : "write",
1234 pos, pos + pt->cip_count);
1236 env = cl_env_get(&refcheck);
1238 RETURN(PTR_ERR(env));
1240 io = vvp_env_thread_io(env);
1241 ll_io_init(io, file, pt->cip_iot);
1242 io->u.ci_rw.rw_iter = pt->cip_iter;
1243 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1244 io->ci_pio = 0; /* It's already in parallel task */
1246 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1247 pt->cip_count - pt->cip_result);
1249 struct vvp_io *vio = vvp_env_io(env);
1251 vio->vui_io_subtype = IO_NORMAL;
1252 vio->vui_fd = LUSTRE_FPRIVATE(file);
1254 ll_cl_add(file, env, io, LCC_RW);
1255 rc = cl_io_loop(env, io);
1256 ll_cl_remove(file, env);
1258 /* cl_io_rw_init() handled IO */
/* Fault-injection hook for testing the parallel I/O failure path. */
1262 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account progress and advance the iterator/kiocb for a possible
 * continuation of this subtask. */
1268 if (io->ci_nob > 0) {
1269 pt->cip_result += io->ci_nob;
1270 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1272 pt->cip_iocb.ki_pos = pos;
1273 #ifdef HAVE_KIOCB_KI_LEFT
1274 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1275 #elif defined(HAVE_KI_NBYTES)
1276 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1280 cl_io_fini(env, io);
1281 cl_env_put(env, &refcheck);
1283 pt->cip_need_restart = io->ci_need_restart;
1285 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1286 file_dentry(file)->d_name.name,
1287 pt->cip_iot == CIT_READ ? "read" : "write",
1288 pt->cip_result, rc);
1290 RETURN(pt->cip_result > 0 ? 0 : rc);
1294 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1295 struct file *file, enum cl_io_type iot,
1296 loff_t *ppos, size_t count)
/* Core read/write engine shared by the read_iter/write_iter/splice
 * entry points.  Builds a cl_io, takes the per-file range lock where
 * required, runs cl_io_loop() (restarting for layout changes / FLR
 * mirror retries), advances the caller's iterator/position, and tallies
 * read/write statistics.  Returns bytes transferred or a negative rc. */
1298 struct range_lock range;
1299 struct vvp_io *vio = vvp_env_io(env);
1300 struct inode *inode = file_inode(file);
1301 struct ll_inode_info *lli = ll_i2info(inode);
1302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1307 unsigned retried = 0;
1308 bool restarted = false;
1312 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1313 file_dentry(file)->d_name.name,
1314 iot == CIT_READ ? "read" : "write", pos, pos + count);
1317 io = vvp_env_thread_io(env);
1318 ll_io_init(io, file, iot);
1319 if (args->via_io_subtype == IO_NORMAL) {
1320 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1321 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1323 if (args->via_io_subtype != IO_NORMAL || restarted)
/* carry the FLR mirror-retry count across restarts */
1325 io->ci_ndelay_tried = retried;
1327 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1328 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final extent is unknown */
1330 if (file->f_flags & O_APPEND)
1331 range_lock_init(&range, 0, LUSTRE_EOF);
1333 range_lock_init(&range, pos, pos + count - 1);
1335 vio->vui_fd = LUSTRE_FPRIVATE(file);
1336 vio->vui_io_subtype = args->via_io_subtype;
1338 switch (vio->vui_io_subtype) {
1340 /* Direct IO reads must also take range lock,
1341 * or multiple reads will try to work on the same pages
1342 * See LU-6227 for details. */
1343 if (((iot == CIT_WRITE) ||
1344 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1345 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1346 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1348 rc = range_lock(&lli->lli_write_tree, &range);
1352 range_locked = true;
1356 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1357 vio->u.splice.vui_flags = args->u.splice.via_flags;
1360 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1364 ll_cl_add(file, env, io, LCC_RW);
/* PIO writes serialize on the inode lock unless already held */
1365 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1366 !lli->lli_inode_locked) {
1368 lli->lli_inode_locked = 1;
1370 rc = cl_io_loop(env, io);
1371 if (lli->lli_inode_locked) {
1372 lli->lli_inode_locked = 0;
1373 inode_unlock(inode);
1375 ll_cl_remove(file, env);
1378 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1380 range_unlock(&lli->lli_write_tree, &range);
1383 /* cl_io_rw_init() handled IO */
1387 if (io->ci_nob > 0) {
/* propagate progress back to the caller's iterator and kiocb */
1388 result += io->ci_nob;
1389 count -= io->ci_nob;
1391 if (args->via_io_subtype == IO_NORMAL) {
1392 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1394 args->u.normal.via_iocb->ki_pos = pos;
1395 #ifdef HAVE_KIOCB_KI_LEFT
1396 args->u.normal.via_iocb->ki_left = count;
1397 #elif defined(HAVE_KI_NBYTES)
1398 args->u.normal.via_iocb->ki_nbytes = count;
1402 pos = io->u.ci_rw.rw_range.cir_pos;
1406 cl_io_fini(env, io);
1409 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1410 file->f_path.dentry->d_name.name,
1411 iot, rc, result, io->ci_need_restart);
/* restart the remaining range if the layout changed mid-IO */
1413 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1415 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1416 file_dentry(file)->d_name.name,
1417 iot == CIT_READ ? "read" : "write",
1418 pos, pos + count, result, rc);
1419 /* preserve the tried count for FLR */
1420 retried = io->ci_ndelay_tried;
1425 if (iot == CIT_READ) {
1427 ll_stats_ops_tally(ll_i2sbi(inode),
1428 LPROC_LL_READ_BYTES, result);
1429 } else if (iot == CIT_WRITE) {
1431 ll_stats_ops_tally(ll_i2sbi(inode),
1432 LPROC_LL_WRITE_BYTES, result);
1433 fd->fd_write_failed = false;
1434 } else if (result == 0 && rc == 0) {
/* short write bookkeeping: remember failure state for fsync */
1437 fd->fd_write_failed = true;
1439 fd->fd_write_failed = false;
1440 } else if (rc != -ERESTARTSYS) {
1441 fd->fd_write_failed = true;
1445 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1446 file_dentry(file)->d_name.name,
1447 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1451 RETURN(result > 0 ? result : rc);
1455 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1456 * especially for small I/O.
1458 * To serve a read request, CLIO has to create and initialize a cl_io and
1459 * then request DLM lock. This has turned out to have significant overhead
1460 * and affects the performance of small I/O dramatically.
1462 * It's not necessary to create a cl_io for each I/O. Under the help of read
1463 * ahead, most of the pages being read are already in memory cache and we can
1464 * read those pages directly because if the pages exist, the corresponding DLM
1465 * lock must exist so that page content must be valid.
1467 * In fast read implementation, the llite speculatively finds and reads pages
1468 * in memory cache. There are three scenarios for fast read:
1469 * - If the page exists and is uptodate, kernel VM will provide the data and
1470 * CLIO won't be intervened;
1471 * - If the page was brought into memory by read ahead, it will be exported
1472 * and read ahead parameters will be updated;
1473 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1474 * it will go back and invoke normal read, i.e., a cl_io will be created
1475 * and DLM lock will be requested.
1477 * POSIX compliance: posix standard states that read is intended to be atomic.
1478 * Lustre read implementation is in line with Linux kernel read implementation
1479 * and neither of them complies with POSIX standard in this matter. Fast read
1480 * doesn't make the situation worse on single node but it may interleave write
1481 * results from multiple nodes due to short read handling in ll_file_aio_read().
1483 * \param env - lu_env
1484 * \param iocb - kiocb from kernel
1485 * \param iter - user space buffers where the data will be copied
1487 * \retval - number of bytes have been read, or error code if error occurred.
1490 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
/* Speculative page-cache read that skips cl_io/DLM setup entirely
 * (see the "fast read" block comment above).  Returns -ENODATA when a
 * page is missing so the caller falls back to the normal read path. */
1494 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1497 /* NB: we can't do direct IO for fast read because it will need a lock
1498 * to make IO engine happy. */
1499 if (iocb->ki_filp->f_flags & O_DIRECT)
1502 result = generic_file_read_iter(iocb, iter);
1504 /* If the first page is not in cache, generic_file_aio_read() will be
1505 * returned with -ENODATA.
1506 * See corresponding code in ll_readpage(). */
1507 if (result == -ENODATA)
/* account bytes served from cache in the read statistics */
1511 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1512 LPROC_LL_READ_BYTES, result);
1518 * Read from a file (through the page cache).
1520 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
/* read_iter entry point: attempt the lockless fast read first, then
 * route whatever remains in 'to' through the generic cl_io read path,
 * combining the byte counts from both stages. */
1523 struct vvp_io_args *args;
1528 result = ll_do_fast_read(iocb, to);
1529 if (result < 0 || iov_iter_count(to) == 0)
1532 env = cl_env_get(&refcheck);
1534 return PTR_ERR(env);
1536 args = ll_env_args(env, IO_NORMAL);
1537 args->u.normal.via_iter = to;
1538 args->u.normal.via_iocb = iocb;
1540 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1541 &iocb->ki_pos, iov_iter_count(to));
/* merge fast-read bytes with the generic-path result */
1544 else if (result == 0)
1547 cl_env_put(env, &refcheck);
1553 * Write to a file (through the page cache).
1555 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* write_iter entry point: run the write through the generic cl_io
 * path in a freshly acquired cl environment. */
1557 struct vvp_io_args *args;
1562 env = cl_env_get(&refcheck);
1564 return PTR_ERR(env);
1566 args = ll_env_args(env, IO_NORMAL);
1567 args->u.normal.via_iter = from;
1568 args->u.normal.via_iocb = iocb;
1570 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1571 &iocb->ki_pos, iov_iter_count(from));
1572 cl_env_put(env, &refcheck);
1576 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1578 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1580 static int ll_file_get_iov_count(const struct iovec *iov,
1581 unsigned long *nr_segs, size_t *count)
/* Validate an iovec array and compute the total byte count to
 * transfer, truncating at the first inaccessible segment (exact copy
 * of the kernel's __generic_file_aio_write_nolock helper, see the
 * comment above). */
1586 for (seg = 0; seg < *nr_segs; seg++) {
1587 const struct iovec *iv = &iov[seg];
1590 * If any segment has a negative length, or the cumulative
1591 * length ever wraps negative then return -EINVAL.
1594 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1596 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1601 cnt -= iv->iov_len; /* This segment is no good */
1608 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1609 unsigned long nr_segs, loff_t pos)
/* Legacy aio_read entry (kernels without read_iter): validate the
 * iovec array, wrap it in an iov_iter and delegate to
 * ll_file_read_iter(). */
1616 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1620 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1621 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1622 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1623 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1624 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1626 result = ll_file_read_iter(iocb, &to);
1631 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
/* Legacy synchronous read(2) entry: build a one-segment iovec plus a
 * sync kiocb and delegate to ll_file_aio_read(), updating *ppos from
 * the kiocb afterwards.
 * Fix: 'kiocb.i_nbytes' was a typo — struct kiocb's member is
 * ki_nbytes (as used under the same HAVE_KI_NBYTES guard at lines
 * 1276 and 1701), so this failed to compile on affected kernels. */
1634 struct iovec iov = { .iov_base = buf, .iov_len = count };
1639 init_sync_kiocb(&kiocb, file);
1640 kiocb.ki_pos = *ppos;
1641 #ifdef HAVE_KIOCB_KI_LEFT
1642 kiocb.ki_left = count;
1643 #elif defined(HAVE_KI_NBYTES)
1644 kiocb.ki_nbytes = count;
1647 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1648 *ppos = kiocb.ki_pos;
1654 * Write to a file (through the page cache).
1657 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1658 unsigned long nr_segs, loff_t pos)
/* Legacy aio_write entry (kernels without write_iter): validate the
 * iovec array, wrap it in an iov_iter and delegate to
 * ll_file_write_iter(). */
1660 struct iov_iter from;
1665 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1669 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1670 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1671 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1672 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1673 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1675 result = ll_file_write_iter(iocb, &from);
1680 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1681 size_t count, loff_t *ppos)
/* Legacy synchronous write(2) entry: set up a kiocb in the lu_env
 * scratch area (lti_kiocb) and delegate to ll_file_aio_write(),
 * updating *ppos from the kiocb afterwards. */
1684 struct iovec iov = { .iov_base = (void __user *)buf,
1686 struct kiocb *kiocb;
1691 env = cl_env_get(&refcheck);
1693 RETURN(PTR_ERR(env));
1695 kiocb = &ll_env_info(env)->lti_kiocb;
1696 init_sync_kiocb(kiocb, file);
1697 kiocb->ki_pos = *ppos;
1698 #ifdef HAVE_KIOCB_KI_LEFT
1699 kiocb->ki_left = count;
1700 #elif defined(HAVE_KI_NBYTES)
1701 kiocb->ki_nbytes = count;
1704 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1705 *ppos = kiocb->ki_pos;
1707 cl_env_put(env, &refcheck);
1710 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1713 * Send file content (through pagecache) somewhere with helper
1715 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1716 struct pipe_inode_info *pipe, size_t count,
/* splice_read entry: run a CIT_READ through the generic path with
 * IO_SPLICE args so data is delivered into 'pipe' instead of an
 * iov_iter. */
1720 struct vvp_io_args *args;
1725 env = cl_env_get(&refcheck);
1727 RETURN(PTR_ERR(env));
1729 args = ll_env_args(env, IO_SPLICE);
1730 args->u.splice.via_pipe = pipe;
1731 args->u.splice.via_flags = flags;
1733 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1734 cl_env_put(env, &refcheck);
1738 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1739 __u64 flags, struct lov_user_md *lum, int lum_size)
/* Apply a new striping EA: re-open the file by FID with the supplied
 * layout as open intent, then immediately release the open handle.
 * Runs under the inode size lock to serialize with size updates. */
1741 struct lookup_intent oit = {
1743 .it_flags = flags | MDS_OPEN_BY_FID,
1748 ll_inode_size_lock(inode);
1749 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1751 GOTO(out_unlock, rc);
1753 ll_release_openhandle(dentry, &oit);
1756 ll_inode_size_unlock(inode);
1757 ll_intent_release(&oit);
1762 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1763 struct lov_mds_md *lmmp, int *lmm_size,
1764 struct ptlrpc_request **request)
/* Fetch the LOV EA for 'filename' (child of 'inode') from the MDS via
 * md_getattr_name(), validate its magic, and byte-swap it to host
 * endianness for userspace.  On success *lmmp/*lmm_size describe the
 * EA inside *request, which the caller must release. */
1766 struct ll_sb_info *sbi = ll_i2sbi(inode);
1767 struct mdt_body *body;
1768 struct lov_mds_md *lmm = NULL;
1769 struct ptlrpc_request *req = NULL;
1770 struct md_op_data *op_data;
1773 rc = ll_get_default_mdsize(sbi, &lmmsize);
1777 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1778 strlen(filename), lmmsize,
1779 LUSTRE_OPC_ANY, NULL);
1780 if (IS_ERR(op_data))
1781 RETURN(PTR_ERR(op_data));
1783 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1784 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1785 ll_finish_md_op_data(op_data);
1787 CDEBUG(D_INFO, "md_getattr_name failed "
1788 "on %s: rc %d\n", filename, rc);
1792 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1793 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1795 lmmsize = body->mbo_eadatasize;
/* no EA present (or zero-sized) means nothing to return */
1797 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1799 GOTO(out, rc = -ENODATA);
1802 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1803 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite layouts are understood here */
1805 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1806 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1807 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1808 GOTO(out, rc = -EPROTO);
1811 * This is coming from the MDS, so is probably in
1812 * little endian. We convert it to host endian before
1813 * passing it to userspace.
1815 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1818 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1819 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1820 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1821 if (le32_to_cpu(lmm->lmm_pattern) &
1822 LOV_PATTERN_F_RELEASED)
1826 /* if function called for directory - we should
1827 * avoid swab not existent lsm objects */
1828 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1829 lustre_swab_lov_user_md_v1(
1830 (struct lov_user_md_v1 *)lmm);
1831 if (S_ISREG(body->mbo_mode))
1832 lustre_swab_lov_user_md_objects(
1833 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1835 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1836 lustre_swab_lov_user_md_v3(
1837 (struct lov_user_md_v3 *)lmm);
1838 if (S_ISREG(body->mbo_mode))
1839 lustre_swab_lov_user_md_objects(
1840 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1842 } else if (lmm->lmm_magic ==
1843 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1844 lustre_swab_lov_comp_md_v1(
1845 (struct lov_comp_md_v1 *)lmm);
1851 *lmm_size = lmmsize;
1856 static int ll_lov_setea(struct inode *inode, struct file *file,
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST data
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since it sets pre-existing objects. */
1859 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1860 struct lov_user_md *lump;
1861 int lum_size = sizeof(struct lov_user_md) +
1862 sizeof(struct lov_user_ost_data);
1866 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1869 OBD_ALLOC_LARGE(lump, lum_size);
1873 if (copy_from_user(lump, arg, lum_size))
1874 GOTO(out_lump, rc = -EFAULT);
1876 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the delayed-create flag now that the layout is set */
1878 cl_lov_delay_create_clear(&file->f_flags);
1881 OBD_FREE_LARGE(lump, lum_size);
1885 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
/* Copy the file's current layout into the userspace buffer 'lum'
 * via cl_object_getstripe(). */
1892 env = cl_env_get(&refcheck);
1894 RETURN(PTR_ERR(env));
1896 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1897 cl_env_put(env, &refcheck);
1901 static int ll_lov_setstripe(struct inode *inode, struct file *file,
/* LL_IOC_LOV_SETSTRIPE handler: copy the user layout into the kernel,
 * apply it, refresh the layout generation, and copy the resulting
 * stripe information back to the userspace buffer. */
1904 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1905 struct lov_user_md *klum;
1907 __u64 flags = FMODE_WRITE;
1910 rc = ll_copy_user_md(lum, &klum);
1915 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1920 rc = put_user(0, &lum->lmm_stripe_count);
1924 rc = ll_layout_refresh(inode, &gen);
1928 rc = ll_file_getstripe(inode, arg, lum_size);
1930 cl_lov_delay_create_clear(&file->f_flags);
1933 OBD_FREE(klum, lum_size);
1938 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
/* Take a group lock with group id 'arg' on the whole file.  For PFL
 * layouts all OST objects are instantiated first so the lock covers
 * them; the fd's grouplock state is updated under lli_lock, handling
 * a race with a concurrent winner. */
1940 struct ll_inode_info *lli = ll_i2info(inode);
1941 struct cl_object *obj = lli->lli_clob;
1942 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1943 struct ll_grouplock grouplock;
1948 CWARN("group id for group lock must not be 0\n");
1952 if (ll_file_nolock(file))
1953 RETURN(-EOPNOTSUPP);
1955 spin_lock(&lli->lli_lock);
/* only one group lock per file descriptor is allowed */
1956 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1957 CWARN("group lock already existed with gid %lu\n",
1958 fd->fd_grouplock.lg_gid);
1959 spin_unlock(&lli->lli_lock);
1962 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1963 spin_unlock(&lli->lli_lock);
1966 * XXX: group lock needs to protect all OST objects while PFL
1967 * can add new OST objects during the IO, so we'd instantiate
1968 * all OST objects before getting its group lock.
1973 struct cl_layout cl = {
1974 .cl_is_composite = false,
1976 struct lu_extent ext = {
1978 .e_end = OBD_OBJECT_EOF,
1981 env = cl_env_get(&refcheck);
1983 RETURN(PTR_ERR(env));
1985 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate all components up front */
1986 if (!rc && cl.cl_is_composite)
1987 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
1990 cl_env_put(env, &refcheck);
1995 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1996 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race */
2000 spin_lock(&lli->lli_lock);
2001 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2002 spin_unlock(&lli->lli_lock);
2003 CERROR("another thread just won the race\n");
2004 cl_put_grouplock(&grouplock);
2008 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2009 fd->fd_grouplock = grouplock;
2010 spin_unlock(&lli->lli_lock);
2012 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2016 static int ll_put_grouplock(struct inode *inode, struct file *file,
/* Release the group lock with group id 'arg' held on this file
 * descriptor, verifying under lli_lock that one is held and that the
 * id matches before dropping it. */
2019 struct ll_inode_info *lli = ll_i2info(inode);
2020 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2021 struct ll_grouplock grouplock;
2024 spin_lock(&lli->lli_lock);
2025 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2026 spin_unlock(&lli->lli_lock);
2027 CWARN("no group lock held\n");
2031 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2033 if (fd->fd_grouplock.lg_gid != arg) {
2034 CWARN("group lock %lu doesn't match current id %lu\n",
2035 arg, fd->fd_grouplock.lg_gid);
2036 spin_unlock(&lli->lli_lock);
/* detach the grouplock state before dropping the spinlock, then
 * release the lock itself outside of it */
2040 grouplock = fd->fd_grouplock;
2041 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2042 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2043 spin_unlock(&lli->lli_lock);
2045 cl_put_grouplock(&grouplock);
2046 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2051 * Close inode open handle
2053 * \param dentry [in] dentry which contains the inode
2054 * \param it [in,out] intent which contains open info and result
2057 * \retval <0 failure
2059 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
/* Close the MDS open handle carried by 'it' (obtained from an intent
 * open) without creating a ll_file_data; no-op for the root dentry or
 * when the intent holds no open disposition. */
2061 struct inode *inode = dentry->d_inode;
2062 struct obd_client_handle *och;
2068 /* Root ? Do nothing. */
2069 if (dentry->d_inode->i_sb->s_root == dentry)
2072 /* No open handle to close? Move away */
2073 if (!it_disposition(it, DISP_OPEN_OPEN))
2076 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2078 OBD_ALLOC(och, sizeof(*och));
2080 GOTO(out, rc = -ENOMEM);
2082 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2084 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2086 /* this one is in place of ll_file_open */
2087 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2088 ptlrpc_req_finished(it->it_request);
2089 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2095 * Get size for inode for which FIEMAP mapping is requested.
2096 * Make the FIEMAP get_info call and returns the result.
2097 * \param fiemap kernel buffer to hold extens
2098 * \param num_bytes kernel buffer size
2100 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
/* Service a FIEMAP request: validate flags, flush dirty pages on
 * FIEMAP_FLAG_SYNC, glimpse the size if needed, then ask the object
 * layer (cl_object_fiemap) to fill in the extent mapping. */
2106 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2109 /* Checks for fiemap flags */
2110 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags are unsupported */
2111 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2115 /* Check for FIEMAP_FLAG_SYNC */
2116 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2117 rc = filemap_fdatawrite(inode->i_mapping);
2122 env = cl_env_get(&refcheck);
2124 RETURN(PTR_ERR(env));
/* a zero cached size may just mean we never glimpsed it */
2126 if (i_size_read(inode) == 0) {
2127 rc = ll_glimpse_size(inode);
2132 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2133 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2134 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2136 /* If filesize is 0, then there would be no objects for mapping */
2137 if (fmkey.lfik_oa.o_size == 0) {
2138 fiemap->fm_mapped_extents = 0;
2142 fmkey.lfik_fiemap = *fiemap;
2144 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2145 &fmkey, fiemap, &num_bytes);
2147 cl_env_put(env, &refcheck);
2151 int ll_fid2path(struct inode *inode, void __user *arg)
/* OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Copies the getinfo_fid2path request in (buffer sized by the
 * user-supplied gf_pathlen), appends the root FID for fileset-aware
 * servers, and copies the result back out. */
2153 struct obd_export *exp = ll_i2mdexp(inode);
2154 const struct getinfo_fid2path __user *gfin = arg;
2156 struct getinfo_fid2path *gfout;
/* restricted unless the fs allows user fid2path */
2162 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2163 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2166 /* Only need to get the buflen */
2167 if (get_user(pathlen, &gfin->gf_pathlen))
2170 if (pathlen > PATH_MAX)
2173 outsize = sizeof(*gfout) + pathlen;
2174 OBD_ALLOC(gfout, outsize);
2178 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2179 GOTO(gf_free, rc = -EFAULT);
2180 /* append root FID after gfout to let MDT know the root FID so that it
2181 * can lookup the correct path, this is mainly for fileset.
2182 * old server without fileset mount support will ignore this. */
2183 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2185 /* Call mdc_iocontrol */
2186 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2190 if (copy_to_user(arg, gfout, outsize))
2194 OBD_FREE(gfout, outsize);
2199 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
/* Run a CIT_DATA_VERSION cl_io to fetch the file's data version and
 * layout version into 'ioc', honouring ioc->idv_flags (flush
 * semantics).  Files without an initialized object report version 0. */
2201 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2209 ioc->idv_version = 0;
2210 ioc->idv_layout_version = UINT_MAX;
2212 /* If no file object initialized, we consider its version is 0. */
2216 env = cl_env_get(&refcheck);
2218 RETURN(PTR_ERR(env));
2220 io = vvp_env_thread_io(env);
2222 io->u.ci_data_version.dv_data_version = 0;
2223 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2224 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2227 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2228 result = cl_io_loop(env, io);
2230 result = io->ci_result;
2232 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2233 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2235 cl_io_fini(env, io);
/* a layout change mid-IO forces the whole operation to be redone */
2237 if (unlikely(io->ci_need_restart))
2240 cl_env_put(env, &refcheck);
2246 * Read the data_version for inode.
2248 * This value is computed using stripe object version on OST.
2249 * Version is computed using server side locking.
2251 * @param flags if do sync on the OST side;
2253 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2254 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2256 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
/* Convenience wrapper around ll_ioc_data_version(): fetch only the
 * data version, honouring the LL_DV_*_FLUSH flags described above. */
2258 struct ioc_data_version ioc = { .idv_flags = flags };
2261 rc = ll_ioc_data_version(inode, &ioc);
2263 *data_version = ioc.idv_version;
2269 * Trigger a HSM release request for the provided inode.
2271 int ll_hsm_release(struct inode *inode)
/* HSM release: take a write lease, flush and grab the latest data
 * version and [am]time, then close the file with MDS_HSM_RELEASE so
 * the MDT can drop the OST objects.  The lease is closed on any
 * failure path. */
2274 struct obd_client_handle *och = NULL;
2275 __u64 data_version = 0;
2280 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2281 ll_get_fsname(inode->i_sb, NULL, 0),
2282 PFID(&ll_i2info(inode)->lli_fid));
2284 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2286 GOTO(out, rc = PTR_ERR(och));
2288 /* Grab latest data_version and [am]time values */
2289 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2293 env = cl_env_get(&refcheck);
2295 GOTO(out, rc = PTR_ERR(env));
2297 rc = ll_merge_attr(env, inode);
2298 cl_env_put(env, &refcheck);
2300 /* If error happen, we have the wrong size for a file.
2306 /* Release the file.
2307 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2308 * we still need it to pack l_remote_handle to MDT. */
2309 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2315 if (och != NULL && !IS_ERR(och)) /* close the file */
2316 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped
 * plus their data-version bookkeeping (dv1/dv2, check_dv1/check_dv2). */
2321 struct ll_swap_stack {
2324 struct inode *inode1;
2325 struct inode *inode2;
2330 static int ll_swap_layouts(struct file *file1, struct file *file2,
2331 struct lustre_swap_layouts *lsl)
/* Atomically swap the layouts of two files.  Orders the pair by FID
 * to avoid deadlock, optionally flushes dirty cache via a temporary
 * group lock (lsl gid), verifies requested data versions, then sends
 * the swap to the MDT through obd_iocontrol(). */
2333 struct mdc_swap_layouts msl;
2334 struct md_op_data *op_data;
2337 struct ll_swap_stack *llss = NULL;
2340 OBD_ALLOC_PTR(llss);
2344 llss->inode1 = file_inode(file1);
2345 llss->inode2 = file_inode(file2);
2347 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2351 /* we use 2 bool because it is easier to swap than 2 bits */
2352 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2353 llss->check_dv1 = true;
2355 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2356 llss->check_dv2 = true;
2358 /* we cannot use lsl->sl_dvX directly because we may swap them */
2359 llss->dv1 = lsl->sl_dv1;
2360 llss->dv2 = lsl->sl_dv2;
2362 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2363 if (rc == 0) /* same file, done! */
/* always operate in FID order so two swappers cannot deadlock */
2366 if (rc < 0) { /* sequentialize it */
2367 swap(llss->inode1, llss->inode2);
2369 swap(llss->dv1, llss->dv2);
2370 swap(llss->check_dv1, llss->check_dv2);
2374 if (gid != 0) { /* application asks to flush dirty cache */
2375 rc = ll_get_grouplock(llss->inode1, file1, gid);
2379 rc = ll_get_grouplock(llss->inode2, file2, gid);
2381 ll_put_grouplock(llss->inode1, file1, gid);
2386 /* ultimate check, before swapping the layouts we check if
2387 * dataversion has changed (if requested) */
2388 if (llss->check_dv1) {
2389 rc = ll_data_version(llss->inode1, &dv, 0);
2392 if (dv != llss->dv1)
2393 GOTO(putgl, rc = -EAGAIN);
2396 if (llss->check_dv2) {
2397 rc = ll_data_version(llss->inode2, &dv, 0);
2400 if (dv != llss->dv2)
2401 GOTO(putgl, rc = -EAGAIN);
2404 /* struct md_op_data is used to send the swap args to the mdt
2405 * only flags is missing, so we use struct mdc_swap_layouts
2406 * through the md_op_data->op_data */
2407 /* flags from user space have to be converted before they are send to
2408 * server, no flag is sent today, they are only used on the client */
2411 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2412 0, LUSTRE_OPC_ANY, &msl);
2413 if (IS_ERR(op_data))
2414 GOTO(free, rc = PTR_ERR(op_data));
2416 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2417 sizeof(*op_data), op_data, NULL);
2418 ll_finish_md_op_data(op_data);
/* drop group locks in reverse acquisition order */
2425 ll_put_grouplock(llss->inode2, file2, gid);
2426 ll_put_grouplock(llss->inode1, file1, gid);
2436 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
/* Set/clear HSM state flags on a file via the MDT.  Masks outside
 * HSM_FLAGS_MASK are rejected; flags beyond HSM_USER_MASK and the
 * archive id range are restricted to CAP_SYS_ADMIN. */
2438 struct md_op_data *op_data;
2442 /* Detect out-of range masks */
2443 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2446 /* Non-root users are forbidden to set or clear flags which are
2447 * NOT defined in HSM_USER_MASK. */
2448 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2449 !cfs_capable(CFS_CAP_SYS_ADMIN))
2452 /* Detect out-of range archive id */
2453 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2454 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2457 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2458 LUSTRE_OPC_ANY, hss);
2459 if (IS_ERR(op_data))
2460 RETURN(PTR_ERR(op_data));
2462 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2463 sizeof(*op_data), op_data, NULL);
2465 ll_finish_md_op_data(op_data);
2470 static int ll_hsm_import(struct inode *inode, struct file *file,
2471 struct hsm_user_import *hui)
/* HSM import: mark a regular file ARCHIVED|EXISTS|RELEASED in the
 * given archive, then force its mode/ownership/size/timestamps to the
 * values recorded in 'hui' via ll_setattr_raw(). */
2473 struct hsm_state_set *hss = NULL;
2474 struct iattr *attr = NULL;
2478 if (!S_ISREG(inode->i_mode))
2484 GOTO(out, rc = -ENOMEM);
2486 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2487 hss->hss_archive_id = hui->hui_archive_id;
2488 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2489 rc = ll_hsm_state_set(inode, hss);
2493 OBD_ALLOC_PTR(attr);
2495 GOTO(out, rc = -ENOMEM);
/* imported file keeps the archived copy's attributes */
2497 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2498 attr->ia_mode |= S_IFREG;
2499 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2500 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2501 attr->ia_size = hui->hui_size;
2502 attr->ia_mtime.tv_sec = hui->hui_mtime;
2503 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2504 attr->ia_atime.tv_sec = hui->hui_atime;
2505 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2507 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2508 ATTR_UID | ATTR_GID |
2509 ATTR_MTIME | ATTR_MTIME_SET |
2510 ATTR_ATIME | ATTR_ATIME_SET;
2514 rc = ll_setattr_raw(file_dentry(file), attr, true);
2518 inode_unlock(inode);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bit mask used by the
 * lease ioctls. */
2530 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2532 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2533 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2536 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
/* LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file from 'lfu' (unlike utimes(2), ctime can be set too, hence the
 * CAP_SYS_ADMIN requirement). */
2538 struct inode *inode = file_inode(file);
2540 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2541 ATTR_MTIME | ATTR_MTIME_SET |
2542 ATTR_CTIME | ATTR_CTIME_SET,
2544 .tv_sec = lfu->lfu_atime_sec,
2545 .tv_nsec = lfu->lfu_atime_nsec,
2548 .tv_sec = lfu->lfu_mtime_sec,
2549 .tv_nsec = lfu->lfu_mtime_nsec,
2552 .tv_sec = lfu->lfu_ctime_sec,
2553 .tv_nsec = lfu->lfu_ctime_nsec,
2559 if (!capable(CAP_SYS_ADMIN))
2562 if (!S_ISREG(inode->i_mode))
2566 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2567 inode_unlock(inode);
/* Translate a lockahead userspace lock mode into the kernel
 * cl_lock_mode used by the cl_lock layer. */
2572 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2575 case MODE_READ_USER:
2577 case MODE_WRITE_USER:
/* printable names for user lock modes, indexed by enum lock_mode_user */
2584 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2586 /* Used to allow the upper layers of the client to request an LDLM lock
2587 * without doing an actual read or write.
2589 * Used for ladvise lockahead to manually request specific locks.
2591 * \param[in] file file this ladvise lock request is on
2592 * \param[in] ladvise ladvise struct describing this lock request
2594 * \retval 0 success, no detailed result available (sync requests
2595 * and requests sent to the server [not handled locally]
2596 * cannot return detailed results)
2597 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2598 * see definitions for details.
2599 * \retval negative negative errno on error
2601 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
/* Ladvise lockahead: request a DLM extent lock over
 * [lla_start, lla_end] without performing I/O (contract documented in
 * the comment block above).  The lock is released immediately after a
 * successful request — only its cached existence matters. */
2603 struct lu_env *env = NULL;
2604 struct cl_io *io = NULL;
2605 struct cl_lock *lock = NULL;
2606 struct cl_lock_descr *descr = NULL;
2607 struct dentry *dentry = file->f_path.dentry;
2608 struct inode *inode = dentry->d_inode;
2609 enum cl_lock_mode cl_mode;
2610 off_t start = ladvise->lla_start;
2611 off_t end = ladvise->lla_end;
2617 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2618 "start=%llu, end=%llu\n", dentry->d_name.len,
2619 dentry->d_name.name, dentry->d_inode,
2620 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2623 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2625 GOTO(out, result = cl_mode);
2627 /* Get IO environment */
2628 result = cl_io_get(inode, &env, &io, &refcheck);
2632 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2635 * nothing to do for this io. This currently happens when
2636 * stripe sub-object's are not yet created.
2638 result = io->ci_result;
2639 } else if (result == 0) {
2640 lock = vvp_env_lock(env);
2641 descr = &lock->cll_descr;
2643 descr->cld_obj = io->ci_obj;
2644 /* Convert byte offsets to pages */
2645 descr->cld_start = cl_index(io->ci_obj, start);
2646 descr->cld_end = cl_index(io->ci_obj, end);
2647 descr->cld_mode = cl_mode;
2648 /* CEF_MUST is used because we do not want to convert a
2649 * lockahead request to a lockless lock */
2650 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* async requests are speculative: don't block on contention */
2653 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2654 descr->cld_enq_flags |= CEF_SPECULATIVE;
2656 result = cl_lock_request(env, io, lock);
2658 /* On success, we need to release the lock */
2660 cl_lock_release(env, lock);
2662 cl_io_fini(env, io);
2663 cl_env_put(env, &refcheck);
2665 /* -ECANCELED indicates a matching lock with a different extent
2666 * was already present, and -EEXIST indicates a matching lock
2667 * on exactly the same extent was already present.
2668 * We convert them to positive values for userspace to make
2669 * recognizing true errors easier.
2670 * Note we can only return these detailed results on async requests,
2671 * as sync requests look the same as i/o requests for locking. */
2672 if (result == -ECANCELED)
2673 result = LLA_RESULT_DIFFERENT;
2674 else if (result == -EEXIST)
2675 result = LLA_RESULT_SAME;
/* Printable names for each lu_ladvise_type, used in the error CDEBUGs. */
2680 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one llapi_lu_ladvise entry before acting on it: checks that the
 * advice value, per-advice flags, lockahead mode and byte range are sane.
 * Returns 0 if valid, negative errno otherwise (rc assignment lines are
 * elided in this view).
 */
2682 static int ll_ladvise_sanity(struct inode *inode,
2683 struct llapi_lu_ladvise *ladvise)
2685 enum lu_ladvise_type advice = ladvise->lla_advice;
2686 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2687 * be in the first 32 bits of enum ladvise_flags */
2688 __u32 flags = ladvise->lla_peradvice_flags;
2689 /* 3 lines at 80 characters per line, should be plenty */
/* Reject advice values outside the known range. */
2692 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2694 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2695 "last supported advice is %s (value '%d'): rc = %d\n",
2696 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2697 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2701 /* Per-advice checks */
2703 case LU_LADVISE_LOCKNOEXPAND:
/* LOCKNOEXPAND accepts only its own flag mask. */
2704 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2706 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2708 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2709 ladvise_names[advice], rc);
2713 case LU_LADVISE_LOCKAHEAD:
2714 /* Currently only READ and WRITE modes can be requested */
2715 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2716 ladvise->lla_lockahead_mode == 0) {
2718 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2720 ll_get_fsname(inode->i_sb, NULL, 0),
2721 ladvise->lla_lockahead_mode,
2722 ladvise_names[advice], rc);
2725 case LU_LADVISE_WILLREAD:
2726 case LU_LADVISE_DONTNEED:
2728 /* Note fall through above - These checks apply to all advices
2729 * except LOCKNOEXPAND */
2730 if (flags & ~LF_DEFAULT_MASK) {
2732 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2734 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2735 ladvise_names[advice], rc);
/* The byte range must be non-empty and forward (start < end). */
2738 if (ladvise->lla_start >= ladvise->lla_end) {
2740 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2741 "for %s: rc = %d\n",
2742 ll_get_fsname(inode->i_sb, NULL, 0),
2743 ladvise->lla_start, ladvise->lla_end,
2744 ladvise_names[advice], rc);
2756 * Give file access advices
2758 * The ladvise interface is similar to Linux fadvise() system call, except it
2759 * forwards the advices directly from Lustre client to server. The server side
2760 * codes will apply appropriate read-ahead and caching techniques for the
2761 * corresponding files.
2763 * A typical workload for ladvise is e.g. a bunch of different clients are
2764 * doing small random reads of a file, so prefetching pages into OSS cache
2765 * with big linear reads before the random IO is a net benefit. Fetching
2766 * all that data into each client cache with fadvise() may not be, due to
2767 * much more data being sent to the client.
/*
 * Build a CIT_LADVISE cl_io from the validated ladvise entry and run it
 * through the cl_io loop; returns 0 or negative errno (rc handling is
 * partially elided in this view).
 */
2769 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2770 struct llapi_lu_ladvise *ladvise)
2774 struct cl_ladvise_io *lio;
2779 env = cl_env_get(&refcheck);
2781 RETURN(PTR_ERR(env));
2783 io = vvp_env_thread_io(env);
2784 io->ci_obj = ll_i2info(inode)->lli_clob;
2786 /* initialize parameters for ladvise */
2787 lio = &io->u.ci_ladvise;
2788 lio->li_start = ladvise->lla_start;
2789 lio->li_end = ladvise->lla_end;
2790 lio->li_fid = ll_inode2fid(inode);
2791 lio->li_advice = ladvise->lla_advice;
2792 lio->li_flags = flags;
/* cl_io_init() == 0 means the layers have work to do; run the io loop. */
2794 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2795 rc = cl_io_loop(env, io);
2799 cl_io_fini(env, io);
2800 cl_env_put(env, &refcheck);
/*
 * Set or clear the per-file-descriptor "no lock expansion" flag:
 * LF_UNSET in @flags clears it, otherwise it is set.
 */
2804 static int ll_lock_noexpand(struct file *file, int flags)
2806 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2808 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: copy the caller's struct fsxattr in,
 * fill in the inode's extended flags and project ID, and copy it back.
 * @arg is a userspace pointer to struct fsxattr.  (Return-value lines are
 * elided in this view; failures of copy_{from,to}_user presumably yield
 * -EFAULT — confirm against the full source.)
 */
2813 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2816 struct fsxattr fsxattr;
2818 if (copy_from_user(&fsxattr,
2819 (const struct fsxattr __user *)arg,
2823 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2824 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2825 if (copy_to_user((struct fsxattr __user *)arg,
2826 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set extended flags and project ID on
 * the file.  The change is sent to the MDT via md_setattr(), and for files
 * with OST objects the flag change is propagated to the OSTs via
 * cl_setattr_ost().
 */
2832 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2836 struct md_op_data *op_data;
2837 struct ptlrpc_request *req = NULL;
2839 struct fsxattr fsxattr;
2840 struct cl_object *obj;
2842 /* only root could change project ID */
2843 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2846 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2847 LUSTRE_OPC_ANY, NULL);
2848 if (IS_ERR(op_data))
2849 RETURN(PTR_ERR(op_data));
2851 if (copy_from_user(&fsxattr,
2852 (const struct fsxattr __user *)arg,
2854 GOTO(out_fsxattr1, rc = -EFAULT);
2856 op_data->op_attr_flags = fsxattr.fsx_xflags;
2857 op_data->op_projid = fsxattr.fsx_projid;
2858 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2859 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2861 ptlrpc_req_finished(req);
/* If the file has a data object, mirror the flag change to the OSTs. */
2863 obj = ll_i2info(inode)->lli_clob;
2867 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2868 OBD_ALLOC_PTR(attr);
2870 GOTO(out_fsxattr1, rc = -ENOMEM);
2871 attr->ia_valid = ATTR_ATTR_FLAG;
2872 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2877 ll_finish_md_op_data(op_data);
2881 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2884 struct inode *inode = file_inode(file);
2885 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2886 struct ll_inode_info *lli = ll_i2info(inode);
2887 struct obd_client_handle *och = NULL;
2890 enum mds_op_bias bias = 0;
2891 struct file *layout_file = NULL;
2893 size_t data_size = 0;
2897 mutex_lock(&lli->lli_och_mutex);
2898 if (fd->fd_lease_och != NULL) {
2899 och = fd->fd_lease_och;
2900 fd->fd_lease_och = NULL;
2902 mutex_unlock(&lli->lli_och_mutex);
2905 GOTO(out, rc = -ENOLCK);
2907 fmode = och->och_flags;
2909 switch (ioc->lil_flags) {
2910 case LL_LEASE_RESYNC_DONE:
2911 if (ioc->lil_count > IOC_IDS_MAX)
2912 GOTO(out, rc = -EINVAL);
2914 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2915 OBD_ALLOC(data, data_size);
2917 GOTO(out, rc = -ENOMEM);
2919 if (copy_from_user(data, (void __user *)arg, data_size))
2920 GOTO(out, rc = -EFAULT);
2922 bias = MDS_CLOSE_RESYNC_DONE;
2924 case LL_LEASE_LAYOUT_MERGE: {
2927 if (ioc->lil_count != 1)
2928 GOTO(out, rc = -EINVAL);
2930 arg += sizeof(*ioc);
2931 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
2932 GOTO(out, rc = -EFAULT);
2934 layout_file = fget(fd);
2936 GOTO(out, rc = -EBADF);
2938 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
2939 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
2940 GOTO(out, rc = -EPERM);
2942 data = file_inode(layout_file);
2943 bias = MDS_CLOSE_LAYOUT_MERGE;
2947 /* without close intent */
2951 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2955 rc = ll_lease_och_release(inode, file);
2964 switch (ioc->lil_flags) {
2965 case LL_LEASE_RESYNC_DONE:
2967 OBD_FREE(data, data_size);
2969 case LL_LEASE_LAYOUT_MERGE:
2976 rc = ll_lease_type_from_fmode(fmode);
/*
 * LL_IOC_SET_LEASE handler: acquire a read or write lease on the file
 * (or delegate to ll_file_unlock_lease() for LL_LEASE_UNLCK).  With
 * LL_LEASE_RESYNC the open also initiates a mirror resync and refreshes
 * the fd's layout version.  The acquired handle is stashed in
 * fd->fd_lease_och; only one lease per fd is supported.
 */
2980 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
2983 struct inode *inode = file_inode(file);
2984 struct ll_inode_info *lli = ll_i2info(inode);
2985 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2986 struct obd_client_handle *och = NULL;
2987 __u64 open_flags = 0;
/* Requested lease mode must be compatible with how the file was opened. */
2993 switch (ioc->lil_mode) {
2994 case LL_LEASE_WRLCK:
2995 if (!(file->f_mode & FMODE_WRITE))
2997 fmode = FMODE_WRITE;
2999 case LL_LEASE_RDLCK:
3000 if (!(file->f_mode & FMODE_READ))
3004 case LL_LEASE_UNLCK:
3005 RETURN(ll_file_unlock_lease(file, ioc, arg));
3010 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3012 /* apply for lease */
3013 if (ioc->lil_flags & LL_LEASE_RESYNC)
3014 open_flags = MDS_OPEN_RESYNC;
3015 och = ll_lease_open(inode, file, fmode, open_flags);
3017 RETURN(PTR_ERR(och));
3019 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3020 rc = ll_lease_file_resync(och, inode);
/* On resync/refresh failure, drop the just-opened lease. */
3022 ll_lease_close(och, inode, NULL);
3025 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3027 ll_lease_close(och, inode, NULL);
/* Install the lease handle on the fd unless one already exists. */
3033 mutex_lock(&lli->lli_och_mutex);
3034 if (fd->fd_lease_och == NULL) {
3035 fd->fd_lease_och = och;
3038 mutex_unlock(&lli->lli_och_mutex);
3040 /* impossible now that only excl is supported for now */
3041 ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular files on the Lustre client.
 * Each case either handles the command locally, forwards it to the
 * MDS/OST via obd_iocontrol(), or delegates to a helper above.
 * (Many case labels and RETURN lines are elided in this view.)
 */
3048 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3050 struct inode *inode = file_inode(file);
3051 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3055 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3056 PFID(ll_inode2fid(inode)), inode, cmd);
3057 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3059 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3060 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3064 case LL_IOC_GETFLAGS:
3065 /* Get the current value of the file flags */
3066 return put_user(fd->fd_flags, (int __user *)arg);
3067 case LL_IOC_SETFLAGS:
3068 case LL_IOC_CLRFLAGS:
3069 /* Set or clear specific file flags */
3070 /* XXX This probably needs checks to ensure the flags are
3071 * not abused, and to handle any flag side effects.
3073 if (get_user(flags, (int __user *) arg))
/* LL_FILE_IGNORE_LOCK only makes sense for O_DIRECT files. */
3076 if (cmd == LL_IOC_SETFLAGS) {
3077 if ((flags & LL_FILE_IGNORE_LOCK) &&
3078 !(file->f_flags & O_DIRECT)) {
3079 CERROR("%s: unable to disable locking on "
3080 "non-O_DIRECT file\n", current->comm);
3084 fd->fd_flags |= flags;
3086 fd->fd_flags &= ~flags;
3089 case LL_IOC_LOV_SETSTRIPE:
3090 case LL_IOC_LOV_SETSTRIPE_NEW:
3091 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3092 case LL_IOC_LOV_SETEA:
3093 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3094 case LL_IOC_LOV_SWAP_LAYOUTS: {
3096 struct lustre_swap_layouts lsl;
3098 if (copy_from_user(&lsl, (char __user *)arg,
3099 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable to swap layouts. */
3102 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3105 file2 = fget(lsl.sl_fd);
3109 /* O_WRONLY or O_RDWR */
3110 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3111 GOTO(out, rc = -EPERM);
3113 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3114 struct inode *inode2;
3115 struct ll_inode_info *lli;
3116 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE requires a lease on this fd; steal it. */
3118 lli = ll_i2info(inode);
3119 mutex_lock(&lli->lli_och_mutex);
3120 if (fd->fd_lease_och != NULL) {
3121 och = fd->fd_lease_och;
3122 fd->fd_lease_och = NULL;
3124 mutex_unlock(&lli->lli_och_mutex);
3126 GOTO(out, rc = -ENOLCK);
3127 inode2 = file_inode(file2);
3128 rc = ll_swap_layouts_close(och, inode, inode2);
3130 rc = ll_swap_layouts(file, file2, &lsl);
3136 case LL_IOC_LOV_GETSTRIPE:
3137 case LL_IOC_LOV_GETSTRIPE_NEW:
3138 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3139 case FSFILT_IOC_GETFLAGS:
3140 case FSFILT_IOC_SETFLAGS:
3141 RETURN(ll_iocontrol(inode, file, cmd, arg));
3142 case FSFILT_IOC_GETVERSION_OLD:
3143 case FSFILT_IOC_GETVERSION:
3144 RETURN(put_user(inode->i_generation, (int __user *)arg));
3145 case LL_IOC_GROUP_LOCK:
3146 RETURN(ll_get_grouplock(inode, file, arg));
3147 case LL_IOC_GROUP_UNLOCK:
3148 RETURN(ll_put_grouplock(inode, file, arg));
3149 case IOC_OBD_STATFS:
3150 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3152 /* We need to special case any other ioctls we want to handle,
3153 * to send them to the MDS/OST as appropriate and to properly
3154 * network encode the arg field.
3155 case FSFILT_IOC_SETVERSION_OLD:
3156 case FSFILT_IOC_SETVERSION:
3158 case LL_IOC_FLUSHCTX:
3159 RETURN(ll_flush_ctx(inode));
3160 case LL_IOC_PATH2FID: {
3161 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3162 sizeof(struct lu_fid)))
3167 case LL_IOC_GETPARENT:
3168 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3170 case OBD_IOC_FID2PATH:
3171 RETURN(ll_fid2path(inode, (void __user *)arg));
3172 case LL_IOC_DATA_VERSION: {
3173 struct ioc_data_version idv;
3176 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush flags are meaningful from userspace. */
3179 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3180 rc = ll_ioc_data_version(inode, &idv);
3183 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3189 case LL_IOC_GET_MDTIDX: {
3192 mdtidx = ll_get_mdt_idx(inode);
3196 if (put_user((int)mdtidx, (int __user *)arg))
3201 case OBD_IOC_GETDTNAME:
3202 case OBD_IOC_GETMDNAME:
3203 RETURN(ll_get_obd_name(inode, cmd, arg));
3204 case LL_IOC_HSM_STATE_GET: {
3205 struct md_op_data *op_data;
3206 struct hsm_user_state *hus;
3213 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3214 LUSTRE_OPC_ANY, hus);
3215 if (IS_ERR(op_data)) {
3217 RETURN(PTR_ERR(op_data));
/* Ask the MDT for the HSM state and copy it back to userspace. */
3220 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3223 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3226 ll_finish_md_op_data(op_data);
3230 case LL_IOC_HSM_STATE_SET: {
3231 struct hsm_state_set *hss;
3238 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3243 rc = ll_hsm_state_set(inode, hss);
3248 case LL_IOC_HSM_ACTION: {
3249 struct md_op_data *op_data;
3250 struct hsm_current_action *hca;
3257 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3258 LUSTRE_OPC_ANY, hca);
3259 if (IS_ERR(op_data)) {
3261 RETURN(PTR_ERR(op_data));
3264 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3267 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3270 ll_finish_md_op_data(op_data);
3274 case LL_IOC_SET_LEASE_OLD: {
/* Legacy variant: the lease mode is passed directly in arg. */
3275 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3277 RETURN(ll_file_set_lease(file, &ioc, 0));
3279 case LL_IOC_SET_LEASE: {
3280 struct ll_ioc_lease ioc;
3282 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3285 RETURN(ll_file_set_lease(file, &ioc, arg));
3287 case LL_IOC_GET_LEASE: {
3288 struct ll_inode_info *lli = ll_i2info(inode);
3289 struct ldlm_lock *lock = NULL;
/* Report the lease mode only if the lease lock is still granted. */
3292 mutex_lock(&lli->lli_och_mutex);
3293 if (fd->fd_lease_och != NULL) {
3294 struct obd_client_handle *och = fd->fd_lease_och;
3296 lock = ldlm_handle2lock(&och->och_lease_handle);
3298 lock_res_and_lock(lock);
3299 if (!ldlm_is_cancel(lock))
3300 fmode = och->och_flags;
3302 unlock_res_and_lock(lock);
3303 LDLM_LOCK_PUT(lock);
3306 mutex_unlock(&lli->lli_och_mutex);
3308 RETURN(ll_lease_type_from_fmode(fmode));
3310 case LL_IOC_HSM_IMPORT: {
3311 struct hsm_user_import *hui;
3317 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3322 rc = ll_hsm_import(inode, file, hui);
3327 case LL_IOC_FUTIMES_3: {
3328 struct ll_futimes_3 lfu;
3330 if (copy_from_user(&lfu,
3331 (const struct ll_futimes_3 __user *)arg,
3335 RETURN(ll_file_futimes_3(file, &lfu));
3337 case LL_IOC_LADVISE: {
3338 struct llapi_ladvise_hdr *k_ladvise_hdr;
3339 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3342 int alloc_size = sizeof(*k_ladvise_hdr);
3345 u_ladvise_hdr = (void __user *)arg;
3346 OBD_ALLOC_PTR(k_ladvise_hdr);
3347 if (k_ladvise_hdr == NULL)
/* First copy just the header to learn the advice count. */
3350 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3351 GOTO(out_ladvise, rc = -EFAULT);
3353 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3354 k_ladvise_hdr->lah_count < 1)
3355 GOTO(out_ladvise, rc = -EINVAL);
3357 num_advise = k_ladvise_hdr->lah_count;
3358 if (num_advise >= LAH_COUNT_MAX)
3359 GOTO(out_ladvise, rc = -EFBIG);
/* Reallocate to fit the full header plus all advice entries, then
 * re-copy everything from userspace. */
3361 OBD_FREE_PTR(k_ladvise_hdr);
3362 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3363 lah_advise[num_advise]);
3364 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3365 if (k_ladvise_hdr == NULL)
3369 * TODO: submit multiple advices to one server in a single RPC
3371 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3372 GOTO(out_ladvise, rc = -EFAULT);
3374 for (i = 0; i < num_advise; i++) {
3375 struct llapi_lu_ladvise *k_ladvise =
3376 &k_ladvise_hdr->lah_advise[i];
3377 struct llapi_lu_ladvise __user *u_ladvise =
3378 &u_ladvise_hdr->lah_advise[i];
3380 rc = ll_ladvise_sanity(inode, k_ladvise);
3382 GOTO(out_ladvise, rc);
3384 switch (k_ladvise->lla_advice) {
3385 case LU_LADVISE_LOCKNOEXPAND:
3386 rc = ll_lock_noexpand(file,
3387 k_ladvise->lla_peradvice_flags);
3388 GOTO(out_ladvise, rc);
3389 case LU_LADVISE_LOCKAHEAD:
3391 rc = ll_file_lock_ahead(file, k_ladvise);
3394 GOTO(out_ladvise, rc);
/* Lockahead returns a per-advice detailed result; write it
 * back into the caller's corresponding entry. */
3397 &u_ladvise->lla_lockahead_result))
3398 GOTO(out_ladvise, rc = -EFAULT);
3401 rc = ll_ladvise(inode, file,
3402 k_ladvise_hdr->lah_flags,
3405 GOTO(out_ladvise, rc);
3412 OBD_FREE(k_ladvise_hdr, alloc_size);
3415 case LL_IOC_FLR_SET_MIRROR: {
3416 /* mirror I/O must be direct to avoid polluting page cache
3418 if (!(file->f_flags & O_DIRECT))
3421 fd->fd_designated_mirror = (__u32)arg;
3424 case LL_IOC_FSGETXATTR:
3425 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3426 case LL_IOC_FSSETXATTR:
3427 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3429 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Default: forward any unrecognized command to the data export. */
3431 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3432 (void __user *)arg));
3436 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat fallback (kernels lacking generic_file_llseek_size): validate
 * @offset against the file's limits and commit it to f_pos.  The error
 * RETURN lines (presumably -EINVAL too-negative / too-large) are elided
 * in this view — confirm against the full source.
 */
3437 static inline loff_t
3438 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3440 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3442 if (offset > maxsize)
/* Resetting f_version invalidates any cached readdir/seek state. */
3445 if (offset != file->f_pos) {
3446 file->f_pos = offset;
3447 file->f_version = 0;
/*
 * Compat copy of the kernel's generic_file_llseek_size(): llseek with a
 * caller-supplied max file size and eof, supporting SEEK_SET/CUR/END and
 * SEEK_DATA/SEEK_HOLE.  (The switch labels are elided in this view.)
 */
3453 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3454 loff_t maxsize, loff_t eof)
3456 struct inode *inode = file_inode(file);
3464 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3465 * position-querying operation. Avoid rewriting the "same"
3466 * f_pos value back to the file because a concurrent read(),
3467 * write() or lseek() might have altered it
3472 * f_lock protects against read/modify/write race with other
3473 * SEEK_CURs. Note that parallel writes and reads behave
3477 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3478 inode_unlock(inode);
3482 * In the generic case the entire file is data, so as long as
3483 * offset isn't at the end of the file then the offset is data.
3490 * There is a virtual hole at the end of the file, so as long as
3491 * offset isn't i_size or larger, return i_size.
3499 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point for Lustre files.  For SEEK_END/HOLE/DATA the file
 * size must be up to date, so glimpse the size from the OSTs first, then
 * delegate to the generic llseek helper with Lustre's max file size.
 */
3503 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3505 struct inode *inode = file_inode(file);
3506 loff_t retval, eof = 0;
/* Computed only for the trace message below; recomputed by the helper. */
3509 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3510 (origin == SEEK_CUR) ? file->f_pos : 0);
3511 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3512 PFID(ll_inode2fid(inode)), inode, retval, retval,
3514 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3516 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3517 retval = ll_glimpse_size(inode);
3520 eof = i_size_read(inode);
3523 retval = ll_generic_file_llseek_size(file, offset, origin,
3524 ll_file_maxbytes(inode), eof);
/*
 * flush() file operation: report (and clear) any asynchronous writeback
 * errors recorded for this inode.  Does not force dirty pages out; it only
 * surfaces errors from writes that already failed in the background.
 * Returns -EIO if an unreported async error is pending, 0 otherwise.
 */
3528 static int ll_flush(struct file *file, fl_owner_t id)
3530 struct inode *inode = file_inode(file);
3531 struct ll_inode_info *lli = ll_i2info(inode);
3532 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3535 LASSERT(!S_ISDIR(inode->i_mode));
3537 /* catch async errors that were recorded back when async writeback
3538 * failed for pages in this mapping. */
3539 rc = lli->lli_async_rc;
3540 lli->lli_async_rc = 0;
3541 if (lli->lli_clob != NULL) {
3542 err = lov_read_and_clear_async_rc(lli->lli_clob);
3547 /* The application has been told write failure already.
3548 * Do not report failure again. */
3549 if (fd->fd_write_failed)
3551 return rc ? -EIO : 0;
3555 * Called to make sure a portion of file has been written out.
3556 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3558 * Return how many pages have been written.
3560 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3561 enum cl_fsync_mode mode, int ignore_layout)
3565 struct cl_fsync_io *fio;
/* Reject any mode value outside the known cl_fsync_mode set. */
3570 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3571 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3574 env = cl_env_get(&refcheck);
3576 RETURN(PTR_ERR(env));
3578 io = vvp_env_thread_io(env);
3579 io->ci_obj = ll_i2info(inode)->lli_clob;
3580 io->ci_ignore_layout = ignore_layout;
3582 /* initialize parameters for sync */
3583 fio = &io->u.ci_fsync;
3584 fio->fi_start = start;
3586 fio->fi_fid = ll_inode2fid(inode);
3587 fio->fi_mode = mode;
3588 fio->fi_nr_written = 0;
3590 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3591 result = cl_io_loop(env, io);
3593 result = io->ci_result;
/* On success, report the number of pages written by the fsync io. */
3595 result = fio->fi_nr_written;
3596 cl_io_fini(env, io);
3597 cl_env_put(env, &refcheck);
3603 * When dentry is provided (the 'else' case), file_dentry() may be
3604 * null and dentry must be used directly rather than pulled from
3605 * file_dentry() as is done otherwise.
/*
 * fsync() entry point, with three compat signatures depending on the
 * kernel: (file, start, end, datasync), (file, datasync) or
 * (file, dentry, datasync).  Flushes dirty pages, collects recorded
 * async errors, syncs metadata via md_fsync(), and for regular files
 * syncs data to the OSTs with cl_sync_file_range(CL_FSYNC_ALL).
 */
3608 #ifdef HAVE_FILE_FSYNC_4ARGS
3609 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3611 struct dentry *dentry = file_dentry(file);
3613 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3614 int ll_fsync(struct file *file, int datasync)
3616 struct dentry *dentry = file_dentry(file);
3618 loff_t end = LLONG_MAX;
3620 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3623 loff_t end = LLONG_MAX;
3625 struct inode *inode = dentry->d_inode;
3626 struct ll_inode_info *lli = ll_i2info(inode);
3627 struct ptlrpc_request *req;
3631 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3632 PFID(ll_inode2fid(inode)), inode);
3633 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3635 #ifdef HAVE_FILE_FSYNC_4ARGS
3636 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Skip re-locking if the caller already holds the inode lock. */
3637 lock_inode = !lli->lli_inode_locked;
3641 /* fsync's caller has already called _fdata{sync,write}, we want
3642 * that IO to finish before calling the osc and mdc sync methods */
3643 rc = filemap_fdatawait(inode->i_mapping);
3646 /* catch async errors that were recorded back when async writeback
3647 * failed for pages in this mapping. */
3648 if (!S_ISDIR(inode->i_mode)) {
3649 err = lli->lli_async_rc;
3650 lli->lli_async_rc = 0;
3653 if (lli->lli_clob != NULL) {
3654 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT. */
3660 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3664 ptlrpc_req_finished(req);
3666 if (S_ISREG(inode->i_mode)) {
3667 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3669 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3670 if (rc == 0 && err < 0)
/* Track whether the app still needs to be told about a failure. */
3673 fd->fd_write_failed = true;
3675 fd->fd_write_failed = false;
3678 #ifdef HAVE_FILE_FSYNC_4ARGS
3680 inode_unlock(inode);
/*
 * flock()/fcntl() byte-range and whole-file locking entry point.
 * Translates the VFS file_lock into an LDLM_FLOCK enqueue sent to the
 * MDT, then mirrors the result into the local VFS lock state so the
 * kernel's deadlock detection and bookkeeping stay consistent.
 */
3686 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3688 struct inode *inode = file_inode(file);
3689 struct ll_sb_info *sbi = ll_i2sbi(inode);
3690 struct ldlm_enqueue_info einfo = {
3691 .ei_type = LDLM_FLOCK,
3692 .ei_cb_cp = ldlm_flock_completion_ast,
3693 .ei_cbdata = file_lock,
3695 struct md_op_data *op_data;
3696 struct lustre_handle lockh = { 0 };
3697 union ldlm_policy_data flock = { { 0 } };
3698 int fl_type = file_lock->fl_type;
3704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3705 PFID(ll_inode2fid(inode)), file_lock);
3707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3709 if (file_lock->fl_flags & FL_FLOCK) {
3710 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3711 /* flocks are whole-file locks */
3712 flock.l_flock.end = OFFSET_MAX;
3713 /* For flocks owner is determined by the local file descriptor */
3714 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3715 } else if (file_lock->fl_flags & FL_POSIX) {
3716 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3717 flock.l_flock.start = file_lock->fl_start;
3718 flock.l_flock.end = file_lock->fl_end;
3722 flock.l_flock.pid = file_lock->fl_pid;
3724 /* Somewhat ugly workaround for svc lockd.
3725 * lockd installs custom fl_lmops->lm_compare_owner that checks
3726 * for the fl_owner to be the same (which it always is on local node
3727 * I guess between lockd processes) and then compares pid.
3728 * As such we assign pid to the owner field to make it all work,
3729 * conflict with normal locks is unlikely since pid space and
3730 * pointer space for current->files are not intersecting */
3731 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3732 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode (switch labels elided). */
3736 einfo.ei_mode = LCK_PR;
3739 /* An unlock request may or may not have any relation to
3740 * existing locks so we may not be able to pass a lock handle
3741 * via a normal ldlm_lock_cancel() request. The request may even
3742 * unlock a byte range in the middle of an existing lock. In
3743 * order to process an unlock request we need all of the same
3744 * information that is given with a normal read or write record
3745 * lock request. To avoid creating another ldlm unlock (cancel)
3746 * message we'll treat a LCK_NL flock request as an unlock. */
3747 einfo.ei_mode = LCK_NL;
3750 einfo.ei_mode = LCK_PW;
3753 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to LDLM enqueue flags (labels elided):
 * non-blocking requests use BLOCK_NOWAIT, F_GETLK uses TEST_LOCK. */
3768 flags = LDLM_FL_BLOCK_NOWAIT;
3774 flags = LDLM_FL_TEST_LOCK;
3777 CERROR("unknown fcntl lock command: %d\n", cmd);
3781 /* Save the old mode so that if the mode in the lock changes we
3782 * can decrement the appropriate reader or writer refcount. */
3783 file_lock->fl_type = einfo.ei_mode;
3785 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3786 LUSTRE_OPC_ANY, NULL);
3787 if (IS_ERR(op_data))
3788 RETURN(PTR_ERR(op_data));
3790 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3791 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3792 flock.l_flock.pid, flags, einfo.ei_mode,
3793 flock.l_flock.start, flock.l_flock.end);
3795 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3798 /* Restore the file lock type if not TEST lock. */
3799 if (!(flags & LDLM_FL_TEST_LOCK))
3800 file_lock->fl_type = fl_type;
3802 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3803 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3804 !(flags & LDLM_FL_TEST_LOCK))
3805 rc2 = locks_lock_file_wait(file, file_lock);
3807 if ((file_lock->fl_flags & FL_FLOCK) &&
3808 (rc == 0 || file_lock->fl_type == F_UNLCK))
3809 rc2 = flock_lock_file_wait(file, file_lock);
3810 if ((file_lock->fl_flags & FL_POSIX) &&
3811 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3812 !(flags & LDLM_FL_TEST_LOCK))
3813 rc2 = posix_lock_file_wait(file, file_lock);
3814 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* If the local VFS bookkeeping failed, undo the server-side lock by
 * enqueueing an LCK_NL (unlock) request. */
3816 if (rc2 && file_lock->fl_type != F_UNLCK) {
3817 einfo.ei_mode = LCK_NL;
3818 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3823 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDT getattr-by-name RPC.
 * On success *fid is filled in; if @inode is non-NULL the inode is also
 * instantiated from the reply via ll_prep_inode().
 */
3828 int ll_get_fid_by_name(struct inode *parent, const char *name,
3829 int namelen, struct lu_fid *fid,
3830 struct inode **inode)
3832 struct md_op_data *op_data = NULL;
3833 struct mdt_body *body;
3834 struct ptlrpc_request *req;
3838 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3839 LUSTRE_OPC_ANY, NULL);
3840 if (IS_ERR(op_data))
3841 RETURN(PTR_ERR(op_data));
3843 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3844 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3845 ll_finish_md_op_data(op_data);
3849 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3851 GOTO(out_req, rc = -EFAULT);
3853 *fid = body->mbo_fid1;
3856 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3858 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to MDT @mdtidx ("lfs migrate -m").
 * Resolves the child (dcache first, then by-name RPC), takes a write lease
 * on regular files so in-flight data is versioned, and performs the move
 * as a rename with CLI_MIGRATE / MDS_RENAME_MIGRATE bias.
 */
3862 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3863 const char *name, int namelen)
3865 struct dentry *dchild = NULL;
3866 struct inode *child_inode = NULL;
3867 struct md_op_data *op_data;
3868 struct ptlrpc_request *request = NULL;
3869 struct obd_client_handle *och = NULL;
3871 struct mdt_body *body;
3873 __u64 data_version = 0;
3876 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3877 name, PFID(ll_inode2fid(parent)), mdtidx);
3879 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3880 0, LUSTRE_OPC_ANY, NULL);
3881 if (IS_ERR(op_data))
3882 RETURN(PTR_ERR(op_data));
3884 /* Get child FID first */
3885 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Try the dcache first; fall back to a by-name RPC below. */
3888 dchild = d_lookup(file_dentry(file), &qstr);
3889 if (dchild != NULL) {
3890 if (dchild->d_inode != NULL)
3891 child_inode = igrab(dchild->d_inode);
3895 if (child_inode == NULL) {
3896 rc = ll_get_fid_by_name(parent, name, namelen,
3897 &op_data->op_fid3, &child_inode);
3902 if (child_inode == NULL)
3903 GOTO(out_free, rc = -EINVAL);
3906 * lfs migrate command needs to be blocked on the client
3907 * by checking the migrate FID against the FID of the
3910 if (child_inode == parent->i_sb->s_root->d_inode)
3911 GOTO(out_iput, rc = -EINVAL);
3913 inode_lock(child_inode);
3914 op_data->op_fid3 = *ll_inode2fid(child_inode);
3915 if (!fid_is_sane(&op_data->op_fid3)) {
3916 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3917 ll_get_fsname(parent->i_sb, NULL, 0), name,
3918 PFID(&op_data->op_fid3));
3919 GOTO(out_unlock, rc = -EINVAL);
/* Skip the migration if the child already lives on the target MDT. */
3922 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3924 GOTO(out_unlock, rc);
3927 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3928 PFID(&op_data->op_fid3), mdtidx);
3929 GOTO(out_unlock, rc = 0);
/* For regular files, take a write lease and record the data version
 * so the server can detect concurrent modification during migrate. */
3932 if (S_ISREG(child_inode->i_mode)) {
3933 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3937 GOTO(out_unlock, rc);
3940 rc = ll_data_version(child_inode, &data_version,
3943 GOTO(out_close, rc);
3945 op_data->op_handle = och->och_fh;
3946 op_data->op_data = och->och_mod;
3947 op_data->op_data_version = data_version;
3948 op_data->op_lease_handle = och->och_lease_handle;
3949 op_data->op_bias |= MDS_RENAME_MIGRATE;
3952 op_data->op_mds = mdtidx;
3953 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a same-name rename onto the target MDT. */
3954 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3955 namelen, name, namelen, &request);
3957 LASSERT(request != NULL);
3958 ll_update_times(request, parent);
3960 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3961 LASSERT(body != NULL);
3963 /* If the server does release layout lock, then we cleanup
3964 * the client och here, otherwise release it in out_close: */
3966 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3967 obd_mod_put(och->och_mod);
3968 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3970 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3976 if (request != NULL) {
3977 ptlrpc_req_finished(request);
3981 /* Try again if the file layout has changed. */
3982 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3986 if (och != NULL) /* close the file */
3987 ll_lease_close(och, child_inode, NULL);
/* NOTE(review): clear_nlink() here presumably runs only on successful
 * migration (condition elided) so the old inode is dropped — confirm. */
3989 clear_nlink(child_inode);
3991 inode_unlock(child_inode);
3995 ll_finish_md_op_data(op_data);
/* Stub lock method for mounts with flock disabled (body elided in this
 * view; presumably returns -ENOSYS — confirm against the full source). */
4000 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4008 * test if some locks matching bits and l_req_mode are acquired
4009 * - bits can be in different locks
4010 * - if found clear the common lock bits in *bits
4011 * - the bits not found, are kept in *bits
4013 * \param bits [IN] searched lock bits [IN]
4014 * \param l_req_mode [IN] searched lock mode
4015 * \retval boolean, true iff all bits are found
4017 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4019 struct lustre_handle lockh;
4020 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four standard modes. */
4021 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4022 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4031 fid = &ll_i2info(inode)->lli_fid;
4032 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4033 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks. */
4035 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit individually; stop once all found. */
4036 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4037 policy.l_inodebits.bits = *bits & (1 << i);
4038 if (policy.l_inodebits.bits == 0)
4041 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4042 &policy, mode, &lockh)) {
4043 struct ldlm_lock *lock;
4045 lock = ldlm_handle2lock(&lockh);
4048 ~(lock->l_policy_data.l_inodebits.bits);
4049 LDLM_LOCK_PUT(lock);
4051 *bits &= ~policy.l_inodebits.bits;
/* Try to match an existing metadata inodebits lock on this inode.
 * Unlike ll_have_md_lock() this does not pass LDLM_FL_TEST_LOCK, so a
 * successful match takes a reference on the lock and returns its handle
 * in *lockh for the caller to release (see use in ll_layout_refresh()).
 * Returns the matched mode, or 0 if no lock matched. */
4058 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4059 struct lustre_handle *lockh, __u64 flags,
4060 enum ldlm_mode mode)
4062 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4067 fid = &ll_i2info(inode)->lli_fid;
4068 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4070 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4071 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a metadata revalidation RPC:
 * - -ENOENT on a plain inode means "already unlinked" and is mostly
 *   downgraded (striped directories with bad stripes are re-validated);
 * - other errors are logged, with EACCES/EIDRM rated D_INFO only since
 *   they can occur in normal operation. */
4076 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4078 /* Already unlinked. Just update nlink and return success */
4079 if (rc == -ENOENT) {
4081 /* If it is striped directory, and there is bad stripe
4082 * Let's revalidate the dentry again, instead of returning
4084 if (S_ISDIR(inode->i_mode) &&
4085 ll_i2info(inode)->lli_lsm_md != NULL)
4088 /* This path cannot be hit for regular files unless in
4089 * case of obscure races, so no need to validate
4091 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4093 } else if (rc != 0) {
4094 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4095 "%s: revalidate FID "DFID" error: rc = %d\n",
4096 ll_get_fsname(inode->i_sb, NULL, 0),
4097 PFID(ll_inode2fid(inode)), rc);
/* Revalidate an inode's metadata by issuing an intent-lock RPC (getattr
 * by FID — no name is supplied) to the MDS, then refreshing the dentry
 * state from the reply.  Unlinked inodes get their dentry invalidated. */
4103 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4105 struct inode *inode = dentry->d_inode;
4106 struct obd_export *exp = ll_i2mdexp(inode);
4107 struct lookup_intent oit = {
4110 struct ptlrpc_request *req = NULL;
4111 struct md_op_data *op_data;
4115 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4116 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4118 /* Call getattr by fid, so do not provide name at all. */
4119 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4120 LUSTRE_OPC_ANY, NULL);
4121 if (IS_ERR(op_data))
4122 RETURN(PTR_ERR(op_data));
4124 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4125 ll_finish_md_op_data(op_data);
/* Error path: map the RPC failure (e.g. -ENOENT handling) and bail. */
4127 rc = ll_inode_revalidate_fini(inode, rc);
4131 rc = ll_revalidate_it_finish(req, &oit, dentry);
4133 ll_intent_release(&oit);
4137 /* Unlinked? Unhash dentry, so it is not picked up later by
4138 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4139 * here to preserve get_cwd functionality on 2.6.
4141 if (!dentry->d_inode->i_nlink) {
4142 ll_lock_dcache(inode);
4143 d_lustre_invalidate(dentry, 0);
4144 ll_unlock_dcache(inode);
4147 ll_lookup_finish_locks(&oit, dentry);
4149 ptlrpc_req_finished(req);
/* For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr()) and apply the combined nlink/blocks/size/times to
 * the master inode.  Caller must ensure lli_lsm_md is set. */
4154 static int ll_merge_md_attr(struct inode *inode)
4156 struct cl_attr attr = { 0 };
4159 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4160 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4161 &attr, ll_md_blocking_ast);
4165 set_nlink(inode, attr.cat_nlink);
4166 inode->i_blocks = attr.cat_blocks;
4167 i_size_write(inode, attr.cat_size);
/* Cache merged timestamps in the Lustre inode info; the VFS inode
 * timestamps are refreshed from these by the getattr path. */
4169 ll_i2info(inode)->lli_atime = attr.cat_atime;
4170 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4171 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4176 static inline dev_t ll_compat_encode_dev(dev_t dev)
4178 /* The compat_sys_*stat*() syscalls will fail unless the
4179 * device majors and minors are both less than 256. Note that
4180 * the value returned here will be passed through
4181 * old_encode_dev() in cp_compat_stat(). And so we are not
4182 * trying to return a valid compat (u16) device number, just
4183 * one that will pass the old_valid_dev() check. */
4185 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* ->getattr() handler.  Two signatures are compiled depending on the
 * kernel API (HAVE_INODEOPS_ENHANCED_GETATTR selects the struct path
 * variant).  Revalidates the inode via IT_GETATTR, glimpses the size of
 * regular files (skipped while an HSM restore is running, since the MDT
 * then holds the size), merges stripe attributes for striped dirs, and
 * fills *stat, squashing dev numbers for 32-bit API callers. */
4188 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4189 int ll_getattr(const struct path *path, struct kstat *stat,
4190 u32 request_mask, unsigned int flags)
4192 struct dentry *de = path->dentry;
4194 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4197 struct inode *inode = de->d_inode;
4198 struct ll_sb_info *sbi = ll_i2sbi(inode);
4199 struct ll_inode_info *lli = ll_i2info(inode);
4202 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4204 rc = ll_inode_revalidate(de, IT_GETATTR);
4208 if (S_ISREG(inode->i_mode)) {
4209 /* In case of restore, the MDT has the right size and has
4210 * already send it back without granting the layout lock,
4211 * inode is up-to-date so glimpse is useless.
4212 * Also to glimpse we need the layout, in case of a running
4213 * restore the MDT holds the layout lock so the glimpse will
4214 * block up to the end of restore (getattr will block)
4216 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4217 rc = ll_glimpse_size(inode);
4222 /* If object isn't a regular file then don't validate size. */
4223 if (S_ISDIR(inode->i_mode) &&
4224 lli->lli_lsm_md != NULL) {
4225 rc = ll_merge_md_attr(inode);
/* Refresh VFS inode timestamps from the cached Lustre values. */
4230 LTIME_S(inode->i_atime) = lli->lli_atime;
4231 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4232 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4235 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs an ino that fits in 32 bits and 8-bit
 * major/minor device numbers. */
4237 if (ll_need_32bit_api(sbi)) {
4238 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4239 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4240 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4242 stat->ino = inode->i_ino;
4243 stat->dev = inode->i_sb->s_dev;
4244 stat->rdev = inode->i_rdev;
4247 stat->mode = inode->i_mode;
4248 stat->uid = inode->i_uid;
4249 stat->gid = inode->i_gid;
4250 stat->atime = inode->i_atime;
4251 stat->mtime = inode->i_mtime;
4252 stat->ctime = inode->i_ctime;
/* Prefer the per-fs stat blocksize when configured ("?:" keeps the
 * inode blocksize as fallback). */
4253 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4255 stat->nlink = inode->i_nlink;
4256 stat->size = i_size_read(inode);
4257 stat->blocks = inode->i_blocks;
/* ->fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap sized for fi_extents_max extents, run the mapping via
 * ll_do_fiemap(), and copy the mapped extents back to userspace.
 * NOTE(review): only the first fiemap_extent is copied in from
 * userspace (used to pass FIEMAP_EXTENT_LAST continuation state) —
 * this is intentional, not an undersized copy of the whole array. */
4262 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4263 __u64 start, __u64 len)
4267 struct fiemap *fiemap;
4268 unsigned int extent_count = fieinfo->fi_extents_max;
4270 num_bytes = sizeof(*fiemap) + (extent_count *
4271 sizeof(struct fiemap_extent));
4272 OBD_ALLOC_LARGE(fiemap, num_bytes);
4277 fiemap->fm_flags = fieinfo->fi_flags;
4278 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4279 fiemap->fm_start = start;
4280 fiemap->fm_length = len;
4281 if (extent_count > 0 &&
4282 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4283 sizeof(struct fiemap_extent)) != 0)
4284 GOTO(out, rc = -EFAULT);
4286 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Propagate result flags/counts, then copy mapped extents back out. */
4288 fieinfo->fi_flags = fiemap->fm_flags;
4289 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4290 if (extent_count > 0 &&
4291 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4292 fiemap->fm_mapped_extents *
4293 sizeof(struct fiemap_extent)) != 0)
4294 GOTO(out, rc = -EFAULT);
4296 OBD_FREE_LARGE(fiemap, num_bytes);
/* ->get_acl() handler: return a referenced copy of the POSIX ACL cached
 * on the Lustre inode info, under lli_lock.  The caller (VFS permission
 * checking) drops the reference taken by posix_acl_dup(). */
4300 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4302 struct ll_inode_info *lli = ll_i2info(inode);
4303 struct posix_acl *acl = NULL;
4306 spin_lock(&lli->lli_lock);
4307 /* VFS' acl_permission_check->check_acl will release the refcount */
4308 acl = posix_acl_dup(lli->lli_posix_acl);
4309 spin_unlock(&lli->lli_lock);
/* ->set_acl() handler (only when the kernel has the set_acl inode op and
 * POSIX ACLs are enabled): serialize the ACL to xattr form and store it
 * via __vfs_setxattr() under the matching system.posix_acl_* name, then
 * update or drop the VFS ACL cache accordingly. */
4314 #ifdef HAVE_IOP_SET_ACL
4315 #ifdef CONFIG_FS_POSIX_ACL
4316 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4318 const char *name = NULL;
4325 case ACL_TYPE_ACCESS:
/* An access ACL may imply a mode change; let the VFS helper adjust
 * i_mode and possibly drop redundant ACL entries. */
4327 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4331 name = XATTR_NAME_POSIX_ACL_ACCESS;
4333 case ACL_TYPE_DEFAULT:
/* Default ACLs only make sense on directories. */
4334 if (!S_ISDIR(inode->i_mode))
4335 GOTO(out, rc = acl ? -EACCES : 0);
4336 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4339 GOTO(out, rc = -EINVAL);
4343 size = posix_acl_xattr_size(acl->a_count);
4344 value = kmalloc(size, GFP_NOFS);
4346 GOTO(out, rc = -ENOMEM);
4348 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4353 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4354 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4359 set_cached_acl(inode, type, acl);
4361 forget_cached_acl(inode, type);
4364 #endif /* CONFIG_FS_POSIX_ACL */
4365 #endif /* HAVE_IOP_SET_ACL */
/* ACL-check callback for older kernels lacking 2-arg generic_permission.
 * Signature differs with HAVE_GENERIC_PERMISSION_4ARGS; in RCU walk mode
 * (IPERM_FLAG_RCU) we cannot sleep, so the check is deferred.  Without
 * CONFIG_FS_POSIX_ACL this compiles to a stub (branch elided here). */
4367 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4369 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4370 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4372 ll_check_acl(struct inode *inode, int mask)
4375 # ifdef CONFIG_FS_POSIX_ACL
4376 struct posix_acl *acl;
4380 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4381 if (flags & IPERM_FLAG_RCU)
4384 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4389 rc = posix_acl_permission(inode, acl, mask);
/* Drop the reference taken by ll_get_acl()/posix_acl_dup(). */
4390 posix_acl_release(acl);
4393 # else /* !CONFIG_FS_POSIX_ACL */
4395 # endif /* CONFIG_FS_POSIX_ACL */
4397 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* ->permission() handler.  Three kernel-API signatures are supported via
 * #ifdefs.  Cannot run in RCU-walk mode (may block), revalidates the
 * root inode before checking, and applies root squash: a process running
 * as root on a squashing mount has its fsuid/fsgid replaced and its
 * filesystem capabilities lowered for the duration of the check. */
4399 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4400 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4402 # ifdef HAVE_INODE_PERMISION_2ARGS
4403 int ll_inode_permission(struct inode *inode, int mask)
4405 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4410 struct ll_sb_info *sbi;
4411 struct root_squash_info *squash;
4412 struct cred *cred = NULL;
4413 const struct cred *old_cred = NULL;
4415 bool squash_id = false;
/* Permission checks here may block (RPCs); refuse RCU-walk mode so the
 * VFS retries in ref-walk mode. */
4418 #ifdef MAY_NOT_BLOCK
4419 if (mask & MAY_NOT_BLOCK)
4421 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4422 if (flags & IPERM_FLAG_RCU)
4426 /* as root inode are NOT getting validated in lookup operation,
4427 * need to do it before permission check. */
4429 if (inode == inode->i_sb->s_root->d_inode) {
4430 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4435 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4436 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4438 /* squash fsuid/fsgid if needed */
4439 sbi = ll_i2sbi(inode);
4440 squash = &sbi->ll_squash;
4441 if (unlikely(squash->rsi_uid != 0 &&
4442 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4443 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4447 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4448 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4449 squash->rsi_uid, squash->rsi_gid);
4451 /* update current process's credentials
4452 * and FS capability */
4453 cred = prepare_creds();
4457 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4458 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4459 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4460 if ((1 << cap) & CFS_CAP_FS_MASK)
4461 cap_lower(cred->cap_effective, cap);
4463 old_cred = override_creds(cred);
4466 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4467 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4468 /* restore current process's credentials and FS capability */
4470 revert_creds(old_cred);
/* Default file_operations: used when mounted with -o localflock, which
 * provides only locally-consistent (single-node) flock semantics, so no
 * .flock/.lock entries are installed here.  The read/write entry points
 * differ by kernel API generation (read_iter vs. aio). */
4477 /* -o localflock - only provides locally consistent flock locks */
4478 struct file_operations ll_file_operations = {
4479 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4480 # ifdef HAVE_SYNC_READ_WRITE
4481 .read = new_sync_read,
4482 .write = new_sync_write,
4484 .read_iter = ll_file_read_iter,
4485 .write_iter = ll_file_write_iter,
4486 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4487 .read = ll_file_read,
4488 .aio_read = ll_file_aio_read,
4489 .write = ll_file_write,
4490 .aio_write = ll_file_aio_write,
4491 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4492 .unlocked_ioctl = ll_file_ioctl,
4493 .open = ll_file_open,
4494 .release = ll_file_release,
4495 .mmap = ll_file_mmap,
4496 .llseek = ll_file_seek,
4497 .splice_read = ll_file_splice_read,
/* file_operations for the default (cluster-coherent flock) mount mode:
 * identical to ll_file_operations plus .flock/.lock wired to
 * ll_file_flock for distributed lock semantics. */
4502 struct file_operations ll_file_operations_flock = {
4503 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4504 # ifdef HAVE_SYNC_READ_WRITE
4505 .read = new_sync_read,
4506 .write = new_sync_write,
4507 # endif /* HAVE_SYNC_READ_WRITE */
4508 .read_iter = ll_file_read_iter,
4509 .write_iter = ll_file_write_iter,
4510 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4511 .read = ll_file_read,
4512 .aio_read = ll_file_aio_read,
4513 .write = ll_file_write,
4514 .aio_write = ll_file_aio_write,
4515 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4516 .unlocked_ioctl = ll_file_ioctl,
4517 .open = ll_file_open,
4518 .release = ll_file_release,
4519 .mmap = ll_file_mmap,
4520 .llseek = ll_file_seek,
4521 .splice_read = ll_file_splice_read,
4524 .flock = ll_file_flock,
4525 .lock = ll_file_flock
/* file_operations for -o noflock mounts: .flock/.lock point at
 * ll_file_noflock so userspace flock/fcntl lock calls fail explicitly
 * instead of silently providing no coherence. */
4528 /* These are for -o noflock - to return ENOSYS on flock calls */
4529 struct file_operations ll_file_operations_noflock = {
4530 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4531 # ifdef HAVE_SYNC_READ_WRITE
4532 .read = new_sync_read,
4533 .write = new_sync_write,
4534 # endif /* HAVE_SYNC_READ_WRITE */
4535 .read_iter = ll_file_read_iter,
4536 .write_iter = ll_file_write_iter,
4537 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4538 .read = ll_file_read,
4539 .aio_read = ll_file_aio_read,
4540 .write = ll_file_write,
4541 .aio_write = ll_file_aio_write,
4542 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4543 .unlocked_ioctl = ll_file_ioctl,
4544 .open = ll_file_open,
4545 .release = ll_file_release,
4546 .mmap = ll_file_mmap,
4547 .llseek = ll_file_seek,
4548 .splice_read = ll_file_splice_read,
4551 .flock = ll_file_noflock,
4552 .lock = ll_file_noflock
/* inode_operations for regular files; xattr and ACL entries are gated
 * on the corresponding kernel API availability. */
4555 struct inode_operations ll_file_inode_operations = {
4556 .setattr = ll_setattr,
4557 .getattr = ll_getattr,
4558 .permission = ll_inode_permission,
4559 #ifdef HAVE_IOP_XATTR
4560 .setxattr = ll_setxattr,
4561 .getxattr = ll_getxattr,
4562 .removexattr = ll_removexattr,
4564 .listxattr = ll_listxattr,
4565 .fiemap = ll_fiemap,
4566 #ifdef HAVE_IOP_GET_ACL
4567 .get_acl = ll_get_acl,
4569 #ifdef HAVE_IOP_SET_ACL
4570 .set_acl = ll_set_acl,
/* Push a layout configuration into the cl_object stack for this inode.
 * For OBJECT_CONF_SET, the layout DLM lock is only allowed to match
 * after the layout has been applied (so no stale layout is observable),
 * and the cached layout generation is refreshed from the object. */
4574 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4576 struct ll_inode_info *lli = ll_i2info(inode);
4577 struct cl_object *obj = lli->lli_clob;
4586 env = cl_env_get(&refcheck);
4588 RETURN(PTR_ERR(env));
4590 rc = cl_conf_set(env, lli->lli_clob, conf);
4594 if (conf->coc_opc == OBJECT_CONF_SET) {
4595 struct ldlm_lock *lock = conf->coc_lock;
4596 struct cl_layout cl = {
4600 LASSERT(lock != NULL);
4601 LASSERT(ldlm_has_layout(lock));
4603 /* it can only be allowed to match after layout is
4604 * applied to inode otherwise false layout would be
4605 * seen. Applying layout should happen before dropping
4606 * the intent lock. */
4607 ldlm_lock_allow_match(lock);
4609 rc = cl_object_layout_get(env, obj, &cl);
4614 DFID": layout version change: %u -> %u\n",
4615 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4617 ll_layout_version_set(lli, cl.cl_layout_gen);
4621 cl_env_put(env, &refcheck);
/* Fetch the file layout from the MDT via a getxattr(XATTR_NAME_LOV)
 * RPC and install it as the layout lock's LVB, when the lock was
 * granted via completion AST and therefore carries no LVB data yet. */
4626 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4627 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4630 struct ll_sb_info *sbi = ll_i2sbi(inode);
4631 struct ptlrpc_request *req;
4632 struct mdt_body *body;
4639 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4640 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4641 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
4643 if (lock->l_lvb_data != NULL)
4646 /* if layout lock was granted right away, the layout is returned
4647 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4648 * blocked and then granted via completion ast, we have to fetch
4649 * layout here. Please note that we can't use the LVB buffer in
4650 * completion AST because it doesn't have a large enough buffer */
4651 rc = ll_get_default_mdsize(sbi, &lmmsize);
4653 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4654 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4659 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4661 GOTO(out, rc = -EPROTO);
4663 lmmsize = body->mbo_eadatasize;
4664 if (lmmsize == 0) /* empty layout */
4667 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4669 GOTO(out, rc = -EFAULT);
4671 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4672 if (lvbdata == NULL)
4673 GOTO(out, rc = -ENOMEM);
/* Install the copied layout as LVB under the resource lock; if another
 * thread raced us and already attached one, free our copy instead. */
4675 memcpy(lvbdata, lmm, lmmsize);
4676 lock_res_and_lock(lock);
4677 if (unlikely(lock->l_lvb_data == NULL)) {
4678 lock->l_lvb_type = LVB_T_LAYOUT;
4679 lock->l_lvb_data = lvbdata;
4680 lock->l_lvb_len = lmmsize;
4683 unlock_res_and_lock(lock);
4686 OBD_FREE_LARGE(lvbdata, lmmsize);
4691 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by a granted layout lock to the inode.
 * The layout lock referenced by \a lockh is held on entry and released
 * before return.  If applying the layout fails with -EBUSY (layout
 * still in use by running IO), wait for that IO to drain via an
 * OBJECT_CONF_WAIT configuration pass.
 */
4699 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4700 struct inode *inode)
4702 struct ll_inode_info *lli = ll_i2info(inode);
4703 struct ll_sb_info *sbi = ll_i2sbi(inode);
4704 struct ldlm_lock *lock;
4705 struct cl_object_conf conf;
4708 bool wait_layout = false;
4711 LASSERT(lustre_handle_is_used(lockh));
4713 lock = ldlm_handle2lock(lockh);
4714 LASSERT(lock != NULL);
4715 LASSERT(ldlm_has_layout(lock));
4717 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4718 PFID(&lli->lli_fid), inode);
4720 /* in case this is a caching lock and reinstate with new inode */
4721 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4723 lock_res_and_lock(lock);
4724 lvb_ready = ldlm_is_lvb_ready(lock);
4725 unlock_res_and_lock(lock);
4727 /* checking lvb_ready is racy but this is okay. The worst case is
4728 * that multi processes may configure the file on the same time. */
4732 rc = ll_layout_fetch(inode, lock);
4736 /* for layout lock, lmm is stored in lock's lvb.
4737 * lvb_data is immutable if the lock is held so it's safe to access it
4740 * set layout to file. Unlikely this will fail as old layout was
4741 * surely eliminated */
4742 memset(&conf, 0, sizeof conf);
4743 conf.coc_opc = OBJECT_CONF_SET;
4744 conf.coc_inode = inode;
4745 conf.coc_lock = lock;
4746 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4747 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4748 rc = ll_layout_conf(inode, &conf);
4750 /* refresh layout failed, need to wait */
4751 wait_layout = rc == -EBUSY;
/* Drop our lock reference and the layout lock itself. */
4754 LDLM_LOCK_PUT(lock);
4755 ldlm_lock_decref(lockh, mode);
4757 /* wait for IO to complete if it's still being used. */
4759 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4760 ll_get_fsname(inode->i_sb, NULL, 0),
4761 PFID(&lli->lli_fid), inode);
4763 memset(&conf, 0, sizeof conf);
4764 conf.coc_opc = OBJECT_CONF_WAIT;
4765 conf.coc_inode = inode;
4766 rc = ll_layout_conf(inode, &conf);
4770 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4771 ll_get_fsname(inode->i_sb, NULL, 0),
4772 PFID(&lli->lli_fid), rc);
/*
 * Issue a layout intent RPC to the MDS.
 * \param inode [in] file inode
 * \param intent [in] layout intent (opcode plus optional extent)
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
4785 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4787 struct ll_inode_info *lli = ll_i2info(inode);
4788 struct ll_sb_info *sbi = ll_i2sbi(inode);
4789 struct md_op_data *op_data;
4790 struct lookup_intent it;
4791 struct ptlrpc_request *req;
4795 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4796 0, 0, LUSTRE_OPC_ANY, NULL);
4797 if (IS_ERR(op_data))
4798 RETURN(PTR_ERR(op_data));
4800 op_data->op_data = intent;
4801 op_data->op_data_size = sizeof(*intent);
4803 memset(&it, 0, sizeof(it));
4804 it.it_op = IT_LAYOUT;
/* Write/truncate intents need the lock in write mode. */
4805 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4806 intent->li_opc == LAYOUT_INTENT_TRUNC)
4807 it.it_flags = FMODE_WRITE;
4809 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4810 ll_get_fsname(inode->i_sb, NULL, 0),
4811 PFID(&lli->lli_fid), inode);
4813 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4814 &ll_md_blocking_ast, 0);
4815 if (it.it_request != NULL)
4816 ptlrpc_req_finished(it.it_request);
4817 it.it_request = NULL;
4819 ll_finish_md_op_data(op_data);
4821 /* set lock data in case this is a new lock */
4823 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4825 ll_intent_drop_lock(&it);
/*
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time
 * after this function returns. Any operations that depend on the layout
 * should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version; the caller should save the version number and after IO
 * is finished, this function should be called again to verify that the
 * layout was not changed during IO time.
 */
4843 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4845 struct ll_inode_info *lli = ll_i2info(inode);
4846 struct ll_sb_info *sbi = ll_i2sbi(inode);
4847 struct lustre_handle lockh;
4848 struct layout_intent intent = {
4849 .li_opc = LAYOUT_INTENT_ACCESS,
4851 enum ldlm_mode mode;
/* Fast path: layout locking disabled, or a valid generation cached. */
4855 *gen = ll_layout_version_get(lli);
4856 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4860 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4861 LASSERT(S_ISREG(inode->i_mode));
4863 /* take layout lock mutex to enqueue layout lock exclusively. */
4864 mutex_lock(&lli->lli_layout_mutex);
4867 /* mostly layout lock is caching on the local side, so try to
4868 * match it before grabbing layout lock mutex. */
4869 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4870 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4871 if (mode != 0) { /* hit cached lock */
4872 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Miss: enqueue a new layout lock via an ACCESS intent RPC. */
4878 rc = ll_layout_intent(inode, &intent);
4884 *gen = ll_layout_version_get(lli);
4885 mutex_unlock(&lli->lli_layout_mutex);
/*
 * Issue layout intent RPC indicating where in a file an IO is about to
 * write.
 *
 * \param[in] inode file inode.
 * \param[in] ext write range with start offset of file in bytes where
 * an IO is about to write, and exclusive end offset in
 * bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
4901 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
4902 struct lu_extent *ext)
4904 struct layout_intent intent = {
4906 .li_extent.e_start = ext->e_start,
4907 .li_extent.e_end = ext->e_end,
4912 rc = ll_layout_intent(inode, &intent);
4918 * This function sends a restore request to the MDT
4920 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4922 struct hsm_user_request *hur;
4926 len = sizeof(struct hsm_user_request) +
4927 sizeof(struct hsm_user_item);
4928 OBD_ALLOC(hur, len);
4932 hur->hur_request.hr_action = HUA_RESTORE;
4933 hur->hur_request.hr_archive_id = 0;
4934 hur->hur_request.hr_flags = 0;
4935 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4936 sizeof(hur->hur_user_item[0].hui_fid));
4937 hur->hur_user_item[0].hui_extent.offset = offset;
4938 hur->hur_user_item[0].hui_extent.length = length;
4939 hur->hur_request.hr_itemcount = 1;
4940 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,