4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate one per-open-file ll_file_data from its slab cache.
 * GFP_NOFS prevents reclaim from re-entering the filesystem.
 * NOTE(review): this view is truncated -- the allocation-failure check
 * and the RETURN of fd are not visible here; confirm against full file. */
62 static struct ll_file_data *ll_file_data_get(void)
64 struct ll_file_data *fd;
66 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start each descriptor with a clean async-write failure state */
70 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
75 static void ll_file_data_put(struct ll_file_data *fd)
78 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
82 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85 struct obd_client_handle *och)
/* initialize op_data generically for this inode before overriding */
89 ll_prep_md_op_data(op_data, inode, NULL, NULL,
90 0, 0, LUSTRE_OPC_ANY, NULL);
/* snapshot the client's view of the inode so the MDT can merge it */
92 op_data->op_attr.ia_mode = inode->i_mode;
93 op_data->op_attr.ia_atime = inode->i_atime;
94 op_data->op_attr.ia_mtime = inode->i_mtime;
95 op_data->op_attr.ia_ctime = inode->i_ctime;
96 op_data->op_attr.ia_size = i_size_read(inode);
/* mark every snapshotted field valid; *_SET forces exact timestamps */
97 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
98 ATTR_MTIME | ATTR_MTIME_SET |
99 ATTR_CTIME | ATTR_CTIME_SET;
100 op_data->op_attr_blocks = inode->i_blocks;
101 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* identify which open handle this CLOSE applies to */
102 op_data->op_handle = och->och_fh;
104 if (och->och_flags & FMODE_WRITE &&
105 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
106 /* For HSM: if inode data has been modified, pack it so that
107 * MDT can set data dirty flag in the archive. */
108 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
114 * Perform a close, possibly with a bias.
115 * The meaning of "data" depends on the value of "bias".
 *
117 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
118 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with (NOTE(review): tail of this sentence is missing in this
 * view of the file).
 */
121 static int ll_close_inode_openhandle(struct inode *inode,
122 struct obd_client_handle *och,
123 enum mds_op_bias bias, void *data)
125 struct obd_export *md_exp = ll_i2mdexp(inode);
126 const struct ll_inode_info *lli = ll_i2info(inode);
127 struct md_op_data *op_data;
128 struct ptlrpc_request *req = NULL;
/* without a live MDC connection there is nobody to send the close to */
132 if (class_exp2obd(md_exp) == NULL) {
133 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
134 ll_get_fsname(inode->i_sb, NULL, 0),
135 PFID(&lli->lli_fid));
139 OBD_ALLOC_PTR(op_data);
140 /* We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either. */
143 GOTO(out, rc = -ENOMEM);
145 ll_prepare_close(inode, op_data, och);
/* bias-specific packing; NOTE(review): the switch(bias) line itself is
 * not visible in this truncated view */
147 case MDS_CLOSE_LAYOUT_MERGE:
148 /* merge blocks from the victim inode */
149 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
150 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough: MERGE reuses the SWAP packing below */
151 case MDS_CLOSE_LAYOUT_SWAP:
152 LASSERT(data != NULL);
153 op_data->op_bias |= bias;
154 op_data->op_data_version = 0;
155 op_data->op_lease_handle = och->och_lease_handle;
/* fid2 identifies the peer inode whose layout is swapped/merged */
156 op_data->op_fid2 = *ll_inode2fid(data);
159 case MDS_CLOSE_RESYNC_DONE: {
160 struct ll_ioc_lease *ioc = data;
162 LASSERT(data != NULL);
163 op_data->op_attr_blocks +=
164 ioc->lil_count * op_data->op_attr_blocks;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
166 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
168 op_data->op_lease_handle = och->och_lease_handle;
/* ship the array of mirror ids that finished resync */
169 op_data->op_data = &ioc->lil_ids[0];
170 op_data->op_data_size =
171 ioc->lil_count * sizeof(ioc->lil_ids[0]);
175 case MDS_HSM_RELEASE:
176 LASSERT(data != NULL);
177 op_data->op_bias |= MDS_HSM_RELEASE;
/* data version guards against releasing a file modified meanwhile */
178 op_data->op_data_version = *(__u64 *)data;
179 op_data->op_lease_handle = och->och_lease_handle;
180 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default: a plain close carries no auxiliary data */
184 LASSERT(data == NULL);
188 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is expected when the app is signalled; don't spam the log */
189 if (rc != 0 && rc != -EINTR)
190 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
191 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* verify the server actually executed the requested close intent */
193 if (rc == 0 && op_data->op_bias & bias) {
194 struct mdt_body *body;
196 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
197 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
201 ll_finish_md_op_data(op_data);
/* handle is dead from here on; poison the cookie to catch reuse */
205 md_clear_open_replay_data(md_exp, och);
206 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
209 ptlrpc_req_finished(req); /* This is close request */
/* Drop one reference on the shared MDS open handle that matches @fmode
 * and, when this was the last user, close the handle on the MDT. */
213 int ll_md_real_close(struct inode *inode, fmode_t fmode)
215 struct ll_inode_info *lli = ll_i2info(inode);
216 struct obd_client_handle **och_p;
217 struct obd_client_handle *och;
/* pick the per-mode handle/use-count pair; write > exec > read */
222 if (fmode & FMODE_WRITE) {
223 och_p = &lli->lli_mds_write_och;
224 och_usecount = &lli->lli_open_fd_write_count;
225 } else if (fmode & FMODE_EXEC) {
226 och_p = &lli->lli_mds_exec_och;
227 och_usecount = &lli->lli_open_fd_exec_count;
229 LASSERT(fmode & FMODE_READ);
230 och_p = &lli->lli_mds_read_och;
231 och_usecount = &lli->lli_open_fd_read_count;
234 mutex_lock(&lli->lli_och_mutex);
235 if (*och_usecount > 0) {
236 /* There are still users of this handle, so skip
 * closing it (NOTE(review): remainder of the comment
 * and the early-return path are truncated in this view). */
238 mutex_unlock(&lli->lli_och_mutex);
244 mutex_unlock(&lli->lli_och_mutex);
247 /* There might be a race and this handle may already
 * be closed (NOTE(review): comment tail truncated). */
249 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: clean up group lock, lease and private
 * open handle, drop the fd's reference on the shared MDS open handle,
 * and free the ll_file_data. Talks to the MDS only when no cached OPEN
 * lock allows skipping the RPC. */
255 static int ll_md_close(struct inode *inode, struct file *file)
257 union ldlm_policy_data policy = {
258 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted lock, take no ref */
260 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
261 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
262 struct ll_inode_info *lli = ll_i2info(inode);
263 struct lustre_handle lockh;
264 enum ldlm_mode lockmode;
268 /* clear group lock, if present */
269 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
270 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
272 if (fd->fd_lease_och != NULL) {
275 /* Usually the lease is not released when the
276 * application crashed, we need to release here. */
277 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
278 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
279 PFID(&lli->lli_fid), rc, lease_broken);
281 fd->fd_lease_och = NULL;
/* fd_och holds a private open handle taken over for a lease */
284 if (fd->fd_och != NULL) {
285 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
290 /* Let's see if we have good enough OPEN lock on the file and if
291 we can skip talking to MDS */
292 mutex_lock(&lli->lli_och_mutex);
/* drop this fd's contribution to the per-mode open counts;
 * NOTE(review): the lines choosing `lockmode` per branch are missing
 * from this truncated view */
293 if (fd->fd_omode & FMODE_WRITE) {
295 LASSERT(lli->lli_open_fd_write_count);
296 lli->lli_open_fd_write_count--;
297 } else if (fd->fd_omode & FMODE_EXEC) {
299 LASSERT(lli->lli_open_fd_exec_count);
300 lli->lli_open_fd_exec_count--;
303 LASSERT(lli->lli_open_fd_read_count);
304 lli->lli_open_fd_read_count--;
306 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN ibits lock => must do the real close RPC */
308 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
309 LDLM_IBITS, &policy, lockmode, &lockh))
310 rc = ll_md_real_close(inode, fd->fd_omode);
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
319 /* While this returns an error code, fput() the caller does not, so we need
320 * to make every effort to clean up all of our state here. Also, applications
321 * rarely check close errors and even if an error is returned they will not
322 * re-try the close call.
 */
324 int ll_file_release(struct inode *inode, struct file *file)
326 struct ll_file_data *fd;
327 struct ll_sb_info *sbi = ll_i2sbi(inode);
328 struct ll_inode_info *lli = ll_i2info(inode);
332 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
333 PFID(ll_inode2fid(inode)), inode);
/* don't account releases of the root dentry in the stats */
335 if (inode->i_sb->s_root != file_dentry(file))
336 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
337 fd = LUSTRE_FPRIVATE(file);
340 /* The last ref on @file, maybe not the the owner pid of statahead,
341 * because parent and child process can share the same file handle. */
342 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
343 ll_deauthorize_statahead(inode, fd);
/* root has no MDS open handle: just free the fd and return */
345 if (inode->i_sb->s_root == file_dentry(file)) {
346 LUSTRE_FPRIVATE(file) = NULL;
347 ll_file_data_put(fd);
/* propagate any async write error recorded against the object */
351 if (!S_ISDIR(inode->i_mode)) {
352 if (lli->lli_clob != NULL)
353 lov_read_and_clear_async_rc(lli->lli_clob);
354 lli->lli_async_rc = 0;
357 rc = ll_md_close(inode, file);
/* fault-injection hook: dump debug log on demand */
359 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
360 libcfs_debug_dumplog();
/* Send an intent-open RPC to the MDS for @de and install the resulting
 * lock/inode state. @lmm/@lmmsize optionally carry a layout to pass to
 * md_open(). Returns 0 or a negative errno. */
365 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
366 struct lookup_intent *itp)
368 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
369 struct dentry *parent = de->d_parent;
370 const char *name = NULL;
372 struct md_op_data *op_data;
373 struct ptlrpc_request *req = NULL;
377 LASSERT(parent != NULL);
378 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
380 /* if server supports open-by-fid, or file name is invalid, don't pack
381 * name in open request */
382 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
383 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
384 name = de->d_name.name;
385 len = de->d_name.len;
388 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
389 name, len, 0, LUSTRE_OPC_ANY, NULL);
391 RETURN(PTR_ERR(op_data));
392 op_data->op_data = lmm;
393 op_data->op_data_size = lmmsize;
395 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
396 &ll_md_blocking_ast, 0);
397 ll_finish_md_op_data(op_data);
399 /* reason for keep own exit path - don`t flood log
400 * with messages with -ESTALE errors.
 */
/* server opened the file but the intent carries an open error:
 * release the now-useless open handle */
402 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
403 it_open_error(DISP_OPEN_OPEN, itp))
405 ll_release_openhandle(de, itp);
409 if (it_disposition(itp, DISP_LOOKUP_NEG))
410 GOTO(out, rc = -ENOENT);
412 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
413 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
414 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* refresh the inode from the reply and attach lock data */
418 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
419 if (!rc && itp->it_lock_mode)
420 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
423 ptlrpc_req_finished(req);
424 ll_intent_drop_lock(itp);
426 /* We did open by fid, but by the time we got to the server,
427 * the object disappeared. If this is a create, we cannot really
428 * tell the userspace that the file it was trying to create
429 * does not exist. Instead let's return -ESTALE, and the VFS will
430 * retry the create with LOOKUP_REVAL that we are going to catch
431 * in ll_revalidate_dentry() and use lookup then.
 */
433 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Fill @och from the mdt_body in the intent's reply and register it
 * for open replay, so the open survives MDS recovery. */
439 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
440 struct obd_client_handle *och)
442 struct mdt_body *body;
444 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
445 och->och_fh = body->mbo_handle;
446 och->och_fid = body->mbo_fid1;
/* remember the lock handle in case this open is turned into a lease */
447 och->och_lease_handle.cookie = it->it_lock_handle;
448 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
449 och->och_flags = it->it_flags;
451 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from
 * the intent, then attach @fd to the struct file and initialize its
 * readahead and cl-context state. */
454 static int ll_local_open(struct file *file, struct lookup_intent *it,
455 struct ll_file_data *fd, struct obd_client_handle *och)
457 struct inode *inode = file_inode(file);
460 LASSERT(!LUSTRE_FPRIVATE(file));
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
472 LUSTRE_FPRIVATE(file) = fd;
473 ll_readahead_init(inode, &fd->fd_ras);
/* record which access modes this descriptor was granted */
474 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
476 /* ll_cl_context initialize */
477 rwlock_init(&fd->fd_lock);
478 INIT_LIST_HEAD(&fd->fd_lccs);
483 /* Open a file, and (for the very first open) create objects on the OSTs at
484 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
485 * creation or open until ll_lov_setstripe() ioctl is called.
 *
487 * If we already have the stripe MD locally then we don't request it in
488 * md_open(), by passing a lmm_size = 0.
 *
490 * It is up to the application to ensure no other processes open this file
491 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
492 * used. We might be able to avoid races of that sort by getting lli_open_sem
493 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
494 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
496 int ll_file_open(struct inode *inode, struct file *file)
498 struct ll_inode_info *lli = ll_i2info(inode);
499 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
500 .it_flags = file->f_flags };
501 struct obd_client_handle **och_p = NULL;
502 __u64 *och_usecount = NULL;
503 struct ll_file_data *fd;
507 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
508 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* an intent stashed by the lookup path rides in private_data */
510 it = file->private_data; /* XXX: compat macro */
511 file->private_data = NULL; /* prevent ll_local_open assertion */
513 fd = ll_file_data_get();
515 GOTO(out_openerr, rc = -ENOMEM);
518 if (S_ISDIR(inode->i_mode))
519 ll_authorize_statahead(inode, fd);
/* opening the root needs no MDS open handle */
521 if (inode->i_sb->s_root == file_dentry(file)) {
522 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: build one from f_flags */
526 if (!it || !it->it_disposition) {
527 /* Convert f_flags into access mode. We cannot use file->f_mode,
528 * because everything but O_ACCMODE mask was stripped from
 * it (NOTE(review): comment tail truncated in this view). */
/* O_RDONLY/O_WRONLY/O_RDWR +1 maps to FMODE_READ/FMODE_WRITE bits */
530 if ((oit.it_flags + 1) & O_ACCMODE)
532 if (file->f_flags & O_TRUNC)
533 oit.it_flags |= FMODE_WRITE;
535 /* kernel only call f_op->open in dentry_open. filp_open calls
536 * dentry_open after call to open_namei that checks permissions.
537 * Only nfsd_open call dentry_open directly without checking
538 * permissions and because of that this code below is safe. */
539 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
540 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
542 /* We do not want O_EXCL here, presumably we opened the file
543 * already? XXX - NFS implications? */
544 oit.it_flags &= ~O_EXCL;
546 /* bug20584, if "it_flags" contains O_CREAT, the file will be
547 * created if necessary, then "IT_CREAT" should be set to keep
548 * consistent with it */
549 if (oit.it_flags & O_CREAT)
550 oit.it_op |= IT_CREAT;
556 /* Let's see if we have file open on MDS already. */
557 if (it->it_flags & FMODE_WRITE) {
558 och_p = &lli->lli_mds_write_och;
559 och_usecount = &lli->lli_open_fd_write_count;
560 } else if (it->it_flags & FMODE_EXEC) {
561 och_p = &lli->lli_mds_exec_och;
562 och_usecount = &lli->lli_open_fd_exec_count;
564 och_p = &lli->lli_mds_read_och;
565 och_usecount = &lli->lli_open_fd_read_count;
568 mutex_lock(&lli->lli_och_mutex);
569 if (*och_p) { /* Open handle is present */
570 if (it_disposition(it, DISP_OPEN_OPEN)) {
571 /* Well, there's extra open request that we do not need,
572 let's close it somehow. This will decref request. */
573 rc = it_open_error(DISP_OPEN_OPEN, it);
575 mutex_unlock(&lli->lli_och_mutex);
576 GOTO(out_openerr, rc);
579 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle; NULL och => don't fill a new one */
583 rc = ll_local_open(file, it, fd, NULL);
586 mutex_unlock(&lli->lli_och_mutex);
587 GOTO(out_openerr, rc);
590 LASSERT(*och_usecount == 0);
591 if (!it->it_disposition) {
592 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
593 /* We cannot just request lock handle now, new ELC code
594 means that one of other OPEN locks for this file
595 could be cancelled, and since blocking ast handler
596 would attempt to grab och_mutex as well, that would
597 result in a deadlock */
598 mutex_unlock(&lli->lli_och_mutex);
/* (original comment, partially truncated in this view:)
600 * Normally called under two situations:
 * 1. (missing line)
602 * 2. A race/condition on MDS resulting in no open
603 * handle to be returned from LOOKUP|OPEN request,
604 * for example if the target entry was a symlink.
 *
606 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
607 * marked by a bit set in ll_iget_for_nfs. Clear the
608 * bit so that it's not confusing later callers.
 *
610 * NB; when ldd is NULL, it must have come via normal
611 * lookup path only, since ll_iget_for_nfs always calls
 * ll_d_init(). (NOTE(review): tail reconstructed/truncated)
 */
614 if (ldd && ldd->lld_nfs_dentry) {
615 ldd->lld_nfs_dentry = 0;
616 it->it_flags |= MDS_OPEN_LOCK;
/*
620 * Always specify MDS_OPEN_BY_FID because we don't want
621 * to get file with different fid.
 */
623 it->it_flags |= MDS_OPEN_BY_FID;
624 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 /* NOTE(review): second argument line missing in this view */
627 GOTO(out_openerr, rc);
631 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
633 GOTO(out_och_free, rc = -ENOMEM);
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
 */
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 GOTO(out_och_free, rc);
646 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
647 "inode %p: disposition %x, status %d\n", inode,
648 it_disposition(it, ~0), it->it_status);
650 rc = ll_local_open(file, it, fd, *och_p);
652 GOTO(out_och_free, rc);
654 mutex_unlock(&lli->lli_och_mutex);
657 /* Must do this outside lli_och_mutex lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
661 GOTO(out_och_free, rc);
663 cl_lov_delay_create_clear(&file->f_flags);
664 GOTO(out_och_free, rc);
/* error unwind: free a half-installed open handle */
668 if (och_p && *och_p) {
669 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
670 *och_p = NULL; /* OBD_FREE writes some magic there */
673 mutex_unlock(&lli->lli_och_mutex);
676 if (lli->lli_opendir_key == fd)
677 ll_deauthorize_statahead(inode, fd);
679 ll_file_data_put(fd);
681 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the extra request reference the intent may still hold */
684 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
685 ptlrpc_req_finished(it->it_request);
686 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for a lease lock: on conflict, cancel the lease lock
 * asynchronously (the lease is thereby "broken"); no openhandle
 * cleanup is done here -- see ll_lease_open() for why. */
692 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
693 struct ldlm_lock_desc *desc, void *data, int flag)
696 struct lustre_handle lockh;
700 case LDLM_CB_BLOCKING:
701 ldlm_lock2handle(lock, &lockh);
702 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
704 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
708 case LDLM_CB_CANCELING:
/*
716 * When setting a lease on a file, we take ownership of the lli_mds_*_och
717 * and save it as fd->fd_och so as to force client to reopen the file even
718 * if it has an open lock in cache already.
 */
720 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
721 struct lustre_handle *old_handle)
723 struct ll_inode_info *lli = ll_i2info(inode);
724 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
725 struct obd_client_handle **och_p;
730 /* Get the openhandle of the file */
731 mutex_lock(&lli->lli_och_mutex);
/* only one lease per descriptor */
732 if (fd->fd_lease_och != NULL)
733 GOTO(out_unlock, rc = -EBUSY);
735 if (fd->fd_och == NULL) {
736 if (file->f_mode & FMODE_WRITE) {
737 LASSERT(lli->lli_mds_write_och != NULL);
738 och_p = &lli->lli_mds_write_och;
739 och_usecount = &lli->lli_open_fd_write_count;
741 LASSERT(lli->lli_mds_read_och != NULL);
742 och_p = &lli->lli_mds_read_och;
743 och_usecount = &lli->lli_open_fd_read_count;
/* can't take ownership while other descriptors share the handle */
746 if (*och_usecount > 1)
747 GOTO(out_unlock, rc = -EBUSY);
/* hand the MDT the old handle so it knows we're the same owner */
754 *old_handle = fd->fd_och->och_fh;
758 mutex_unlock(&lli->lli_och_mutex);
/*
763 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
765 static int ll_lease_och_release(struct inode *inode, struct file *file)
767 struct ll_inode_info *lli = ll_i2info(inode);
768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
769 struct obd_client_handle **och_p;
770 struct obd_client_handle *old_och = NULL;
775 mutex_lock(&lli->lli_och_mutex);
776 if (file->f_mode & FMODE_WRITE) {
777 och_p = &lli->lli_mds_write_och;
778 och_usecount = &lli->lli_open_fd_write_count;
780 och_p = &lli->lli_mds_read_och;
781 och_usecount = &lli->lli_open_fd_read_count;
784 /* The file may have been open by another process (broken lease) so
785 * *och_p is not NULL. In this case we should simply increase usecount
 * and close fd_och (NOTE(review): comment tail truncated here). */
788 if (*och_p != NULL) {
789 old_och = fd->fd_och;
796 mutex_unlock(&lli->lli_och_mutex);
/* close the superfluous private handle outside the mutex */
799 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
/*
805 * Acquire a lease and open the file.
 * Returns the new obd_client_handle on success or ERR_PTR on failure.
 */
807 static struct obd_client_handle *
808 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 /* NOTE(review): the open_flags parameter line is missing in this view */
811 struct lookup_intent it = { .it_op = IT_OPEN };
812 struct ll_sb_info *sbi = ll_i2sbi(inode);
813 struct md_op_data *op_data;
814 struct ptlrpc_request *req = NULL;
815 struct lustre_handle old_handle = { 0 };
816 struct obd_client_handle *och = NULL;
/* lease must be exactly read or exactly write */
821 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
822 RETURN(ERR_PTR(-EINVAL));
/* requested mode must be a subset of the file's open mode */
825 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
826 RETURN(ERR_PTR(-EPERM));
828 rc = ll_lease_och_acquire(inode, file, &old_handle);
835 RETURN(ERR_PTR(-ENOMEM));
837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
838 LUSTRE_OPC_ANY, NULL);
840 GOTO(out, rc = PTR_ERR(op_data));
842 /* To tell the MDT this openhandle is from the same owner */
843 op_data->op_handle = old_handle;
845 it.it_flags = fmode | open_flags;
846 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
847 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
848 &ll_md_blocking_lease_ast,
849 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
850 * it can be cancelled which may mislead applications that the lease is
 * still valid. (NOTE(review): comment line missing here.)
852 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
853 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
854 * doesn't deal with openhandle, so normal openhandle will be leaked. */
855 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
856 ll_finish_md_op_data(op_data);
857 ptlrpc_req_finished(req);
859 GOTO(out_release_it, rc);
861 if (it_disposition(&it, DISP_LOOKUP_NEG))
862 GOTO(out_release_it, rc = -ENOENT);
864 rc = it_open_error(DISP_OPEN_OPEN, &it);
866 GOTO(out_release_it, rc);
868 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
869 ll_och_fill(sbi->ll_md_exp, &it, och);
871 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
872 GOTO(out_close, rc = -EOPNOTSUPP);
874 /* already get lease, handle lease lock */
875 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
876 if (it.it_lock_mode == 0 ||
877 it.it_lock_bits != MDS_INODELOCK_OPEN) {
878 /* open lock must return for lease */
879 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
880 PFID(ll_inode2fid(inode)), it.it_lock_mode,
 /* NOTE(review): it_lock_bits argument line missing in this view */
882 GOTO(out_close, rc = -EPROTO);
885 ll_intent_release(&it);
889 /* Cancel open lock */
890 if (it.it_lock_mode != 0) {
891 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 /* NOTE(review): lock-mode argument line missing in this view */
894 och->och_lease_handle.cookie = 0ULL;
896 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
898 CERROR("%s: error closing file "DFID": %d\n",
899 ll_get_fsname(inode->i_sb, NULL, 0),
900 PFID(&ll_i2info(inode)->lli_fid), rc2);
901 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
903 ll_intent_release(&it);
/*
911 * Check whether a layout swap can be done between two inodes.
 *
913 * \param[in] inode1 First inode to check
914 * \param[in] inode2 Second inode to check
 *
916 * \retval 0 on success, layout swap can be performed between both inodes
917 * \retval negative error code if requirements are not met
 */
919 static int ll_check_swap_layouts_validity(struct inode *inode1,
920 struct inode *inode2)
/* layout swap only makes sense for regular files */
922 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* caller must be allowed to write both files */
925 if (inode_permission(inode1, MAY_WRITE) ||
926 inode_permission(inode2, MAY_WRITE))
/* both inodes must live on the same Lustre filesystem */
929 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a layout swap/merge bias so the MDT atomically
 * exchanges (or merges) layouts between @inode and @inode2 as part of
 * the close. Consumes @och on success. */
935 static int ll_swap_layouts_close(struct obd_client_handle *och,
936 struct inode *inode, struct inode *inode2,
 /* NOTE(review): the intent/opc parameter line is missing in this view */
939 const struct lu_fid *fid1 = ll_inode2fid(inode);
940 const struct lu_fid *fid2;
941 enum mds_op_bias bias;
945 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
946 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
948 rc = ll_check_swap_layouts_validity(inode, inode2);
950 GOTO(out_free_och, rc);
952 /* We now know that inode2 is a lustre inode */
953 fid2 = ll_inode2fid(inode2);
/* swapping a file with itself is meaningless */
955 rc = lu_fid_cmp(fid1, fid2);
957 GOTO(out_free_och, rc = -EINVAL);
960 case SWAP_LAYOUTS_CLOSE:
961 bias = MDS_CLOSE_LAYOUT_SWAP;
963 case MERGE_LAYOUTS_CLOSE:
964 bias = MDS_CLOSE_LAYOUT_MERGE;
967 GOTO(out_free_och, rc = -EOPNOTSUPP);
970 /* Close the file and {swap,merge} layouts between inode & inode2.
971 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
972 * because we still need it to pack l_remote_handle to MDT. */
973 rc = ll_close_inode_openhandle(inode, och, bias, inode2);
975 och = NULL; /* freed in ll_close_inode_openhandle() */
/*
985 * Release lease and close the file.
986 * It will check if the lease has ever broken.
 */
988 static int ll_lease_close_intent(struct obd_client_handle *och,
 /* NOTE(review): the inode parameter line is missing in this view */
990 bool *lease_broken, enum mds_op_bias bias,
 /* NOTE(review): the data parameter line is missing in this view */
993 struct ldlm_lock *lock;
994 bool cancelled = true;
/* a still-held lease lock means the lease was never broken */
998 lock = ldlm_handle2lock(&och->och_lease_handle);
1000 lock_res_and_lock(lock);
1001 cancelled = ldlm_is_cancel(lock);
1002 unlock_res_and_lock(lock);
1003 LDLM_LOCK_PUT(lock);
1006 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1007 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1009 if (lease_broken != NULL)
1010 *lease_broken = cancelled;
/* intact lease with no intent: cancel the lease lock ourselves */
1012 if (!cancelled && !bias)
1013 ldlm_cli_cancel(&och->och_lease_handle, 0);
1015 if (cancelled) { /* no need to excute intent */
1020 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close @och with no extra close intent. */
1024 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 /* NOTE(review): lease_broken parameter line missing in this view */
1027 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
/*
1031 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
 * to start mirror resynchronization for the file.
 */
1033 static int ll_lease_file_resync(struct obd_client_handle *och,
1034 struct inode *inode)
1036 struct ll_sb_info *sbi = ll_i2sbi(inode);
1037 struct md_op_data *op_data;
1038 __u64 data_version_unused;
1042 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1043 LUSTRE_OPC_ANY, NULL);
1044 if (IS_ERR(op_data))
1045 RETURN(PTR_ERR(op_data));
1047 /* before starting file resync, it's necessary to clean up page cache
1048 * in client memory, otherwise once the layout version is increased,
1049 * writing back cached data will be denied the OSTs. */
1050 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* the lease handle authorizes the resync request on the MDT */
1054 op_data->op_handle = och->och_lease_handle;
1055 rc = md_file_resync(sbi->ll_md_exp, op_data);
1061 ll_finish_md_op_data(op_data);
/* Merge MDS-supplied inode attributes with the size/blocks/timestamps
 * reported by the OSTs via the cl_object layer, taking the newest of
 * each timestamp. Runs under the inode size lock. */
1065 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1067 struct ll_inode_info *lli = ll_i2info(inode);
1068 struct cl_object *obj = lli->lli_clob;
1069 struct cl_attr *attr = vvp_env_thread_attr(env);
1077 ll_inode_size_lock(inode);
1079 /* Merge timestamps the most recently obtained from MDS with
1080 * timestamps obtained from OSTs.
 *
1082 * Do not overwrite atime of inode because it may be refreshed
1083 * by file_accessed() function. If the read was served by cache
1084 * data, there is no RPC to be sent so that atime may not be
1085 * transferred to OSTs at all. MDT only updates atime at close time
1086 * if it's at least 'mdd.*.atime_diff' older.
1087 * All in all, the atime in Lustre does not strictly comply with
1088 * POSIX. Solving this problem needs to send an RPC to MDT for each
1089 * read, this will hurt performance. */
1090 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1091 LTIME_S(inode->i_atime) = lli->lli_atime;
1092 lli->lli_update_atime = 0;
1094 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1095 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* working copies of the MDS-side timestamps, to merge with OST data */
1097 atime = LTIME_S(inode->i_atime);
1098 mtime = LTIME_S(inode->i_mtime);
1099 ctime = LTIME_S(inode->i_ctime);
1101 cl_object_attr_lock(obj);
1102 rc = cl_object_attr_get(env, obj, attr);
1103 cl_object_attr_unlock(obj);
/* -ENODATA (e.g. no layout yet) is not an error for this merge */
1106 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* keep the most recent of each MDS/OST timestamp */
1108 if (atime < attr->cat_atime)
1109 atime = attr->cat_atime;
1111 if (ctime < attr->cat_ctime)
1112 ctime = attr->cat_ctime;
1114 if (mtime < attr->cat_mtime)
1115 mtime = attr->cat_mtime;
1117 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1118 PFID(&lli->lli_fid), attr->cat_size);
/* size and blocks are authoritative from the OSTs */
1120 i_size_write(inode, attr->cat_size);
1121 inode->i_blocks = attr->cat_blocks;
1123 LTIME_S(inode->i_atime) = atime;
1124 LTIME_S(inode->i_mtime) = mtime;
1125 LTIME_S(inode->i_ctime) = ctime;
1128 ll_inode_size_unlock(inode);
/*
1134 * Set designated mirror for I/O.
 *
1136 * So far only read, write, and truncated can support to issue I/O to
1137 * designated mirror.
 */
1139 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1143 /* clear layout version for generic(non-resync) I/O in case it carries
1144 * stale layout version due to I/O restart */
1145 io->ci_layout_version = 0;
1147 /* FLR: disable non-delay for designated mirror I/O because obviously
1148 * only one mirror is available */
1149 if (fd->fd_designated_mirror > 0) {
1151 io->ci_designated_mirror = fd->fd_designated_mirror;
1152 io->ci_layout_version = fd->fd_layout_version;
1153 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
 * through parallel tasks (NOTE(review): comment tail truncated) */
1157 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1158 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates should be skipped for @file, mirroring
 * the kernel's touch_atime()/file_accessed() checks (O_NOATIME, inode
 * and mount flags, read-only mount, nodiratime on directories). */
1161 static bool file_is_noatime(const struct file *file)
1163 const struct vfsmount *mnt = file->f_path.mnt;
1164 const struct inode *inode = file_inode((struct file *)file);
1166 /* Adapted from file_accessed() and touch_atime().*/
1167 if (file->f_flags & O_NOATIME)
1170 if (inode->i_flags & S_NOATIME)
1173 if (IS_NOATIME(inode))
1176 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1179 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1182 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1188 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: wire up the iocb,
 * locking policy, atime suppression, parallel-I/O and FLR mirror
 * settings from the file's flags and fs options. */
1190 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1192 struct inode *inode = file_inode(file);
1193 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1195 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1196 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1197 io->u.ci_rw.rw_file = file;
1198 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1199 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1200 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1202 if (iot == CIT_WRITE) {
1203 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* sync semantics for O_SYNC/O_DIRECT (third condition truncated
 * from this view -- presumably IS_SYNC(inode); confirm) */
1204 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1205 file->f_flags & O_DIRECT ||
1208 io->ci_obj = ll_i2info(inode)->lli_clob;
1209 io->ci_lockreq = CILR_MAYBE;
1210 if (ll_file_nolock(file)) {
1211 io->ci_lockreq = CILR_NEVER;
1212 io->ci_no_srvlock = 1;
1213 } else if (file->f_flags & O_APPEND) {
/* appends must serialize through a mandatory lock */
1214 io->ci_lockreq = CILR_MANDATORY;
1216 io->ci_noatime = file_is_noatime(file);
/* parallel I/O only when enabled for the fs and not appending */
1217 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1218 io->ci_pio = !io->u.ci_rw.rw_append;
1222 /* FLR: only use non-delay I/O for read as there is only one
1223 * avaliable mirror for write. */
1224 io->ci_ndelay = !(iot == CIT_WRITE);
1226 ll_io_set_mirror(io, file);
/*
 * Body of one parallel IO task (cfs_ptask) created for a split read/write.
 * Re-runs the subrange described by the cl_io_pt through the normal cl_io
 * machinery with ci_pio disabled (no recursive splitting), then records
 * progress (cip_result, advanced iter, updated iocb) and whether the IO
 * needs a restart.  Returns 0 if any bytes completed, else the error.
 */
1229 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1231 struct cl_io_pt *pt = ptask->pt_cbdata;
1232 struct file *file = pt->cip_file;
1235 loff_t pos = pt->cip_pos;
1240 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1241 file_dentry(file)->d_name.name,
1242 pt->cip_iot == CIT_READ ? "read" : "write",
1243 pos, pos + pt->cip_count);
1245 env = cl_env_get(&refcheck);
1247 RETURN(PTR_ERR(env));
1249 io = vvp_env_thread_io(env);
1250 ll_io_init(io, file, pt->cip_iot);
/* Work on private copies of the iter/iocb held in the ptask descriptor. */
1251 io->u.ci_rw.rw_iter = pt->cip_iter;
1252 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1253 io->ci_pio = 0; /* It's already in parallel task */
/* Only the still-unfinished part of the range is issued. */
1255 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1256 pt->cip_count - pt->cip_result);
1258 struct vvp_io *vio = vvp_env_io(env);
1260 vio->vui_io_subtype = IO_NORMAL;
1261 vio->vui_fd = LUSTRE_FPRIVATE(file);
1263 ll_cl_add(file, env, io, LCC_RW);
1264 rc = cl_io_loop(env, io);
1265 ll_cl_remove(file, env);
1267 /* cl_io_rw_init() handled IO */
/* Fault-injection hook for testing ptask IO failure handling. */
1271 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1277 if (io->ci_nob > 0) {
/* Account completed bytes and advance the saved iterator/iocb. */
1278 pt->cip_result += io->ci_nob;
1279 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1281 pt->cip_iocb.ki_pos = pos;
1282 #ifdef HAVE_KIOCB_KI_LEFT
1283 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1284 #elif defined(HAVE_KI_NBYTES)
1285 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1289 cl_io_fini(env, io);
1290 cl_env_put(env, &refcheck);
/* Propagate layout-change restart requests to the parent IO. */
1292 pt->cip_need_restart = io->ci_need_restart;
1294 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1295 file_dentry(file)->d_name.name,
1296 pt->cip_iot == CIT_READ ? "read" : "write",
1297 pt->cip_result, rc);
1299 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for buffered/direct reads and writes (and splice reads).
 * Sets up a cl_io via ll_io_init(), takes the per-inode range lock for
 * writes and O_DIRECT reads (LU-6227), runs cl_io_loop(), and repeats the
 * whole cycle while the IO reports ci_need_restart (e.g. layout change or
 * FLR mirror retry), accumulating the partial result.  On exit, updates
 * read/write stats and fd_write_failed, and stores the final position in
 * *ppos.  Returns bytes transferred, or a negative error.
 */
1303 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1304 struct file *file, enum cl_io_type iot,
1305 loff_t *ppos, size_t count)
1307 struct range_lock range;
1308 struct vvp_io *vio = vvp_env_io(env);
1309 struct inode *inode = file_inode(file);
1310 struct ll_inode_info *lli = ll_i2info(inode);
1311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1316 unsigned retried = 0;
1317 bool restarted = false;
1321 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1322 file_dentry(file)->d_name.name,
1323 iot == CIT_READ ? "read" : "write", pos, pos + count);
/* Restart point: a fresh cl_io is built for every retry iteration. */
1326 io = vvp_env_thread_io(env);
1327 ll_io_init(io, file, iot);
1328 if (args->via_io_subtype == IO_NORMAL) {
1329 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1330 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1332 if (args->via_io_subtype != IO_NORMAL || restarted)
/* Preserve the FLR non-delay retry count across restarts. */
1334 io->ci_ndelay_tried = retried;
1336 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1337 bool range_locked = false;
/* O_APPEND writes must serialize against the whole file. */
1339 if (file->f_flags & O_APPEND)
1340 range_lock_init(&range, 0, LUSTRE_EOF);
1342 range_lock_init(&range, pos, pos + count - 1);
1344 vio->vui_fd = LUSTRE_FPRIVATE(file);
1345 vio->vui_io_subtype = args->via_io_subtype;
1347 switch (vio->vui_io_subtype) {
1349 /* Direct IO reads must also take range lock,
1350 * or multiple reads will try to work on the same pages
1351 * See LU-6227 for details. */
1352 if (((iot == CIT_WRITE) ||
1353 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1354 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1355 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1357 rc = range_lock(&lli->lli_write_tree, &range);
1361 range_locked = true;
1365 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1366 vio->u.splice.vui_flags = args->u.splice.via_flags;
1369 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1373 ll_cl_add(file, env, io, LCC_RW);
/* For parallel writes, hold i_mutex across the IO for SUID/SGID safety. */
1374 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1375 !lli->lli_inode_locked) {
1377 lli->lli_inode_locked = 1;
1379 rc = cl_io_loop(env, io);
1380 if (lli->lli_inode_locked) {
1381 lli->lli_inode_locked = 0;
1382 inode_unlock(inode);
1384 ll_cl_remove(file, env);
1387 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1389 range_unlock(&lli->lli_write_tree, &range);
1392 /* cl_io_rw_init() handled IO */
1396 if (io->ci_nob > 0) {
/* Fold completed bytes into the running result and shrink the request. */
1397 result += io->ci_nob;
1398 count -= io->ci_nob;
1400 if (args->via_io_subtype == IO_NORMAL) {
1401 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1403 args->u.normal.via_iocb->ki_pos = pos;
1404 #ifdef HAVE_KIOCB_KI_LEFT
1405 args->u.normal.via_iocb->ki_left = count;
1406 #elif defined(HAVE_KI_NBYTES)
1407 args->u.normal.via_iocb->ki_nbytes = count;
1411 pos = io->u.ci_rw.rw_range.cir_pos;
1415 cl_io_fini(env, io);
1418 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1419 file->f_path.dentry->d_name.name,
1420 iot, rc, result, io->ci_need_restart);
/* Retry the remaining range if the IO asked for a restart. */
1422 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1424 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1425 file_dentry(file)->d_name.name,
1426 iot == CIT_READ ? "read" : "write",
1427 pos, pos + count, result, rc);
1428 /* preserve the tried count for FLR */
1429 retried = io->ci_ndelay_tried;
1434 if (iot == CIT_READ) {
1436 ll_stats_ops_tally(ll_i2sbi(inode),
1437 LPROC_LL_READ_BYTES, result);
1438 } else if (iot == CIT_WRITE) {
1440 ll_stats_ops_tally(ll_i2sbi(inode),
1441 LPROC_LL_WRITE_BYTES, result);
/* Track write failure state for fsync()-style error reporting. */
1442 fd->fd_write_failed = false;
1443 } else if (result == 0 && rc == 0) {
1446 fd->fd_write_failed = true;
1448 fd->fd_write_failed = false;
/* -ERESTARTSYS (signal) is not treated as a write failure. */
1449 } else if (rc != -ERESTARTSYS) {
1450 fd->fd_write_failed = true;
1454 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1455 file_dentry(file)->d_name.name,
1456 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1460 RETURN(result > 0 ? result : rc);
1464 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1465 * especially for small I/O.
1467 * To serve a read request, CLIO has to create and initialize a cl_io and
1468 * then request DLM lock. This has turned out to have significant overhead
1469 * and affects the performance of small I/O dramatically.
1471 * It's not necessary to create a cl_io for each I/O. Under the help of read
1472 * ahead, most of the pages being read are already in memory cache and we can
1473 * read those pages directly because if the pages exist, the corresponding DLM
1474 * lock must exist so that page content must be valid.
1476 * In fast read implementation, the llite speculatively finds and reads pages
1477 * in memory cache. There are three scenarios for fast read:
1478 * - If the page exists and is uptodate, kernel VM will provide the data and
1479 * CLIO won't be intervened;
1480 * - If the page was brought into memory by read ahead, it will be exported
1481 * and read ahead parameters will be updated;
1482 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1483 * it will go back and invoke normal read, i.e., a cl_io will be created
1484 * and DLM lock will be requested.
1486 * POSIX compliance: posix standard states that read is intended to be atomic.
1487 * Lustre read implementation is in line with Linux kernel read implementation
1488 * and neither of them complies with POSIX standard in this matter. Fast read
1489 * doesn't make the situation worse on single node but it may interleave write
1490 * results from multiple nodes due to short read handling in ll_file_aio_read().
1492 * \param env - lu_env
1493 * \param iocb - kiocb from kernel
1494 * \param iter - user space buffers where the data will be copied
1496 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Speculative page-cache read that bypasses cl_io/DLM setup entirely
 * (see the block comment above).  Bails out when fast read is disabled
 * for the superblock or the file is opened O_DIRECT; otherwise lets
 * generic_file_read_iter() serve the data from cache.  -ENODATA from
 * ll_readpage() signals "page not cached" and the caller falls back to
 * the normal read path.
 */
1499 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1503 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1506 /* NB: we can't do direct IO for fast read because it will need a lock
1507 * to make IO engine happy. */
1508 if (iocb->ki_filp->f_flags & O_DIRECT)
1511 result = generic_file_read_iter(iocb, iter);
1513 /* If the first page is not in cache, generic_file_aio_read() will be
1514 * returned with -ENODATA.
1515 * See corresponding code in ll_readpage(). */
1516 if (result == -ENODATA)
/* Bytes served straight from the page cache still count as read stats. */
1520 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1521 LPROC_LL_READ_BYTES, result);
1527 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the lockless fast-read path first; if it
 * returned an error or consumed the whole iterator, we are done.
 * Otherwise fall back to the full ll_file_io_generic() read for the
 * remaining bytes and combine both partial results.
 */
1529 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1532 struct vvp_io_args *args;
1537 result = ll_do_fast_read(iocb, to);
1538 if (result < 0 || iov_iter_count(to) == 0)
1541 env = cl_env_get(&refcheck);
1543 return PTR_ERR(env);
1545 args = ll_env_args(env, IO_NORMAL);
1546 args->u.normal.via_iter = to;
1547 args->u.normal.via_iocb = iocb;
/* Slow path reads whatever the fast path left in the iterator. */
1549 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1550 &iocb->ki_pos, iov_iter_count(to));
1553 else if (result == 0)
1556 cl_env_put(env, &refcheck);
1562 * Write to a file (through the page cache).
/*
 * write_iter file operation: thin wrapper that packages the iterator and
 * iocb into vvp_io_args and runs the common ll_file_io_generic() write
 * path.  Returns bytes written or a negative error.
 */
1564 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1566 struct vvp_io_args *args;
1571 env = cl_env_get(&refcheck);
1573 return PTR_ERR(env);
1575 args = ll_env_args(env, IO_NORMAL);
1576 args->u.normal.via_iter = from;
1577 args->u.normal.via_iocb = iocb;
1579 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1580 &iocb->ki_pos, iov_iter_count(from));
1581 cl_env_put(env, &refcheck);
1585 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1587 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count (compat path
 * for kernels without read_iter/write_iter).  Copied from the kernel's
 * __generic_file_aio_write_nolock(): rejects negative/overflowing
 * lengths and truncates at the first inaccessible segment.
 */
1589 static int ll_file_get_iov_count(const struct iovec *iov,
1590 unsigned long *nr_segs, size_t *count)
1595 for (seg = 0; seg < *nr_segs; seg++) {
1596 const struct iovec *iv = &iov[seg];
1599 * If any segment has a negative length, or the cumulative
1600 * length ever wraps negative then return -EINVAL.
/* (cnt|iov_len) < 0 catches both a huge segment and a wrapped total. */
1603 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1605 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1610 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read compat entry (pre-read_iter kernels): validates the iovec,
 * builds an iov_iter (API differs with HAVE_IOV_ITER_INIT_DIRECTION)
 * and forwards to ll_file_read_iter().
 */
1617 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1618 unsigned long nr_segs, loff_t pos)
1625 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1629 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1630 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1631 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1632 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1633 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1635 result = ll_file_read_iter(iocb, &to);
1640 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1643 struct iovec iov = { .iov_base = buf, .iov_len = count };
1648 init_sync_kiocb(&kiocb, file);
1649 kiocb.ki_pos = *ppos;
1650 #ifdef HAVE_KIOCB_KI_LEFT
1651 kiocb.ki_left = count;
1652 #elif defined(HAVE_KI_NBYTES)
1653 kiocb.i_nbytes = count;
1656 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1657 *ppos = kiocb.ki_pos;
1663 * Write to a file (through the page cache).
/*
 * aio_write compat entry (pre-write_iter kernels): validates the iovec,
 * builds an iov_iter (API differs with HAVE_IOV_ITER_INIT_DIRECTION)
 * and forwards to ll_file_write_iter().
 */
1666 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1667 unsigned long nr_segs, loff_t pos)
1669 struct iov_iter from;
1674 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1678 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1679 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1680 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1681 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1682 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1684 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) compat entry (pre-write_iter kernels): wraps the
 * user buffer in a single-segment iovec, uses the per-env kiocb from
 * ll_env_info() (unlike ll_file_read(), which uses a stack kiocb),
 * forwards to ll_file_aio_write() and copies the final position back.
 */
1689 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1690 size_t count, loff_t *ppos)
1693 struct iovec iov = { .iov_base = (void __user *)buf,
1695 struct kiocb *kiocb;
1700 env = cl_env_get(&refcheck);
1702 RETURN(PTR_ERR(env));
1704 kiocb = &ll_env_info(env)->lti_kiocb;
1705 init_sync_kiocb(kiocb, file);
1706 kiocb->ki_pos = *ppos;
1707 #ifdef HAVE_KIOCB_KI_LEFT
1708 kiocb->ki_left = count;
1709 #elif defined(HAVE_KI_NBYTES)
1710 kiocb->ki_nbytes = count;
1713 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1714 *ppos = kiocb->ki_pos;
1716 cl_env_put(env, &refcheck);
1719 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1722 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: packages the pipe and flags into
 * IO_SPLICE vvp_io_args and runs the common read path in
 * ll_file_io_generic().  Returns bytes spliced or a negative error.
 */
1724 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1725 struct pipe_inode_info *pipe, size_t count,
1729 struct vvp_io_args *args;
1734 env = cl_env_get(&refcheck);
1736 RETURN(PTR_ERR(env));
1738 args = ll_env_args(env, IO_SPLICE);
1739 args->u.splice.via_pipe = pipe;
1740 args->u.splice.via_flags = flags;
1742 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1743 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file by re-opening it by FID with the
 * given lov_user_md attached to the open intent.  The MDS applies the
 * layout at open time; the transient open handle is released right
 * away.  Runs under the inode size lock.
 */
1747 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1748 __u64 flags, struct lov_user_md *lum, int lum_size)
1750 struct lookup_intent oit = {
1752 .it_flags = flags | MDS_OPEN_BY_FID,
1757 ll_inode_size_lock(inode);
1758 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1760 GOTO(out_unlock, rc);
/* The open was only a vehicle for the layout; close it immediately. */
1762 ll_release_openhandle(dentry, &oit);
1765 ll_inode_size_unlock(inode);
1766 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping metadata) of @filename (child of @inode)
 * from the MDS via md_getattr_name().  Validates the layout magic
 * (V1/V3/COMP_V1) and, on big-endian hosts, byte-swaps the MDS's
 * little-endian layout to host order before returning it to the caller.
 * On success *lmmp points into the reply buffer; the caller must keep
 * *request (the ptlrpc reply) alive while using it.
 */
1771 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1772 struct lov_mds_md **lmmp, int *lmm_size,
1773 struct ptlrpc_request **request)
1775 struct ll_sb_info *sbi = ll_i2sbi(inode);
1776 struct mdt_body *body;
1777 struct lov_mds_md *lmm = NULL;
1778 struct ptlrpc_request *req = NULL;
1779 struct md_op_data *op_data;
1782 rc = ll_get_default_mdsize(sbi, &lmmsize);
1786 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1787 strlen(filename), lmmsize,
1788 LUSTRE_OPC_ANY, NULL);
1789 if (IS_ERR(op_data))
1790 RETURN(PTR_ERR(op_data));
1792 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1793 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1794 ll_finish_md_op_data(op_data);
1796 CDEBUG(D_INFO, "md_getattr_name failed "
1797 "on %s: rc %d\n", filename, rc);
1801 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1802 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1804 lmmsize = body->mbo_eadatasize;
/* No layout EA present (e.g. unstriped file/dir) -> -ENODATA. */
1806 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1808 GOTO(out, rc = -ENODATA);
1811 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1812 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite (PFL) layouts are understood here. */
1814 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1815 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1816 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1817 GOTO(out, rc = -EPROTO);
1820 * This is coming from the MDS, so is probably in
1821 * little endian. We convert it to host endian before
1822 * passing it to userspace.
/* True only on big-endian hosts: wire format is little-endian. */
1824 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1827 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1828 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1829 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1830 if (le32_to_cpu(lmm->lmm_pattern) &
1831 LOV_PATTERN_F_RELEASED)
1835 /* if function called for directory - we should
1836 * avoid swab not existent lsm objects */
1837 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1838 lustre_swab_lov_user_md_v1(
1839 (struct lov_user_md_v1 *)lmm);
/* Per-object entries exist only for regular files, not dirs. */
1840 if (S_ISREG(body->mbo_mode))
1841 lustre_swab_lov_user_md_objects(
1842 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1844 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1845 lustre_swab_lov_user_md_v3(
1846 (struct lov_user_md_v3 *)lmm);
1847 if (S_ISREG(body->mbo_mode))
1848 lustre_swab_lov_user_md_objects(
1849 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1851 } else if (lmm->lmm_magic ==
1852 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1853 lustre_swab_lov_comp_md_v1(
1854 (struct lov_comp_md_v1 *)lmm);
1860 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: copy a lov_user_md (with one OST
 * entry, MDS_OPEN_HAS_OBJS) from userspace and apply it via
 * ll_lov_setstripe_ea_info().  Root-only (CFS_CAP_SYS_ADMIN).
 */
1865 static int ll_lov_setea(struct inode *inode, struct file *file,
1868 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1869 struct lov_user_md *lump;
/* Fixed-size buffer: header plus exactly one OST object entry. */
1870 int lum_size = sizeof(struct lov_user_md) +
1871 sizeof(struct lov_user_ost_data);
1875 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1878 OBD_ALLOC_LARGE(lump, lum_size);
1882 if (copy_from_user(lump, arg, lum_size))
1883 GOTO(out_lump, rc = -EFAULT);
1885 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* Layout is set now (or failed); clear the delayed-create flag. */
1887 cl_lov_delay_create_clear(&file->f_flags);
1890 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's current layout to the userspace buffer @lum (at most
 * @size bytes) via cl_object_getstripe() on the clio object.
 */
1894 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1901 env = cl_env_get(&refcheck);
1903 RETURN(PTR_ERR(env));
1905 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1906 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler: copy the user layout, apply it
 * with ll_lov_setstripe_ea_info(), then refresh the client layout
 * generation and echo the instantiated layout back to userspace.
 */
1910 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1913 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1914 struct lov_user_md *klum;
1916 __u64 flags = FMODE_WRITE;
1919 rc = ll_copy_user_md(lum, &klum);
1924 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Reset the user-visible stripe count before copying back the layout. */
1929 rc = put_user(0, &lum->lmm_stripe_count);
1933 rc = ll_layout_refresh(inode, &gen);
1937 rc = ll_file_getstripe(inode, arg, lum_size);
1939 cl_lov_delay_create_clear(&file->f_flags);
1942 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GROUP mode DLM) lock with id
 * @arg on the file and record it in the per-open ll_file_data.  Rejects
 * gid 0, files with locking disabled, and a second group lock on the
 * same open.  For composite (PFL) layouts, first instantiates all
 * components so the lock covers every OST object.  Racing threads are
 * resolved under lli_lock: the loser drops its freshly acquired lock.
 */
1947 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1949 struct ll_inode_info *lli = ll_i2info(inode);
1950 struct cl_object *obj = lli->lli_clob;
1951 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1952 struct ll_grouplock grouplock;
1957 CWARN("group id for group lock must not be 0\n");
1961 if (ll_file_nolock(file))
1962 RETURN(-EOPNOTSUPP);
1964 spin_lock(&lli->lli_lock);
/* Only one group lock per file descriptor is allowed. */
1965 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1966 CWARN("group lock already existed with gid %lu\n",
1967 fd->fd_grouplock.lg_gid);
1968 spin_unlock(&lli->lli_lock);
1971 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1972 spin_unlock(&lli->lli_lock);
1975 * XXX: group lock needs to protect all OST objects while PFL
1976 * can add new OST objects during the IO, so we'd instantiate
1977 * all OST objects before getting its group lock.
1982 struct cl_layout cl = {
1983 .cl_is_composite = false,
/* Write intent over the whole file forces full instantiation. */
1985 struct lu_extent ext = {
1987 .e_end = OBD_OBJECT_EOF,
1990 env = cl_env_get(&refcheck);
1992 RETURN(PTR_ERR(env));
1994 rc = cl_object_layout_get(env, obj, &cl);
1995 if (!rc && cl.cl_is_composite)
1996 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
1999 cl_env_put(env, &refcheck);
2004 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2005 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under lli_lock: another thread may have won the race. */
2009 spin_lock(&lli->lli_lock);
2010 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2011 spin_unlock(&lli->lli_lock);
2012 CERROR("another thread just won the race\n");
2013 cl_put_grouplock(&grouplock);
2017 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2018 fd->fd_grouplock = grouplock;
2019 spin_unlock(&lli->lli_lock);
2021 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg held
 * on this open file.  Fails if no group lock is held or the id does not
 * match.  The lock is detached from the fd under lli_lock and released
 * outside of it (cl_put_grouplock() may block).
 */
2025 static int ll_put_grouplock(struct inode *inode, struct file *file,
2028 struct ll_inode_info *lli = ll_i2info(inode);
2029 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2030 struct ll_grouplock grouplock;
2033 spin_lock(&lli->lli_lock);
2034 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2035 spin_unlock(&lli->lli_lock);
2036 CWARN("no group lock held\n");
2040 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2042 if (fd->fd_grouplock.lg_gid != arg) {
2043 CWARN("group lock %lu doesn't match current id %lu\n",
2044 arg, fd->fd_grouplock.lg_gid);
2045 spin_unlock(&lli->lli_lock);
/* Take a local copy so the lock can be dropped outside lli_lock. */
2049 grouplock = fd->fd_grouplock;
2050 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2051 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2052 spin_unlock(&lli->lli_lock);
2054 cl_put_grouplock(&grouplock);
2055 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2060 * Close inode open handle
2062 * \param dentry [in] dentry which contains the inode
2063 * \param it [in,out] intent which contains open info and result
2066 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (see the doc
 * comment above).  No-ops for the filesystem root or when the intent
 * holds no DISP_OPEN_OPEN disposition; otherwise fills an
 * obd_client_handle from the intent and closes it on the MDS, then
 * drops the extra request reference taken at open time.
 */
2068 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2070 struct inode *inode = dentry->d_inode;
2071 struct obd_client_handle *och;
2077 /* Root ? Do nothing. */
2078 if (dentry->d_inode->i_sb->s_root == dentry)
2081 /* No open handle to close? Move away */
2082 if (!it_disposition(it, DISP_OPEN_OPEN))
2085 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2087 OBD_ALLOC(och, sizeof(*och));
2089 GOTO(out, rc = -ENOMEM);
2091 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2093 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2095 /* this one is in place of ll_file_open */
2096 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2097 ptlrpc_req_finished(it->it_request);
2098 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2104 * Get size for inode for which FIEMAP mapping is requested.
2105 * Make the FIEMAP get_info call and returns the result.
2106 * \param fiemap kernel buffer to hold extens
2107 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request (see doc comment above): validate the flags,
 * honor FIEMAP_FLAG_SYNC by flushing dirty pages, glimpse the file size
 * if unknown, and forward the request to the OSTs through
 * cl_object_fiemap() keyed by a ll_fiemap_info_key.
 */
2109 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2115 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2118 /* Checks for fiemap flags */
/* Unknown flags: report back the supported set (per fiemap protocol). */
2119 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2120 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2124 /* Check for FIEMAP_FLAG_SYNC */
2125 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2126 rc = filemap_fdatawrite(inode->i_mapping);
2131 env = cl_env_get(&refcheck);
2133 RETURN(PTR_ERR(env));
/* Size may be stale/unknown on the client; glimpse it from the OSTs. */
2135 if (i_size_read(inode) == 0) {
2136 rc = ll_glimpse_size(inode);
2141 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2142 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2143 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2145 /* If filesize is 0, then there would be no objects for mapping */
2146 if (fmkey.lfik_oa.o_size == 0) {
2147 fiemap->fm_mapped_extents = 0;
2151 fmkey.lfik_fiemap = *fiemap;
2153 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2154 &fmkey, fiemap, &num_bytes);
2156 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH ioctl: resolve a FID to a pathname via the MDT.
 * Requires CFS_CAP_DAC_READ_SEARCH unless the mount allows user
 * fid2path.  Copies the request in, appends the mount's root FID (for
 * fileset-aware servers), runs the MDC iocontrol, and copies the
 * filled-in getinfo_fid2path back to userspace.
 */
2160 int ll_fid2path(struct inode *inode, void __user *arg)
2162 struct obd_export *exp = ll_i2mdexp(inode);
2163 const struct getinfo_fid2path __user *gfin = arg;
2165 struct getinfo_fid2path *gfout;
2171 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2172 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2175 /* Only need to get the buflen */
2176 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the user-supplied length before sizing the allocation. */
2179 if (pathlen > PATH_MAX)
2182 outsize = sizeof(*gfout) + pathlen;
2183 OBD_ALLOC(gfout, outsize);
2187 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2188 GOTO(gf_free, rc = -EFAULT);
2189 /* append root FID after gfout to let MDT know the root FID so that it
2190 * can lookup the correct path, this is mainly for fileset.
2191 * old server without fileset mount support will ignore this. */
2192 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2194 /* Call mdc_iocontrol */
2195 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2199 if (copy_to_user(arg, gfout, outsize))
2203 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version (and
 * layout version) from the OSTs, honoring the flush flags in
 * ioc->idv_flags.  A file with no clio object is reported as version 0.
 * Retries transparently when the IO requests a restart.
 */
2208 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2210 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2218 ioc->idv_version = 0;
2219 ioc->idv_layout_version = UINT_MAX;
2221 /* If no file object initialized, we consider its version is 0. */
2225 env = cl_env_get(&refcheck);
2227 RETURN(PTR_ERR(env));
2229 io = vvp_env_thread_io(env);
2231 io->u.ci_data_version.dv_data_version = 0;
2232 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2233 io->u.ci_data_version.dv_flags = ioc->idv_flags;
/* cl_io_init() != 0 means lower layers already handled the request. */
2236 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2237 result = cl_io_loop(env, io);
2239 result = io->ci_result;
2241 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2242 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2244 cl_io_fini(env, io);
/* Layout changed mid-IO: redo the whole data-version request. */
2246 if (unlikely(io->ci_need_restart))
2249 cl_env_put(env, &refcheck);
2255 * Read the data_version for inode.
2257 * This value is computed using stripe object version on OST.
2258 * Version is computed using server side locking.
2260 * @param flags if do sync on the OST side;
2262 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2263 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Convenience wrapper around ll_ioc_data_version() (see doc comment
 * above): returns only the data version for @inode, with @flags
 * selecting the OST-side flush behavior (LL_DV_RD_FLUSH/LL_DV_WR_FLUSH).
 */
2265 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2267 struct ioc_data_version ioc = { .idv_flags = flags };
2270 rc = ll_ioc_data_version(inode, &ioc);
2272 *data_version = ioc.idv_version;
2278 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: free the OST objects of an archived file, keeping only
 * the MDT inode.  Takes a write lease to exclude concurrent users,
 * flushes and fetches the data version (sent to the MDT so it can
 * verify the copy is current), merges attributes, then closes the file
 * with MDS_HSM_RELEASE.  The lease itself is closed on all paths.
 */
2280 int ll_hsm_release(struct inode *inode)
2283 struct obd_client_handle *och = NULL;
2284 __u64 data_version = 0;
2289 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2290 ll_get_fsname(inode->i_sb, NULL, 0),
2291 PFID(&ll_i2info(inode)->lli_fid),
2293 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2295 GOTO(out, rc = PTR_ERR(och));
2297 /* Grab latest data_version and [am]time values */
/* LL_DV_WR_FLUSH drops all cached pages so the version is final. */
2298 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2302 env = cl_env_get(&refcheck);
2304 GOTO(out, rc = PTR_ERR(env));
2306 ll_merge_attr(env, inode);
2307 cl_env_put(env, &refcheck);
2309 /* Release the file.
2310 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2311 * we still need it to pack l_remote_handle to MDT. */
2312 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2318 if (och != NULL && !IS_ERR(och)) /* close the file */
2319 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes involved plus
 * (on elided lines) their expected data versions and check flags, kept
 * together so the pair can be reordered/swapped atomically.
 */
2324 struct ll_swap_stack {
2327 struct inode *inode1;
2328 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of two files
 * on the MDT.  Orders the pair by FID to avoid lock inversions, takes a
 * group lock on both when the caller supplies a gid (to flush dirty
 * cache), optionally verifies the caller-provided data versions have
 * not changed (-EAGAIN otherwise), and sends the swap via
 * ll_prep_md_op_data()/obd_iocontrol() with mdc_swap_layouts as opaque
 * op_data.
 */
2333 static int ll_swap_layouts(struct file *file1, struct file *file2,
2334 struct lustre_swap_layouts *lsl)
2336 struct mdc_swap_layouts msl;
2337 struct md_op_data *op_data;
2340 struct ll_swap_stack *llss = NULL;
2343 OBD_ALLOC_PTR(llss);
2347 llss->inode1 = file_inode(file1);
2348 llss->inode2 = file_inode(file2);
2350 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2354 /* we use 2 bool because it is easier to swap than 2 bits */
2355 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2356 llss->check_dv1 = true;
2358 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2359 llss->check_dv2 = true;
2361 /* we cannot use lsl->sl_dvX directly because we may swap them */
2362 llss->dv1 = lsl->sl_dv1;
2363 llss->dv2 = lsl->sl_dv2;
2365 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2366 if (rc == 0) /* same file, done! */
/* Canonical FID order prevents ABBA deadlocks between two swappers. */
2369 if (rc < 0) { /* sequentialize it */
2370 swap(llss->inode1, llss->inode2);
2372 swap(llss->dv1, llss->dv2);
2373 swap(llss->check_dv1, llss->check_dv2);
2377 if (gid != 0) { /* application asks to flush dirty cache */
2378 rc = ll_get_grouplock(llss->inode1, file1, gid);
2382 rc = ll_get_grouplock(llss->inode2, file2, gid);
2384 ll_put_grouplock(llss->inode1, file1, gid);
2389 /* ultimate check, before swapping the layouts we check if
2390 * dataversion has changed (if requested) */
2391 if (llss->check_dv1) {
2392 rc = ll_data_version(llss->inode1, &dv, 0);
2395 if (dv != llss->dv1)
2396 GOTO(putgl, rc = -EAGAIN);
2399 if (llss->check_dv2) {
2400 rc = ll_data_version(llss->inode2, &dv, 0);
2403 if (dv != llss->dv2)
2404 GOTO(putgl, rc = -EAGAIN);
2407 /* struct md_op_data is used to send the swap args to the mdt
2408 * only flags is missing, so we use struct mdc_swap_layouts
2409 * through the md_op_data->op_data */
2410 /* flags from user space have to be converted before they are send to
2411 * server, no flag is sent today, they are only used on the client */
2414 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2415 0, LUSTRE_OPC_ANY, &msl);
2416 if (IS_ERR(op_data))
2417 GOTO(free, rc = PTR_ERR(op_data));
2419 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2420 sizeof(*op_data), op_data, NULL);
2421 ll_finish_md_op_data(op_data);
/* Drop the group locks in reverse acquisition order. */
2428 ll_put_grouplock(llss->inode2, file2, gid);
2429 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT.  Validates the masks
 * against HSM_FLAGS_MASK, restricts non-HSM_USER_MASK bits to
 * CFS_CAP_SYS_ADMIN, and bounds the archive id, then ships the
 * hsm_state_set through md_op_data/obd_iocontrol.
 */
2439 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2441 struct md_op_data *op_data;
2445 /* Detect out-of range masks */
2446 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2449 /* Non-root users are forbidden to set or clear flags which are
2450 * NOT defined in HSM_USER_MASK. */
2451 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2452 !cfs_capable(CFS_CAP_SYS_ADMIN))
2455 /* Detect out-of range archive id */
2456 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2457 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2460 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2461 LUSTRE_OPC_ANY, hss);
2462 if (IS_ERR(op_data))
2463 RETURN(PTR_ERR(op_data));
2465 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2466 sizeof(*op_data), op_data, NULL);
2468 ll_finish_md_op_data(op_data);
/*
 * HSM import: register an already-archived file.  Marks the (regular)
 * file ARCHIVED|EXISTS|RELEASED in the given archive via
 * ll_hsm_state_set(), then forces the saved mode/uid/gid/size/times
 * from the hsm_user_import through ll_setattr_raw() so the MDT inode
 * reflects the archived copy.
 */
2473 static int ll_hsm_import(struct inode *inode, struct file *file,
2474 struct hsm_user_import *hui)
2476 struct hsm_state_set *hss = NULL;
2477 struct iattr *attr = NULL;
2481 if (!S_ISREG(inode->i_mode))
2487 GOTO(out, rc = -ENOMEM);
2489 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2490 hss->hss_archive_id = hui->hui_archive_id;
2491 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2492 rc = ll_hsm_state_set(inode, hss);
2496 OBD_ALLOC_PTR(attr);
2498 GOTO(out, rc = -ENOMEM);
/* Force a regular-file mode; only permission bits come from userspace. */
2500 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2501 attr->ia_mode |= S_IFREG;
2502 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2503 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2504 attr->ia_size = hui->hui_size;
2505 attr->ia_mtime.tv_sec = hui->hui_mtime;
2506 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2507 attr->ia_atime.tv_sec = hui->hui_atime;
2508 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE skips the usual permission checks for this setattr. */
2510 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2511 ATTR_UID | ATTR_GID |
2512 ATTR_MTIME | ATTR_MTIME_SET |
2513 ATTR_ATIME | ATTR_ATIME_SET;
2517 rc = ll_setattr_raw(file_dentry(file), attr, true);
2521 inode_unlock(inode);
2533 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2535 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2536 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime/mtime/ctime (including ctime,
 * which utimes(2) cannot touch) from an ll_futimes_3.  Root-only
 * (CAP_SYS_ADMIN) and regular files only; applied under i_mutex via
 * ll_setattr_raw().
 */
2539 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2541 struct inode *inode = file_inode(file);
/* Designated-initializer iattr: all three timestamps set explicitly. */
2543 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2544 ATTR_MTIME | ATTR_MTIME_SET |
2545 ATTR_CTIME | ATTR_CTIME_SET,
2547 .tv_sec = lfu->lfu_atime_sec,
2548 .tv_nsec = lfu->lfu_atime_nsec,
2551 .tv_sec = lfu->lfu_mtime_sec,
2552 .tv_nsec = lfu->lfu_mtime_nsec,
2555 .tv_sec = lfu->lfu_ctime_sec,
2556 .tv_nsec = lfu->lfu_ctime_nsec,
2562 if (!capable(CAP_SYS_ADMIN))
2565 if (!S_ISREG(inode->i_mode))
2569 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2570 inode_unlock(inode);
/*
 * Map a userspace lock mode (lockahead API) to the internal cl_lock
 * mode; MODE_READ_USER/MODE_WRITE_USER map to the corresponding CLM_*
 * values (return statements on elided lines — TODO confirm, likely
 * CLM_READ/CLM_WRITE with an error for unknown modes).
 */
2575 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2578 case MODE_READ_USER:
2580 case MODE_WRITE_USER:
2587 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2589 /* Used to allow the upper layers of the client to request an LDLM lock
2590 * without doing an actual read or write.
2592 * Used for ladvise lockahead to manually request specific locks.
2594 * \param[in] file file this ladvise lock request is on
2595 * \param[in] ladvise ladvise struct describing this lock request
2597 * \retval 0 success, no detailed result available (sync requests
2598 * and requests sent to the server [not handled locally]
2599 * cannot return detailed results)
2600 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2601 * see definitions for details.
2602 * \retval negative negative errno on error
/*
 * Request an LDLM extent lock without doing IO (ladvise lockahead; see
 * doc comment above).  Builds a CIT_MISC cl_io, fills a cl_lock_descr
 * over the byte range converted to page indices, requests the lock with
 * CEF_MUST|CEF_LOCK_NO_EXPAND (CEF_SPECULATIVE when async), and maps
 * -ECANCELED/-EEXIST to the positive LLA_RESULT_* detail codes.
 */
2604 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2606 struct lu_env *env = NULL;
2607 struct cl_io *io = NULL;
2608 struct cl_lock *lock = NULL;
2609 struct cl_lock_descr *descr = NULL;
2610 struct dentry *dentry = file->f_path.dentry;
2611 struct inode *inode = dentry->d_inode;
2612 enum cl_lock_mode cl_mode;
2613 off_t start = ladvise->lla_start;
2614 off_t end = ladvise->lla_end;
2620 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2621 "start=%llu, end=%llu\n", dentry->d_name.len,
2622 dentry->d_name.name, dentry->d_inode,
2623 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2626 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2628 GOTO(out, result = cl_mode);
2630 /* Get IO environment */
2631 result = cl_io_get(inode, &env, &io, &refcheck);
2635 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2638 * nothing to do for this io. This currently happens when
2639 * stripe sub-object's are not yet created.
2641 result = io->ci_result;
2642 } else if (result == 0) {
2643 lock = vvp_env_lock(env);
2644 descr = &lock->cll_descr;
2646 descr->cld_obj = io->ci_obj;
2647 /* Convert byte offsets to pages */
2648 descr->cld_start = cl_index(io->ci_obj, start);
2649 descr->cld_end = cl_index(io->ci_obj, end);
2650 descr->cld_mode = cl_mode;
2651 /* CEF_MUST is used because we do not want to convert a
2652 * lockahead request to a lockless lock */
2653 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2656 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2657 descr->cld_enq_flags |= CEF_SPECULATIVE;
2659 result = cl_lock_request(env, io, lock);
2661 /* On success, we need to release the lock */
/* Only our reference is dropped; the LDLM lock stays cached. */
2663 cl_lock_release(env, lock);
2665 cl_io_fini(env, io);
2666 cl_env_put(env, &refcheck);
2668 /* -ECANCELED indicates a matching lock with a different extent
2669 * was already present, and -EEXIST indicates a matching lock
2670 * on exactly the same extent was already present.
2671 * We convert them to positive values for userspace to make
2672 * recognizing true errors easier.
2673 * Note we can only return these detailed results on async requests,
2674 * as sync requests look the same as i/o requests for locking. */
2675 if (result == -ECANCELED)
2676 result = LLA_RESULT_DIFFERENT;
2677 else if (result == -EEXIST)
2678 result = LLA_RESULT_SAME;
2683 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate a single llapi_lu_ladvise entry before acting on it:
 * - the advice type must be known (<= LU_LADVISE_MAX, not INVALID);
 * - per-advice flags must fit the advice's flag mask;
 * - LOCKAHEAD additionally needs a valid (non-zero, < MODE_MAX_USER) mode;
 * - for all advices except LOCKNOEXPAND the range must satisfy start < end.
 * Each failure path logs via CDEBUG; the rc values set on failure are
 * elided in this extract. */
2685 static int ll_ladvise_sanity(struct inode *inode,
2686 struct llapi_lu_ladvise *ladvise)
2688 enum lu_ladvise_type advice = ladvise->lla_advice;
2689 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2690 * be in the first 32 bits of enum ladvise_flags */
2691 __u32 flags = ladvise->lla_peradvice_flags;
2692 /* 3 lines at 80 characters per line, should be plenty */
2695 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2697 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2698 "last supported advice is %s (value '%d'): rc = %d\n",
2699 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2700 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2704 /* Per-advice checks */
2706 case LU_LADVISE_LOCKNOEXPAND:
2707 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2709 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2711 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2712 ladvise_names[advice], rc);
2716 case LU_LADVISE_LOCKAHEAD:
2717 /* Currently only READ and WRITE modes can be requested */
2718 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2719 ladvise->lla_lockahead_mode == 0) {
2721 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2723 ll_get_fsname(inode->i_sb, NULL, 0),
2724 ladvise->lla_lockahead_mode,
2725 ladvise_names[advice], rc);
2728 case LU_LADVISE_WILLREAD:
2729 case LU_LADVISE_DONTNEED:
2731 /* Note fall through above - These checks apply to all advices
2732 * except LOCKNOEXPAND */
2733 if (flags & ~LF_DEFAULT_MASK) {
2735 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2737 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2738 ladvise_names[advice], rc);
2741 if (ladvise->lla_start >= ladvise->lla_end) {
2743 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2744 "for %s: rc = %d\n",
2745 ll_get_fsname(inode->i_sb, NULL, 0),
2746 ladvise->lla_start, ladvise->lla_end,
2747 ladvise_names[advice], rc);
2759 * Give file access advices
2761 * The ladvise interface is similar to Linux fadvise() system call, except it
2762 * forwards the advices directly from Lustre client to server. The server side
2763 * codes will apply appropriate read-ahead and caching techniques for the
2764 * corresponding files.
2766 * A typical workload for ladvise is e.g. a bunch of different clients are
2767 * doing small random reads of a file, so prefetching pages into OSS cache
2768 * with big linear reads before the random IO is a net benefit. Fetching
2769 * all that data into each client cache with fadvise() may not be, due to
2770 * much more data being sent to the client.
/* Forward one ladvise advice to the server(s) as a CIT_LADVISE cl_io:
 * copy the range/advice/flags from *ladvise into the io parameters and
 * drive it through cl_io_loop(). */
2772 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2773 struct llapi_lu_ladvise *ladvise)
2777 struct cl_ladvise_io *lio;
2782 env = cl_env_get(&refcheck);
2784 RETURN(PTR_ERR(env));
2786 io = vvp_env_thread_io(env);
2787 io->ci_obj = ll_i2info(inode)->lli_clob;
2789 /* initialize parameters for ladvise */
2790 lio = &io->u.ci_ladvise;
2791 lio->li_start = ladvise->lla_start;
2792 lio->li_end = ladvise->lla_end;
2793 lio->li_fid = ll_inode2fid(inode);
2794 lio->li_advice = ladvise->lla_advice;
2795 lio->li_flags = flags;
2797 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2798 rc = cl_io_loop(env, io);
2802 cl_io_fini(env, io);
2803 cl_env_put(env, &refcheck);
/* Record the per-file-descriptor "no lock expansion" preference set by
 * the LU_LADVISE_LOCKNOEXPAND advice; LF_UNSET clears it. */
2807 static int ll_lock_noexpand(struct file *file, int flags)
2809 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2811 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: return the inode's extended flags and
 * project ID to userspace in a struct fsxattr. */
2816 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2819 struct fsxattr fsxattr;
2821 if (copy_from_user(&fsxattr,
2822 (const struct fsxattr __user *)arg,
2826 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2827 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2828 if (copy_to_user((struct fsxattr __user *)arg,
2829 &fsxattr, sizeof(fsxattr)))
/* FS_IOC_FSSETXATTR-style handler: set extended flags and project ID.
 * Requires CAP_SYS_ADMIN.  The change is sent to the MDT via md_setattr();
 * for file objects the flags are also pushed to the OSTs through
 * cl_setattr_ost(). */
2835 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2839 struct md_op_data *op_data;
2840 struct ptlrpc_request *req = NULL;
2842 struct fsxattr fsxattr;
2843 struct cl_object *obj;
2845 /* only root could change project ID */
2846 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2849 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2850 LUSTRE_OPC_ANY, NULL);
2851 if (IS_ERR(op_data))
2852 RETURN(PTR_ERR(op_data));
2854 if (copy_from_user(&fsxattr,
2855 (const struct fsxattr __user *)arg,
2857 GOTO(out_fsxattr1, rc = -EFAULT);
2859 op_data->op_attr_flags = fsxattr.fsx_xflags;
2860 op_data->op_projid = fsxattr.fsx_projid;
2861 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2862 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2864 ptlrpc_req_finished(req);
2866 obj = ll_i2info(inode)->lli_clob;
2870 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2871 OBD_ALLOC_PTR(attr);
2873 GOTO(out_fsxattr1, rc = -ENOMEM);
2874 attr->ia_valid = ATTR_ATTR_FLAG;
2875 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2880 ll_finish_md_op_data(op_data);
/* Release the lease open handle held on this file descriptor
 * (LL_LEASE_UNLCK path of ll_file_set_lease).  Detaches fd_lease_och under
 * lli_och_mutex, optionally copies the resync-done mirror id list from
 * userspace (LL_LEASE_RESYNC_DONE), then closes the lease with the chosen
 * MDS bias.  Returns the lease type derived from the open mode. */
2884 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2887 struct inode *inode = file_inode(file);
2888 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2889 struct ll_inode_info *lli = ll_i2info(inode);
2890 struct obd_client_handle *och = NULL;
2893 enum mds_op_bias bias = 0;
2895 size_t data_size = 0;
2899 mutex_lock(&lli->lli_och_mutex);
2900 if (fd->fd_lease_och != NULL) {
2901 och = fd->fd_lease_och;
2902 fd->fd_lease_och = NULL;
2904 mutex_unlock(&lli->lli_och_mutex);
2907 GOTO(out, rc = -ENOLCK);
2909 fmode = och->och_flags;
2911 if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
2912 if (ioc->lil_count > IOC_IDS_MAX)
2913 GOTO(out, rc = -EINVAL);
2915 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2916 OBD_ALLOC(data, data_size);
2918 GOTO(out, rc = -ENOMEM);
2920 if (copy_from_user(data, (void __user *)arg, data_size))
2921 GOTO(out, rc = -EFAULT);
2923 bias = MDS_CLOSE_RESYNC_DONE;
2926 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2930 rc = ll_lease_och_release(inode, file);
2940 OBD_FREE(data, data_size);
2942 rc = ll_lease_type_from_fmode(fmode);
/* Acquire (or, for LL_LEASE_UNLCK, release) a lease on this file
 * descriptor.  The requested mode must be compatible with the file's open
 * mode.  With LL_LEASE_RESYNC the lease is opened with MDS_OPEN_RESYNC and
 * a layout resync + refresh is performed before the och is installed as
 * fd_lease_och under lli_och_mutex. */
2946 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
2949 struct inode *inode = file_inode(file);
2950 struct ll_inode_info *lli = ll_i2info(inode);
2951 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2952 struct obd_client_handle *och = NULL;
2953 __u64 open_flags = 0;
2959 switch (ioc->lil_mode) {
2960 case LL_LEASE_WRLCK:
2961 if (!(file->f_mode & FMODE_WRITE))
2963 fmode = FMODE_WRITE;
2965 case LL_LEASE_RDLCK:
2966 if (!(file->f_mode & FMODE_READ))
2970 case LL_LEASE_UNLCK:
2971 RETURN(ll_file_unlock_lease(file, ioc, arg));
2976 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2978 /* apply for lease */
2979 if (ioc->lil_flags & LL_LEASE_RESYNC)
2980 open_flags = MDS_OPEN_RESYNC;
2981 och = ll_lease_open(inode, file, fmode, open_flags);
2983 RETURN(PTR_ERR(och));
2985 if (ioc->lil_flags & LL_LEASE_RESYNC) {
2986 rc = ll_lease_file_resync(och, inode);
2988 ll_lease_close(och, inode, NULL);
2991 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
2993 ll_lease_close(och, inode, NULL);
2999 mutex_lock(&lli->lli_och_mutex);
3000 if (fd->fd_lease_och == NULL) {
3001 fd->fd_lease_och = och;
3004 mutex_unlock(&lli->lli_och_mutex);
3006 /* impossible now that only excl is supported for now */
3007 ll_lease_close(och, inode, &lease_broken);
/* Main ioctl dispatcher for regular files: striping (LOV_SETSTRIPE /
 * GETSTRIPE / SWAP_LAYOUTS), group locks, FID/path translation, data
 * version, HSM state/import, leases, futimes, ladvise, FLR mirror
 * selection and project xattrs.  Unknown commands fall through to
 * obd_iocontrol() against the data export at the bottom. */
3014 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3016 struct inode *inode = file_inode(file);
3017 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3021 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3022 PFID(ll_inode2fid(inode)), inode, cmd);
3023 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3025 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3026 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3030 case LL_IOC_GETFLAGS:
3031 /* Get the current value of the file flags */
3032 return put_user(fd->fd_flags, (int __user *)arg);
3033 case LL_IOC_SETFLAGS:
3034 case LL_IOC_CLRFLAGS:
3035 /* Set or clear specific file flags */
3036 /* XXX This probably needs checks to ensure the flags are
3037 * not abused, and to handle any flag side effects.
3039 if (get_user(flags, (int __user *) arg))
3042 if (cmd == LL_IOC_SETFLAGS) {
3043 if ((flags & LL_FILE_IGNORE_LOCK) &&
3044 !(file->f_flags & O_DIRECT)) {
3045 CERROR("%s: unable to disable locking on "
3046 "non-O_DIRECT file\n", current->comm);
3050 fd->fd_flags |= flags;
3052 fd->fd_flags &= ~flags;
3055 case LL_IOC_LOV_SETSTRIPE:
3056 case LL_IOC_LOV_SETSTRIPE_NEW:
3057 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3058 case LL_IOC_LOV_SETEA:
3059 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3060 case LL_IOC_LOV_SWAP_LAYOUTS: {
3062 struct lustre_swap_layouts lsl;
3065 if (copy_from_user(&lsl, (char __user *)arg,
3066 sizeof(struct lustre_swap_layouts)))
3069 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3072 file2 = fget(lsl.sl_fd);
3076 /* O_WRONLY or O_RDWR */
3077 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3078 GOTO(out, rc = -EPERM);
3080 intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
3082 struct inode *inode2;
3083 struct ll_inode_info *lli;
3084 struct obd_client_handle *och = NULL;
3086 lli = ll_i2info(inode);
3087 mutex_lock(&lli->lli_och_mutex);
3088 if (fd->fd_lease_och != NULL) {
3089 och = fd->fd_lease_och;
3090 fd->fd_lease_och = NULL;
3092 mutex_unlock(&lli->lli_och_mutex);
3094 GOTO(out, rc = -ENOLCK);
3095 inode2 = file_inode(file2);
3096 rc = ll_swap_layouts_close(och, inode, inode2, intent);
3098 rc = ll_swap_layouts(file, file2, &lsl);
3104 case LL_IOC_LOV_GETSTRIPE:
3105 case LL_IOC_LOV_GETSTRIPE_NEW:
3106 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3107 case FSFILT_IOC_GETFLAGS:
3108 case FSFILT_IOC_SETFLAGS:
3109 RETURN(ll_iocontrol(inode, file, cmd, arg));
3110 case FSFILT_IOC_GETVERSION_OLD:
3111 case FSFILT_IOC_GETVERSION:
3112 RETURN(put_user(inode->i_generation, (int __user *)arg));
3113 case LL_IOC_GROUP_LOCK:
3114 RETURN(ll_get_grouplock(inode, file, arg));
3115 case LL_IOC_GROUP_UNLOCK:
3116 RETURN(ll_put_grouplock(inode, file, arg));
3117 case IOC_OBD_STATFS:
3118 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3120 /* We need to special case any other ioctls we want to handle,
3121 * to send them to the MDS/OST as appropriate and to properly
3122 * network encode the arg field.
3123 case FSFILT_IOC_SETVERSION_OLD:
3124 case FSFILT_IOC_SETVERSION:
3126 case LL_IOC_FLUSHCTX:
3127 RETURN(ll_flush_ctx(inode));
3128 case LL_IOC_PATH2FID: {
3129 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3130 sizeof(struct lu_fid)))
3135 case LL_IOC_GETPARENT:
3136 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3138 case OBD_IOC_FID2PATH:
3139 RETURN(ll_fid2path(inode, (void __user *)arg));
3140 case LL_IOC_DATA_VERSION: {
3141 struct ioc_data_version idv;
3144 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3147 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3148 rc = ll_ioc_data_version(inode, &idv);
3151 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3157 case LL_IOC_GET_MDTIDX: {
3160 mdtidx = ll_get_mdt_idx(inode);
3164 if (put_user((int)mdtidx, (int __user *)arg))
3169 case OBD_IOC_GETDTNAME:
3170 case OBD_IOC_GETMDNAME:
3171 RETURN(ll_get_obd_name(inode, cmd, arg));
3172 case LL_IOC_HSM_STATE_GET: {
3173 struct md_op_data *op_data;
3174 struct hsm_user_state *hus;
3181 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3182 LUSTRE_OPC_ANY, hus);
3183 if (IS_ERR(op_data)) {
3185 RETURN(PTR_ERR(op_data));
3188 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3191 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3194 ll_finish_md_op_data(op_data);
3198 case LL_IOC_HSM_STATE_SET: {
3199 struct hsm_state_set *hss;
3206 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3211 rc = ll_hsm_state_set(inode, hss);
3216 case LL_IOC_HSM_ACTION: {
3217 struct md_op_data *op_data;
3218 struct hsm_current_action *hca;
3225 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3226 LUSTRE_OPC_ANY, hca);
3227 if (IS_ERR(op_data)) {
3229 RETURN(PTR_ERR(op_data));
3232 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3235 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3238 ll_finish_md_op_data(op_data);
3242 case LL_IOC_SET_LEASE_OLD: {
3243 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3245 RETURN(ll_file_set_lease(file, &ioc, 0));
3247 case LL_IOC_SET_LEASE: {
3248 struct ll_ioc_lease ioc;
3250 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3253 RETURN(ll_file_set_lease(file, &ioc, arg));
3255 case LL_IOC_GET_LEASE: {
3256 struct ll_inode_info *lli = ll_i2info(inode);
3257 struct ldlm_lock *lock = NULL;
3260 mutex_lock(&lli->lli_och_mutex);
3261 if (fd->fd_lease_och != NULL) {
3262 struct obd_client_handle *och = fd->fd_lease_och;
3264 lock = ldlm_handle2lock(&och->och_lease_handle);
3266 lock_res_and_lock(lock);
3267 if (!ldlm_is_cancel(lock))
3268 fmode = och->och_flags;
3270 unlock_res_and_lock(lock);
3271 LDLM_LOCK_PUT(lock);
3274 mutex_unlock(&lli->lli_och_mutex);
3276 RETURN(ll_lease_type_from_fmode(fmode));
3278 case LL_IOC_HSM_IMPORT: {
3279 struct hsm_user_import *hui;
3285 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3290 rc = ll_hsm_import(inode, file, hui);
3295 case LL_IOC_FUTIMES_3: {
3296 struct ll_futimes_3 lfu;
3298 if (copy_from_user(&lfu,
3299 (const struct ll_futimes_3 __user *)arg,
3303 RETURN(ll_file_futimes_3(file, &lfu));
3305 case LL_IOC_LADVISE: {
3306 struct llapi_ladvise_hdr *k_ladvise_hdr;
3307 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3310 int alloc_size = sizeof(*k_ladvise_hdr);
3313 u_ladvise_hdr = (void __user *)arg;
3314 OBD_ALLOC_PTR(k_ladvise_hdr);
3315 if (k_ladvise_hdr == NULL)
3318 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3319 GOTO(out_ladvise, rc = -EFAULT);
3321 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3322 k_ladvise_hdr->lah_count < 1)
3323 GOTO(out_ladvise, rc = -EINVAL);
3325 num_advise = k_ladvise_hdr->lah_count;
3326 if (num_advise >= LAH_COUNT_MAX)
3327 GOTO(out_ladvise, rc = -EFBIG);
/* Header validated; reallocate to hold the full advice array and re-copy. */
3329 OBD_FREE_PTR(k_ladvise_hdr);
3330 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3331 lah_advise[num_advise]);
3332 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3333 if (k_ladvise_hdr == NULL)
3337 * TODO: submit multiple advices to one server in a single RPC
3339 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3340 GOTO(out_ladvise, rc = -EFAULT);
3342 for (i = 0; i < num_advise; i++) {
3343 struct llapi_lu_ladvise *k_ladvise =
3344 &k_ladvise_hdr->lah_advise[i];
3345 struct llapi_lu_ladvise __user *u_ladvise =
3346 &u_ladvise_hdr->lah_advise[i];
3348 rc = ll_ladvise_sanity(inode, k_ladvise);
3350 GOTO(out_ladvise, rc);
3352 switch (k_ladvise->lla_advice) {
3353 case LU_LADVISE_LOCKNOEXPAND:
3354 rc = ll_lock_noexpand(file,
3355 k_ladvise->lla_peradvice_flags);
3356 GOTO(out_ladvise, rc);
3357 case LU_LADVISE_LOCKAHEAD:
3359 rc = ll_file_lock_ahead(file, k_ladvise);
3362 GOTO(out_ladvise, rc);
3365 &u_ladvise->lla_lockahead_result))
3366 GOTO(out_ladvise, rc = -EFAULT);
3369 rc = ll_ladvise(inode, file,
3370 k_ladvise_hdr->lah_flags,
3373 GOTO(out_ladvise, rc);
3380 OBD_FREE(k_ladvise_hdr, alloc_size);
3383 case LL_IOC_FLR_SET_MIRROR: {
3384 /* mirror I/O must be direct to avoid polluting page cache
3386 if (!(file->f_flags & O_DIRECT))
3389 fd->fd_designated_mirror = (__u32)arg;
3392 case LL_IOC_FSGETXATTR:
3393 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3394 case LL_IOC_FSSETXATTR:
3395 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3397 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3399 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3400 (void __user *)arg));
3404 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Helper for the llseek fallback path: validate the target offset
 * (negative offsets need FMODE_UNSIGNED_OFFSET; offsets beyond maxsize
 * are rejected) and commit it to f_pos, resetting f_version. */
3405 static inline loff_t
3406 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3408 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3410 if (offset > maxsize)
3413 if (offset != file->f_pos) {
3414 file->f_pos = offset;
3415 file->f_version = 0;
/* Local fallback implementation of generic_file_llseek_size() for kernels
 * without it (compiled under !HAVE_FILE_LLSEEK_SIZE).  Handles SEEK_CUR
 * position queries without rewriting f_pos, and the simple SEEK_DATA /
 * SEEK_HOLE semantics where the whole file is data with a virtual hole at
 * i_size. */
3421 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3422 loff_t maxsize, loff_t eof)
3424 struct inode *inode = file_inode(file);
3432 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3433 * position-querying operation. Avoid rewriting the "same"
3434 * f_pos value back to the file because a concurrent read(),
3435 * write() or lseek() might have altered it
3440 * f_lock protects against read/modify/write race with other
3441 * SEEK_CURs. Note that parallel writes and reads behave
3445 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3446 inode_unlock(inode);
3450 * In the generic case the entire file is data, so as long as
3451 * offset isn't at the end of the file then the offset is data.
3458 * There is a virtual hole at the end of the file, so as long as
3459 * offset isn't i_size or larger, return i_size.
3467 return llseek_execute(file, offset, maxsize);
/* llseek entry point.  SEEK_END/SEEK_HOLE/SEEK_DATA need an up-to-date
 * size, so glimpse it from the OSTs first, then delegate to
 * ll_generic_file_llseek_size() bounded by the filesystem max byte. */
3471 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3473 struct inode *inode = file_inode(file);
3474 loff_t retval, eof = 0;
3477 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3478 (origin == SEEK_CUR) ? file->f_pos : 0);
3479 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3480 PFID(ll_inode2fid(inode)), inode, retval, retval,
3482 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3484 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3485 retval = ll_glimpse_size(inode);
3488 eof = i_size_read(inode);
3491 retval = ll_generic_file_llseek_size(file, offset, origin,
3492 ll_file_maxbytes(inode), eof);
/* flush() file operation: report (and clear) any asynchronous writeback
 * error recorded against this inode, but only once per descriptor —
 * if fd_write_failed is already set the application has seen the error. */
3496 static int ll_flush(struct file *file, fl_owner_t id)
3498 struct inode *inode = file_inode(file);
3499 struct ll_inode_info *lli = ll_i2info(inode);
3500 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3503 LASSERT(!S_ISDIR(inode->i_mode));
3505 /* catch async errors that were recorded back when async writeback
3506 * failed for pages in this mapping. */
3507 rc = lli->lli_async_rc;
3508 lli->lli_async_rc = 0;
3509 if (lli->lli_clob != NULL) {
3510 err = lov_read_and_clear_async_rc(lli->lli_clob);
3515 /* The application has been told write failure already.
3516 * Do not report failure again. */
3517 if (fd->fd_write_failed)
3519 return rc ? -EIO : 0;
3523 * Called to make sure a portion of file has been written out.
3524 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3526 * Return how many pages have been written.
/* Flush [start, end] of the file via a CIT_FSYNC cl_io.  Unless mode is
 * CL_FSYNC_LOCAL this results in OST_SYNC RPCs.  On success returns the
 * number of pages written (fi_nr_written); the io result otherwise. */
3528 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3529 enum cl_fsync_mode mode, int ignore_layout)
3533 struct cl_fsync_io *fio;
3538 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3539 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3542 env = cl_env_get(&refcheck);
3544 RETURN(PTR_ERR(env));
3546 io = vvp_env_thread_io(env);
3547 io->ci_obj = ll_i2info(inode)->lli_clob;
3548 io->ci_ignore_layout = ignore_layout;
3550 /* initialize parameters for sync */
3551 fio = &io->u.ci_fsync;
3552 fio->fi_start = start;
3554 fio->fi_fid = ll_inode2fid(inode);
3555 fio->fi_mode = mode;
3556 fio->fi_nr_written = 0;
3558 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3559 result = cl_io_loop(env, io);
3561 result = io->ci_result;
3563 result = fio->fi_nr_written;
3564 cl_io_fini(env, io);
3565 cl_env_put(env, &refcheck);
3571 * When dentry is provided (the 'else' case), file_dentry() may be
3572 * null and dentry must be used directly rather than pulled from
3573 * file_dentry() as is done otherwise.
/* fsync() file operation; the prototype varies with kernel version
 * (4-arg range fsync, 2-arg, or legacy dentry-based), selected by the
 * HAVE_FILE_FSYNC_* configure checks.  Waits for dirty pages, collects
 * recorded async writeback errors, syncs metadata via md_fsync() and data
 * via cl_sync_file_range(CL_FSYNC_ALL), and updates fd_write_failed so a
 * later flush does not re-report an error already returned here. */
3576 #ifdef HAVE_FILE_FSYNC_4ARGS
3577 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3579 struct dentry *dentry = file_dentry(file);
3581 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3582 int ll_fsync(struct file *file, int datasync)
3584 struct dentry *dentry = file_dentry(file);
3586 loff_t end = LLONG_MAX;
3588 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3591 loff_t end = LLONG_MAX;
3593 struct inode *inode = dentry->d_inode;
3594 struct ll_inode_info *lli = ll_i2info(inode);
3595 struct ptlrpc_request *req;
3599 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3600 PFID(ll_inode2fid(inode)), inode);
3601 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3603 #ifdef HAVE_FILE_FSYNC_4ARGS
3604 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3605 lock_inode = !lli->lli_inode_locked;
3609 /* fsync's caller has already called _fdata{sync,write}, we want
3610 * that IO to finish before calling the osc and mdc sync methods */
3611 rc = filemap_fdatawait(inode->i_mapping);
3614 /* catch async errors that were recorded back when async writeback
3615 * failed for pages in this mapping. */
3616 if (!S_ISDIR(inode->i_mode)) {
3617 err = lli->lli_async_rc;
3618 lli->lli_async_rc = 0;
3621 if (lli->lli_clob != NULL) {
3622 err = lov_read_and_clear_async_rc(lli->lli_clob);
3628 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3632 ptlrpc_req_finished(req);
3634 if (S_ISREG(inode->i_mode)) {
3635 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3637 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3638 if (rc == 0 && err < 0)
3641 fd->fd_write_failed = true;
3643 fd->fd_write_failed = false;
3646 #ifdef HAVE_FILE_FSYNC_4ARGS
3648 inode_unlock(inode);
/* flock()/fcntl() lock handler.  Translates the kernel file_lock (BSD
 * FL_FLOCK or POSIX FL_POSIX) into an LDLM_FLOCK enqueue against the MDS,
 * then mirrors the server result into the local lock bookkeeping via
 * locks_lock_file_wait() (or the pre-4.4 flock/posix variants).  If the
 * local step fails, the server-side lock is rolled back with an LCK_NL
 * (unlock) enqueue. */
3654 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3656 struct inode *inode = file_inode(file);
3657 struct ll_sb_info *sbi = ll_i2sbi(inode);
3658 struct ldlm_enqueue_info einfo = {
3659 .ei_type = LDLM_FLOCK,
3660 .ei_cb_cp = ldlm_flock_completion_ast,
3661 .ei_cbdata = file_lock,
3663 struct md_op_data *op_data;
3664 struct lustre_handle lockh = { 0 };
3665 union ldlm_policy_data flock = { { 0 } };
3666 int fl_type = file_lock->fl_type;
3672 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3673 PFID(ll_inode2fid(inode)), file_lock);
3675 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3677 if (file_lock->fl_flags & FL_FLOCK) {
3678 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3679 /* flocks are whole-file locks */
3680 flock.l_flock.end = OFFSET_MAX;
3681 /* For flocks owner is determined by the local file descriptor*/
3682 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3683 } else if (file_lock->fl_flags & FL_POSIX) {
3684 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3685 flock.l_flock.start = file_lock->fl_start;
3686 flock.l_flock.end = file_lock->fl_end;
3690 flock.l_flock.pid = file_lock->fl_pid;
3692 /* Somewhat ugly workaround for svc lockd.
3693 * lockd installs custom fl_lmops->lm_compare_owner that checks
3694 * for the fl_owner to be the same (which it always is on local node
3695 * I guess between lockd processes) and then compares pid.
3696 * As such we assign pid to the owner field to make it all work,
3697 * conflict with normal locks is unlikely since pid space and
3698 * pointer space for current->files are not intersecting */
3699 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3700 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3704 einfo.ei_mode = LCK_PR;
3707 /* An unlock request may or may not have any relation to
3708 * existing locks so we may not be able to pass a lock handle
3709 * via a normal ldlm_lock_cancel() request. The request may even
3710 * unlock a byte range in the middle of an existing lock. In
3711 * order to process an unlock request we need all of the same
3712 * information that is given with a normal read or write record
3713 * lock request. To avoid creating another ldlm unlock (cancel)
3714 * message we'll treat a LCK_NL flock request as an unlock. */
3715 einfo.ei_mode = LCK_NL;
3718 einfo.ei_mode = LCK_PW;
3721 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3736 flags = LDLM_FL_BLOCK_NOWAIT;
3742 flags = LDLM_FL_TEST_LOCK;
3745 CERROR("unknown fcntl lock command: %d\n", cmd);
3749 /* Save the old mode so that if the mode in the lock changes we
3750 * can decrement the appropriate reader or writer refcount. */
3751 file_lock->fl_type = einfo.ei_mode;
3753 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3754 LUSTRE_OPC_ANY, NULL);
3755 if (IS_ERR(op_data))
3756 RETURN(PTR_ERR(op_data));
3758 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3759 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3760 flock.l_flock.pid, flags, einfo.ei_mode,
3761 flock.l_flock.start, flock.l_flock.end);
3763 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3766 /* Restore the file lock type if not TEST lock. */
3767 if (!(flags & LDLM_FL_TEST_LOCK))
3768 file_lock->fl_type = fl_type;
3770 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3771 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3772 !(flags & LDLM_FL_TEST_LOCK))
3773 rc2 = locks_lock_file_wait(file, file_lock);
3775 if ((file_lock->fl_flags & FL_FLOCK) &&
3776 (rc == 0 || file_lock->fl_type == F_UNLCK))
3777 rc2 = flock_lock_file_wait(file, file_lock);
3778 if ((file_lock->fl_flags & FL_POSIX) &&
3779 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3780 !(flags & LDLM_FL_TEST_LOCK))
3781 rc2 = posix_lock_file_wait(file, file_lock);
3782 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock with an unlock. */
3784 if (rc2 && file_lock->fl_type != F_UNLCK) {
3785 einfo.ei_mode = LCK_NL;
3786 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3791 ll_finish_md_op_data(op_data);
/* Look up the FID of the entry @name in directory @parent via an
 * md_getattr_name RPC.  On success *fid is filled; when @inode is
 * non-NULL the inode is also instantiated from the reply. */
3796 int ll_get_fid_by_name(struct inode *parent, const char *name,
3797 int namelen, struct lu_fid *fid,
3798 struct inode **inode)
3800 struct md_op_data *op_data = NULL;
3801 struct mdt_body *body;
3802 struct ptlrpc_request *req;
3806 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3807 LUSTRE_OPC_ANY, NULL);
3808 if (IS_ERR(op_data))
3809 RETURN(PTR_ERR(op_data));
3811 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3812 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3813 ll_finish_md_op_data(op_data);
3817 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3819 GOTO(out_req, rc = -EFAULT);
3821 *fid = body->mbo_fid1;
3824 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3826 ptlrpc_req_finished(req);
/* Migrate the entry @name under @parent to MDT @mdtidx (lfs migrate).
 * Resolves the child inode (dcache first, then by-name RPC), refuses to
 * migrate the filesystem root, skips the work if the object already lives
 * on the target MDT, and for regular files takes a write lease plus data
 * version so the server can detect concurrent modification.  The actual
 * move is an md_rename() with MDS_RENAME_MIGRATE/CLI_MIGRATE; -EAGAIN
 * from a layout change triggers a retry (elided control flow). */
3830 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3831 const char *name, int namelen)
3833 struct dentry *dchild = NULL;
3834 struct inode *child_inode = NULL;
3835 struct md_op_data *op_data;
3836 struct ptlrpc_request *request = NULL;
3837 struct obd_client_handle *och = NULL;
3839 struct mdt_body *body;
3841 __u64 data_version = 0;
3844 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3845 name, PFID(ll_inode2fid(parent)), mdtidx);
3847 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3848 0, LUSTRE_OPC_ANY, NULL);
3849 if (IS_ERR(op_data))
3850 RETURN(PTR_ERR(op_data));
3852 /* Get child FID first */
3853 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3856 dchild = d_lookup(file_dentry(file), &qstr);
3857 if (dchild != NULL) {
3858 if (dchild->d_inode != NULL)
3859 child_inode = igrab(dchild->d_inode);
3863 if (child_inode == NULL) {
3864 rc = ll_get_fid_by_name(parent, name, namelen,
3865 &op_data->op_fid3, &child_inode);
3870 if (child_inode == NULL)
3871 GOTO(out_free, rc = -EINVAL);
3874 * lfs migrate command needs to be blocked on the client
3875 * by checking the migrate FID against the FID of the
3878 if (child_inode == parent->i_sb->s_root->d_inode)
3879 GOTO(out_iput, rc = -EINVAL);
3881 inode_lock(child_inode);
3882 op_data->op_fid3 = *ll_inode2fid(child_inode);
3883 if (!fid_is_sane(&op_data->op_fid3)) {
3884 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3885 ll_get_fsname(parent->i_sb, NULL, 0), name,
3886 PFID(&op_data->op_fid3));
3887 GOTO(out_unlock, rc = -EINVAL);
3890 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3892 GOTO(out_unlock, rc);
3895 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3896 PFID(&op_data->op_fid3), mdtidx);
3897 GOTO(out_unlock, rc = 0);
3900 if (S_ISREG(child_inode->i_mode)) {
3901 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3905 GOTO(out_unlock, rc);
3908 rc = ll_data_version(child_inode, &data_version,
3911 GOTO(out_close, rc);
3913 op_data->op_handle = och->och_fh;
3914 op_data->op_data = och->och_mod;
3915 op_data->op_data_version = data_version;
3916 op_data->op_lease_handle = och->och_lease_handle;
3917 op_data->op_bias |= MDS_RENAME_MIGRATE;
3920 op_data->op_mds = mdtidx;
3921 op_data->op_cli_flags = CLI_MIGRATE;
3922 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3923 namelen, name, namelen, &request);
3925 LASSERT(request != NULL);
3926 ll_update_times(request, parent);
3928 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3929 LASSERT(body != NULL);
3931 /* If the server does release layout lock, then we cleanup
3932 * the client och here, otherwise release it in out_close: */
3934 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3935 obd_mod_put(och->och_mod);
3936 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3938 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3944 if (request != NULL) {
3945 ptlrpc_req_finished(request);
3949 /* Try again if the file layout has changed. */
3950 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3954 if (och != NULL) /* close the file */
3955 ll_lease_close(och, child_inode, NULL);
3957 clear_nlink(child_inode);
3959 inode_unlock(child_inode);
3963 ll_finish_md_op_data(op_data);
3968 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3976 * test if some locks matching bits and l_req_mode are acquired
3977 * - bits can be in different locks
3978 * - if found clear the common lock bits in *bits
3979 * - the bits not found, are kept in *bits
3981 * \param bits [IN] searched lock bits [IN]
3982 * \param l_req_mode [IN] searched lock mode
3983 * \retval boolean, true iff all bits are found
/* Test which of the requested MDS inodebits locks are already cached:
 * match each bit in *bits individually against the inode's resource with
 * LDLM_FL_TEST_LOCK (no reference taken), clearing matched bits from
 * *bits.  LCK_MINMODE means "any of CR|CW|PR|PW". */
3985 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3987 struct lustre_handle lockh;
3988 union ldlm_policy_data policy;
3989 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3990 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3999 fid = &ll_i2info(inode)->lli_fid;
4000 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4001 ldlm_lockname[mode]);
4003 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4004 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4005 policy.l_inodebits.bits = *bits & (1 << i);
4006 if (policy.l_inodebits.bits == 0)
4009 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4010 &policy, mode, &lockh)) {
4011 struct ldlm_lock *lock;
4013 lock = ldlm_handle2lock(&lockh);
4016 ~(lock->l_policy_data.l_inodebits.bits);
4017 LDLM_LOCK_PUT(lock);
4019 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an already-granted MDT inodebits lock covering @bits with
 * any of the modes in @mode.  On success the matched mode is returned and
 * @lockh references the lock (caller must drop it); 0 means no match.
 */
4026 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4027 struct lustre_handle *lockh, __u64 flags,
4028 enum ldlm_mode mode)
4030 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4035 fid = &ll_i2info(inode)->lli_fid;
4036 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* unlike ll_have_md_lock() no LDLM_FL_TEST_LOCK: a real reference is taken */
4038 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4039 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an MDT revalidation RPC: translate expected
 * error codes (notably -ENOENT for an unlinked object) and log the rest.
 */
4044 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4046 /* Already unlinked. Just update nlink and return success */
4047 if (rc == -ENOENT) {
4049 /* If it is striped directory, and there is bad stripe
4050 * Let's revalidate the dentry again, instead of returning
4052 if (S_ISDIR(inode->i_mode) &&
4053 ll_i2info(inode)->lli_lsm_md != NULL)
4056 /* This path cannot be hit for regular files unless in
4057 * case of obscure races, so no need to validate
4059 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4061 } else if (rc != 0) {
/* -EACCES/-EIDRM are ordinary permission/identity failures: log quietly */
4062 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4063 "%s: revalidate FID "DFID" error: rc = %d\n",
4064 ll_get_fsname(inode->i_sb, NULL, 0),
4065 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh the MDT attributes of @dentry's inode, protected by the ibits
 * locks in @ibits.  Two paths: an intent-getattr-by-FID when the server
 * supports OBD_CONNECT_ATTRFID, otherwise a plain md_getattr() issued only
 * when no matching MD lock is already cached.
 */
4071 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4073 struct inode *inode = dentry->d_inode;
4074 struct ptlrpc_request *req = NULL;
4075 struct obd_export *exp;
4079 LASSERT(inode != NULL);
4081 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4082 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4084 exp = ll_i2mdexp(inode);
4086 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
4087 * But under CMD case, it caused some lock issues, should be fixed
4088 * with new CMD ibits lock. See bug 12718 */
4089 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
4090 struct lookup_intent oit = { .it_op = IT_GETATTR };
4091 struct md_op_data *op_data;
/* only the LOOKUP bit requested: a cheaper lookup intent suffices */
4093 if (ibits == MDS_INODELOCK_LOOKUP)
4094 oit.it_op = IT_LOOKUP;
4096 /* Call getattr by fid, so do not provide name at all. */
4097 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
4098 dentry->d_inode, NULL, 0, 0,
4099 LUSTRE_OPC_ANY, NULL);
4100 if (IS_ERR(op_data))
4101 RETURN(PTR_ERR(op_data));
4103 rc = md_intent_lock(exp, op_data, &oit, &req,
4104 &ll_md_blocking_ast, 0);
4105 ll_finish_md_op_data(op_data);
4107 rc = ll_inode_revalidate_fini(inode, rc);
4111 rc = ll_revalidate_it_finish(req, &oit, dentry);
4113 ll_intent_release(&oit);
4117 /* Unlinked? Unhash dentry, so it is not picked up later by
4118 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4119 here to preserve get_cwd functionality on 2.6.
4121 if (!dentry->d_inode->i_nlink) {
4122 ll_lock_dcache(inode);
4123 d_lustre_invalidate(dentry, 0);
4124 ll_unlock_dcache(inode);
4127 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: skip the RPC entirely if a lock already caches ibits */
4128 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
4129 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
4130 u64 valid = OBD_MD_FLGETATTR;
4131 struct md_op_data *op_data;
/* regular files also need striping EA; size the reply buffer for it */
4134 if (S_ISREG(inode->i_mode)) {
4135 rc = ll_get_default_mdsize(sbi, &ealen);
4138 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
4141 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
4142 0, ealen, LUSTRE_OPC_ANY,
4144 if (IS_ERR(op_data))
4145 RETURN(PTR_ERR(op_data));
4147 op_data->op_valid = valid;
4148 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
4149 ll_finish_md_op_data(op_data);
4151 rc = ll_inode_revalidate_fini(inode, rc);
/* merge the fresh attributes from the reply into the in-core inode */
4155 rc = ll_prep_inode(&inode, req, NULL, NULL);
4158 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the per-stripe MDT attributes (nlink,
 * blocks, size, timestamps) into the master inode via md_merge_attr().
 */
4162 static int ll_merge_md_attr(struct inode *inode)
4164 struct cl_attr attr = { 0 };
/* only meaningful for striped dirs: the stripe descriptor must exist */
4167 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4168 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4169 &attr, ll_md_blocking_ast);
4173 set_nlink(inode, attr.cat_nlink);
4174 inode->i_blocks = attr.cat_blocks;
4175 i_size_write(inode, attr.cat_size);
/* cache merged times in lli; ll_inode_revalidate() copies them to inode */
4177 ll_i2info(inode)->lli_atime = attr.cat_atime;
4178 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4179 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Full revalidation: refresh MDT attributes, then (for regular files)
 * glimpse the OSTs for an up-to-date size, or (for striped directories)
 * merge attributes across stripes.
 */
4185 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
4187 struct inode *inode = dentry->d_inode;
4191 rc = __ll_inode_revalidate(dentry, ibits);
4195 /* if object isn't regular file, don't validate size */
4196 if (!S_ISREG(inode->i_mode)) {
4197 if (S_ISDIR(inode->i_mode) &&
4198 ll_i2info(inode)->lli_lsm_md != NULL) {
4199 rc = ll_merge_md_attr(inode);
/* publish the MDT-supplied (or merged) times to the VFS inode */
4204 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
4205 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
4206 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
4208 /* In case of restore, the MDT has the right size and has
4209 * already send it back without granting the layout lock,
4210 * inode is up-to-date so glimpse is useless.
4211 * Also to glimpse we need the layout, in case of a running
4212 * restore the MDT holds the layout lock so the glimpse will
4213 * block up to the end of restore (getattr will block)
4215 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
4216 rc = ll_glimpse_size(inode);
4221 static inline dev_t ll_compat_encode_dev(dev_t dev)
4223 /* The compat_sys_*stat*() syscalls will fail unless the
4224 * device majors and minors are both less than 256. Note that
4225 * the value returned here will be passed through
4226 * old_encode_dev() in cp_compat_stat(). And so we are not
4227 * trying to return a valid compat (u16) device number, just
4228 * one that will pass the old_valid_dev() check. */
4230 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * VFS ->getattr(): revalidate the inode against the MDT, then fill *stat
 * from the in-core inode.  Signature differs by kernel version, selected
 * with HAVE_INODEOPS_ENHANCED_GETATTR.
 */
4233 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4234 int ll_getattr(const struct path *path, struct kstat *stat,
4235 u32 request_mask, unsigned int flags)
4238 struct dentry *de = path->dentry;
4240 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4243 struct inode *inode = de->d_inode;
4244 struct ll_sb_info *sbi = ll_i2sbi(inode);
4245 struct ll_inode_info *lli = ll_i2info(inode);
4248 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
4249 MDS_INODELOCK_LOOKUP);
4250 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* fault-injection hook used by sanity tests to delay getattr */
4255 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace on a 64-bit kernel needs squashed ino/dev numbers */
4257 if (ll_need_32bit_api(sbi)) {
4258 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4259 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4260 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4262 stat->ino = inode->i_ino;
4263 stat->dev = inode->i_sb->s_dev;
4264 stat->rdev = inode->i_rdev;
4267 stat->mode = inode->i_mode;
4268 stat->uid = inode->i_uid;
4269 stat->gid = inode->i_gid;
4270 stat->atime = inode->i_atime;
4271 stat->mtime = inode->i_mtime;
4272 stat->ctime = inode->i_ctime;
/* admin may override the advertised st_blksize via ll_stat_blksize */
4273 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4275 stat->nlink = inode->i_nlink;
4276 stat->size = i_size_read(inode);
4277 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap(): marshal the kernel's fiemap_extent_info into an on-wire
 * struct fiemap, run ll_do_fiemap(), and copy the mapped extents back to
 * the user buffer referenced by fieinfo.
 */
4282 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4283 __u64 start, __u64 len)
4287 struct fiemap *fiemap;
4288 unsigned int extent_count = fieinfo->fi_extents_max;
4290 num_bytes = sizeof(*fiemap) + (extent_count *
4291 sizeof(struct fiemap_extent));
4292 OBD_ALLOC_LARGE(fiemap, num_bytes);
4297 fiemap->fm_flags = fieinfo->fi_flags;
4298 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4299 fiemap->fm_start = start;
4300 fiemap->fm_length = len;
/* only the first user extent is copied in — presumably to seed
 * fm_extents[0] for a continued (restartable) mapping; confirm against
 * ll_do_fiemap()'s use of the seed extent. */
4301 if (extent_count > 0 &&
4302 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4303 sizeof(struct fiemap_extent)) != 0)
4304 GOTO(out, rc = -EFAULT);
4306 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4308 fieinfo->fi_flags = fiemap->fm_flags;
4309 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* copy back only the extents actually mapped, not the full allocation */
4310 if (extent_count > 0 &&
4311 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4312 fiemap->fm_mapped_extents *
4313 sizeof(struct fiemap_extent)) != 0)
4314 GOTO(out, rc = -EFAULT);
4316 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl(): return a referenced copy of the cached POSIX ACL held
 * in ll_inode_info; lli_lock guards the cached pointer.
 */
4320 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4322 struct ll_inode_info *lli = ll_i2info(inode);
4323 struct posix_acl *acl = NULL;
4326 spin_lock(&lli->lli_lock);
4327 /* VFS' acl_permission_check->check_acl will release the refcount */
4328 acl = posix_acl_dup(lli->lli_posix_acl);
4329 spin_unlock(&lli->lli_lock);
/*
 * VFS ->set_acl(): serialize @acl to its xattr representation and store it
 * via __vfs_setxattr(), then refresh or drop the local ACL cache.
 */
4334 #ifdef HAVE_IOP_SET_ACL
4335 #ifdef CONFIG_FS_POSIX_ACL
4336 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4338 const char *name = NULL;
4345 case ACL_TYPE_ACCESS:
/* an access ACL may rewrite the file mode bits; update them first */
4347 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4351 name = XATTR_NAME_POSIX_ACL_ACCESS;
4353 case ACL_TYPE_DEFAULT:
/* default ACLs exist only on directories */
4354 if (!S_ISDIR(inode->i_mode))
4355 GOTO(out, rc = acl ? -EACCES : 0);
4356 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4359 GOTO(out, rc = -EINVAL);
4363 size = posix_acl_xattr_size(acl->a_count);
4364 value = kmalloc(size, GFP_NOFS);
4366 GOTO(out, rc = -ENOMEM);
4368 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4373 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4374 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* on success cache the new ACL; on failure invalidate the stale cache */
4379 set_cached_acl(inode, type, acl);
4381 forget_cached_acl(inode, type);
4384 #endif /* CONFIG_FS_POSIX_ACL */
4385 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL callback handed to ll_generic_permission() on kernels without the
 * two-argument generic_permission(); checks @mask against the cached
 * access ACL.  Signature varies with HAVE_GENERIC_PERMISSION_4ARGS.
 */
4387 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4389 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4390 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4392 ll_check_acl(struct inode *inode, int mask)
4395 # ifdef CONFIG_FS_POSIX_ACL
4396 struct posix_acl *acl;
/* cannot consult the ACL under RCU-walk; tell the VFS to retry in ref-walk */
4400 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4401 if (flags & IPERM_FLAG_RCU)
4404 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4409 rc = posix_acl_permission(inode, acl, mask);
4410 posix_acl_release(acl);
4413 # else /* !CONFIG_FS_POSIX_ACL */
4415 # endif /* CONFIG_FS_POSIX_ACL */
4417 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission(): revalidate the root inode when needed, apply root
 * squashing (fsuid/fsgid and FS capabilities) for the duration of the
 * check, then defer to generic permission checking.  Signature varies
 * with kernel version (flags / nameidata / 2-arg forms).
 */
4419 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4420 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4422 # ifdef HAVE_INODE_PERMISION_2ARGS
4423 int ll_inode_permission(struct inode *inode, int mask)
4425 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4430 struct ll_sb_info *sbi;
4431 struct root_squash_info *squash;
4432 struct cred *cred = NULL;
4433 const struct cred *old_cred = NULL;
4435 bool squash_id = false;
/* may not block in RCU-walk mode: punt back to the VFS for ref-walk */
4438 #ifdef MAY_NOT_BLOCK
4439 if (mask & MAY_NOT_BLOCK)
4441 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4442 if (flags & IPERM_FLAG_RCU)
4446 /* as root inode are NOT getting validated in lookup operation,
4447 * need to do it before permission check. */
4449 if (inode == inode->i_sb->s_root->d_inode) {
4450 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4451 MDS_INODELOCK_LOOKUP);
4456 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4457 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4459 /* squash fsuid/fsgid if needed */
4460 sbi = ll_i2sbi(inode);
4461 squash = &sbi->ll_squash;
/* squash only when configured, caller is root, and nosquash not set */
4462 if (unlikely(squash->rsi_uid != 0 &&
4463 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4464 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4468 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4469 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4470 squash->rsi_uid, squash->rsi_gid);
4472 /* update current process's credentials
4473 * and FS capability */
4474 cred = prepare_creds();
4478 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4479 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4480 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4481 if ((1 << cap) & CFS_CAP_FS_MASK)
4482 cap_lower(cred->cap_effective, cap);
4484 old_cred = override_creds(cred);
4487 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4488 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4489 /* restore current process's credentials and FS capability */
4491 revert_creds(old_cred);
/* Default file_operations table (no .flock/.lock entries).
 * -o localflock - only provides locally consistent flock locks */
4498 /* -o localflock - only provides locally consistent flock locks */
4499 struct file_operations ll_file_operations = {
/* pre-iter kernels synthesize read/write from the iter handlers */
4500 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4501 # ifdef HAVE_SYNC_READ_WRITE
4502 .read = new_sync_read,
4503 .write = new_sync_write,
4505 .read_iter = ll_file_read_iter,
4506 .write_iter = ll_file_write_iter,
4507 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4508 .read = ll_file_read,
4509 .aio_read = ll_file_aio_read,
4510 .write = ll_file_write,
4511 .aio_write = ll_file_aio_write,
4512 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4513 .unlocked_ioctl = ll_file_ioctl,
4514 .open = ll_file_open,
4515 .release = ll_file_release,
4516 .mmap = ll_file_mmap,
4517 .llseek = ll_file_seek,
4518 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: identical to the default table
 * plus .flock/.lock handlers that take cluster-coherent DLM locks. */
4523 struct file_operations ll_file_operations_flock = {
4524 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4525 # ifdef HAVE_SYNC_READ_WRITE
4526 .read = new_sync_read,
4527 .write = new_sync_write,
4528 # endif /* HAVE_SYNC_READ_WRITE */
4529 .read_iter = ll_file_read_iter,
4530 .write_iter = ll_file_write_iter,
4531 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4532 .read = ll_file_read,
4533 .aio_read = ll_file_aio_read,
4534 .write = ll_file_write,
4535 .aio_write = ll_file_aio_write,
4536 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4537 .unlocked_ioctl = ll_file_ioctl,
4538 .open = ll_file_open,
4539 .release = ll_file_release,
4540 .mmap = ll_file_mmap,
4541 .llseek = ll_file_seek,
4542 .splice_read = ll_file_splice_read,
/* both BSD flock and POSIX fcntl locks route through ll_file_flock */
4545 .flock = ll_file_flock,
4546 .lock = ll_file_flock
4549 /* These are for -o noflock - to return ENOSYS on flock calls */
4550 struct file_operations ll_file_operations_noflock = {
4551 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4552 # ifdef HAVE_SYNC_READ_WRITE
4553 .read = new_sync_read,
4554 .write = new_sync_write,
4555 # endif /* HAVE_SYNC_READ_WRITE */
4556 .read_iter = ll_file_read_iter,
4557 .write_iter = ll_file_write_iter,
4558 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4559 .read = ll_file_read,
4560 .aio_read = ll_file_aio_read,
4561 .write = ll_file_write,
4562 .aio_write = ll_file_aio_write,
4563 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4564 .unlocked_ioctl = ll_file_ioctl,
4565 .open = ll_file_open,
4566 .release = ll_file_release,
4567 .mmap = ll_file_mmap,
4568 .llseek = ll_file_seek,
4569 .splice_read = ll_file_splice_read,
/* both lock entry points fail with ENOSYS via ll_file_noflock */
4572 .flock = ll_file_noflock,
4573 .lock = ll_file_noflock
/* inode_operations for regular files; optional entries are gated on the
 * kernel feature macros probed at configure time. */
4576 struct inode_operations ll_file_inode_operations = {
4577 .setattr = ll_setattr,
4578 .getattr = ll_getattr,
4579 .permission = ll_inode_permission,
4580 #ifdef HAVE_IOP_XATTR
4581 .setxattr = ll_setxattr,
4582 .getxattr = ll_getxattr,
4583 .removexattr = ll_removexattr,
4585 .listxattr = ll_listxattr,
4586 .fiemap = ll_fiemap,
4587 #ifdef HAVE_IOP_GET_ACL
4588 .get_acl = ll_get_acl,
4590 #ifdef HAVE_IOP_SET_ACL
4591 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration to the inode's cl_object.  For
 * OBJECT_CONF_SET, also allow the layout lock to be matched and record
 * the new layout generation.
 */
4595 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4597 struct ll_inode_info *lli = ll_i2info(inode);
4598 struct cl_object *obj = lli->lli_clob;
4607 env = cl_env_get(&refcheck);
4609 RETURN(PTR_ERR(env));
4611 rc = cl_conf_set(env, lli->lli_clob, conf);
4615 if (conf->coc_opc == OBJECT_CONF_SET) {
4616 struct ldlm_lock *lock = conf->coc_lock;
4617 struct cl_layout cl = {
4621 LASSERT(lock != NULL);
4622 LASSERT(ldlm_has_layout(lock));
4624 /* it can only be allowed to match after layout is
4625 * applied to inode otherwise false layout would be
4626 * seen. Applying layout should happen before dropping
4627 * the intent lock. */
4628 ldlm_lock_allow_match(lock);
4630 rc = cl_object_layout_get(env, obj, &cl);
4635 DFID": layout version change: %u -> %u\n",
4636 PFID(&lli->lli_fid), ll_layout_version_get(lli),
/* remember the generation so ll_layout_refresh() can detect changes */
4638 ll_layout_version_set(lli, cl.cl_layout_gen);
4642 cl_env_put(env, &refcheck);
4647 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4648 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4651 struct ll_sb_info *sbi = ll_i2sbi(inode);
4652 struct ptlrpc_request *req;
4653 struct mdt_body *body;
4660 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4661 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4662 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated (lock granted right away): nothing to fetch */
4664 if (lock->l_lvb_data != NULL)
4667 /* if layout lock was granted right away, the layout is returned
4668 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4669 * blocked and then granted via completion ast, we have to fetch
4670 * layout here. Please note that we can't use the LVB buffer in
4671 * completion AST because it doesn't have a large enough buffer */
4672 rc = ll_get_default_mdsize(sbi, &lmmsize);
4674 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4675 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4680 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4682 GOTO(out, rc = -EPROTO);
4684 lmmsize = body->mbo_eadatasize;
4685 if (lmmsize == 0) /* empty layout */
4688 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4690 GOTO(out, rc = -EFAULT);
/* copy the LOV EA out of the RPC reply into a buffer owned by the lock */
4692 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4693 if (lvbdata == NULL)
4694 GOTO(out, rc = -ENOMEM);
4696 memcpy(lvbdata, lmm, lmmsize);
4697 lock_res_and_lock(lock);
/* install the buffer only if we did not race with another fetcher */
4698 if (unlikely(lock->l_lvb_data == NULL)) {
4699 lock->l_lvb_type = LVB_T_LAYOUT;
4700 lock->l_lvb_data = lvbdata;
4701 lock->l_lvb_len = lmmsize;
4704 unlock_res_and_lock(lock);
/* lost the race: another thread installed its buffer first, free ours */
4707 OBD_FREE_LARGE(lvbdata, lmmsize);
4712 ptlrpc_req_finished(req);
4717 * Apply the layout to the inode. Layout lock is held and will be released
4720 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4721 struct inode *inode)
4723 struct ll_inode_info *lli = ll_i2info(inode);
4724 struct ll_sb_info *sbi = ll_i2sbi(inode);
4725 struct ldlm_lock *lock;
4726 struct cl_object_conf conf;
4729 bool wait_layout = false;
4732 LASSERT(lustre_handle_is_used(lockh));
4734 lock = ldlm_handle2lock(lockh);
4735 LASSERT(lock != NULL);
4736 LASSERT(ldlm_has_layout(lock));
4738 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4739 PFID(&lli->lli_fid), inode);
4741 /* in case this is a caching lock and reinstate with new inode */
4742 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4744 lock_res_and_lock(lock);
4745 lvb_ready = ldlm_is_lvb_ready(lock);
4746 unlock_res_and_lock(lock);
4748 /* checking lvb_ready is racy but this is okay. The worst case is
4749 * that multi processes may configure the file on the same time. */
/* make sure the layout blob is available in the lock's LVB */
4753 rc = ll_layout_fetch(inode, lock);
4757 /* for layout lock, lmm is stored in lock's lvb.
4758 * lvb_data is immutable if the lock is held so it's safe to access it
4761 * set layout to file. Unlikely this will fail as old layout was
4762 * surely eliminated */
4763 memset(&conf, 0, sizeof conf);
4764 conf.coc_opc = OBJECT_CONF_SET;
4765 conf.coc_inode = inode;
4766 conf.coc_lock = lock;
4767 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4768 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4769 rc = ll_layout_conf(inode, &conf);
4771 /* refresh layout failed, need to wait */
4772 wait_layout = rc == -EBUSY;
4775 LDLM_LOCK_PUT(lock);
4776 ldlm_lock_decref(lockh, mode);
4778 /* wait for IO to complete if it's still being used. */
4780 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4781 ll_get_fsname(inode->i_sb, NULL, 0),
4782 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO using the old layout drains */
4784 memset(&conf, 0, sizeof conf);
4785 conf.coc_opc = OBJECT_CONF_WAIT;
4786 conf.coc_inode = inode;
4787 rc = ll_layout_conf(inode, &conf);
4791 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4792 ll_get_fsname(inode->i_sb, NULL, 0),
4793 PFID(&lli->lli_fid), rc);
4799 * Issue layout intent RPC to MDS.
4800 * \param inode [in] file inode
4801 * \param intent [in] layout intent
4803 * \retval 0 on success
4804 * \retval < 0 error code
4806 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4808 struct ll_inode_info *lli = ll_i2info(inode);
4809 struct ll_sb_info *sbi = ll_i2sbi(inode);
4810 struct md_op_data *op_data;
4811 struct lookup_intent it;
4812 struct ptlrpc_request *req;
4816 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4817 0, 0, LUSTRE_OPC_ANY, NULL);
4818 if (IS_ERR(op_data))
4819 RETURN(PTR_ERR(op_data));
/* the layout intent travels as opaque op_data alongside the IT_LAYOUT it */
4821 op_data->op_data = intent;
4822 op_data->op_data_size = sizeof(*intent);
4824 memset(&it, 0, sizeof(it));
4825 it.it_op = IT_LAYOUT;
/* write/truncate intents need FMODE_WRITE so MDS instantiates components */
4826 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4827 intent->li_opc == LAYOUT_INTENT_TRUNC)
4828 it.it_flags = FMODE_WRITE;
4830 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4831 ll_get_fsname(inode->i_sb, NULL, 0),
4832 PFID(&lli->lli_fid), inode);
4834 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4835 &ll_md_blocking_ast, 0);
4836 if (it.it_request != NULL)
4837 ptlrpc_req_finished(it.it_request);
4838 it.it_request = NULL;
4840 ll_finish_md_op_data(op_data);
4842 /* set lock data in case this is a new lock */
4844 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4846 ll_intent_drop_lock(&it);
4852 * This function checks if there exists a LAYOUT lock on the client side,
4853 * or enqueues it if it doesn't have one in cache.
4855 * This function will not hold layout lock so it may be revoked any time after
4856 * this function returns. Any operations depend on layout should be redone
4859 * This function should be called before lov_io_init() to get an uptodate
4860 * layout version, the caller should save the version number and after IO
4861 * is finished, this function should be called again to verify that layout
4862 * is not changed during IO time.
4864 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4866 struct ll_inode_info *lli = ll_i2info(inode);
4867 struct ll_sb_info *sbi = ll_i2sbi(inode);
4868 struct lustre_handle lockh;
4869 struct layout_intent intent = {
4870 .li_opc = LAYOUT_INTENT_ACCESS,
4872 enum ldlm_mode mode;
/* fast path: a valid generation means the cached layout is current */
4876 *gen = ll_layout_version_get(lli);
4877 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4881 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4882 LASSERT(S_ISREG(inode->i_mode));
4884 /* take layout lock mutex to enqueue layout lock exclusively. */
4885 mutex_lock(&lli->lli_layout_mutex);
4888 /* mostly layout lock is caching on the local side, so try to
4889 * match it before grabbing layout lock mutex. */
4890 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4891 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4892 if (mode != 0) { /* hit cached lock */
4893 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue a fresh layout lock via an intent RPC */
4899 rc = ll_layout_intent(inode, &intent);
4905 *gen = ll_layout_version_get(lli);
4906 mutex_unlock(&lli->lli_layout_mutex);
4912 * Issue layout intent RPC indicating where in a file an IO is about to write.
4914 * \param[in] inode file inode.
4915 * \param[in] ext write range with start offset of file in bytes where
4916 * an IO is about to write, and exclusive end offset in
4919 * \retval 0 on success
4920 * \retval < 0 error code
4922 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
4923 struct lu_extent *ext)
/* wrap the extent in a layout_intent and reuse the common intent path */
4925 struct layout_intent intent = {
4927 .li_extent.e_start = ext->e_start,
4928 .li_extent.e_end = ext->e_end,
4933 rc = ll_layout_intent(inode, &intent);
4939 * This function send a restore request to the MDT
4941 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4943 struct hsm_user_request *hur;
4947 len = sizeof(struct hsm_user_request) +
4948 sizeof(struct hsm_user_item);
4949 OBD_ALLOC(hur, len);
4953 hur->hur_request.hr_action = HUA_RESTORE;
4954 hur->hur_request.hr_archive_id = 0;
4955 hur->hur_request.hr_flags = 0;
4956 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4957 sizeof(hur->hur_user_item[0].hui_fid));
4958 hur->hur_user_item[0].hui_extent.offset = offset;
4959 hur->hur_user_item[0].hui_extent.length = length;
4960 hur->hur_request.hr_itemcount = 1;
4961 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,