4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open struct ll_file_data from the ll_file_data_slab cache.
 * GFP_NOFS is used to avoid filesystem recursion during memory reclaim.
 * NOTE(review): the NULL check on the allocation and the return statement
 * fall outside this excerpt — confirm against the full source.
 */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-failure state. */
75 fd->fd_write_failed = false;
/* Return a struct ll_file_data to the ll_file_data_slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
/* Fill generic fields of op_data for this inode; no child name needed. */
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client-side inode attributes so the MDT can merge them. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* The open handle being closed. */
107 op_data->op_handle = och->och_fh;
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Sanity: the MDC export must still be connected to an OBD device. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing.  NOTE(review): the switch statement and break
 * lines are elided from this excerpt; MERGE appears to fall through to
 * the SPLIT/SWAP case — confirm against the full source. */
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
/* For SPLIT, @data is a struct split_param; otherwise it is the peer inode. */
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
/* Account blocks for each resynced mirror copy. */
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
/* @data carries the data version captured before release. */
191 op_data->op_bias |= MDS_HSM_RELEASE;
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
198 LASSERT(data == NULL);
/* If size/blocks were not packed above, tell the MDT to take the
 * "lazy" values it already has instead of the client snapshot. */
202 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
203 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
204 if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
205 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
207 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
208 if (rc != 0 && rc != -EINTR)
209 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
210 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On success, verify the server actually executed the close intent. */
212 if (rc == 0 && op_data->op_bias & bias) {
213 struct mdt_body *body;
215 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
216 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
220 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
224 md_clear_open_replay_data(md_exp, och);
225 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
228 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the MDS open handle for the given open mode (read/write/exec) if no
 * other local users still reference it.  Returns 0 or a negative errno from
 * ll_close_inode_openhandle().
 */
232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
234 struct ll_inode_info *lli = ll_i2info(inode);
235 struct obd_client_handle **och_p;
236 struct obd_client_handle *och;
/* Pick the handle/usecount pair matching the open mode. */
241 if (fmode & FMODE_WRITE) {
242 och_p = &lli->lli_mds_write_och;
243 och_usecount = &lli->lli_open_fd_write_count;
244 } else if (fmode & FMODE_EXEC) {
245 och_p = &lli->lli_mds_exec_och;
246 och_usecount = &lli->lli_open_fd_exec_count;
248 LASSERT(fmode & FMODE_READ);
249 och_p = &lli->lli_mds_read_och;
250 och_usecount = &lli->lli_open_fd_read_count;
253 mutex_lock(&lli->lli_och_mutex);
254 if (*och_usecount > 0) {
255 /* There are still users of this handle, so skip
257 mutex_unlock(&lli->lli_och_mutex);
263 mutex_unlock(&lli->lli_och_mutex);
266 /* There might be a race and this handle may already
268 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-struct-file close: release group lock and lease if held, drop this
 * file's reference on the shared MDS open handle, and free fd.  If a
 * cached OPEN DLM lock still covers the file, the RPC to the MDS is
 * skipped (md_lock_match with LDLM_FL_TEST_LOCK below).
 */
274 static int ll_md_close(struct inode *inode, struct file *file)
276 union ldlm_policy_data policy = {
277 .l_inodebits = { MDS_INODELOCK_OPEN },
279 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
280 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lustre_handle lockh;
283 enum ldlm_mode lockmode;
287 /* clear group lock, if present */
288 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
289 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
291 if (fd->fd_lease_och != NULL) {
294 /* Usually the lease is not released when the
295 * application crashed, we need to release here. */
296 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
297 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
298 PFID(&lli->lli_fid), rc, lease_broken);
300 fd->fd_lease_och = NULL;
/* A private open handle taken over for a lease must be closed too. */
303 if (fd->fd_och != NULL) {
304 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
309 /* Let's see if we have good enough OPEN lock on the file and if
310 we can skip talking to MDS */
311 mutex_lock(&lli->lli_och_mutex);
/* Drop this fd's count against the matching open mode. */
312 if (fd->fd_omode & FMODE_WRITE) {
314 LASSERT(lli->lli_open_fd_write_count);
315 lli->lli_open_fd_write_count--;
316 } else if (fd->fd_omode & FMODE_EXEC) {
318 LASSERT(lli->lli_open_fd_exec_count);
319 lli->lli_open_fd_exec_count--;
322 LASSERT(lli->lli_open_fd_read_count);
323 lli->lli_open_fd_read_count--;
325 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock: must do the real close RPC now. */
327 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
328 LDLM_IBITS, &policy, lockmode, &lockh))
329 rc = ll_md_real_close(inode, fd->fd_omode);
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
338 /* While this returns an error code, fput() the caller does not, so we need
339 * to make every effort to clean up all of our state here. Also, applications
340 * rarely check close errors and even if an error is returned they will not
341 * re-try the close call.
343 int ll_file_release(struct inode *inode, struct file *file)
345 struct ll_file_data *fd;
346 struct ll_sb_info *sbi = ll_i2sbi(inode);
347 struct ll_inode_info *lli = ll_i2info(inode);
351 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
352 PFID(ll_inode2fid(inode)), inode);
/* Don't account releases of the filesystem root in stats. */
354 if (inode->i_sb->s_root != file_dentry(file))
355 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
356 fd = LUSTRE_FPRIVATE(file);
359 /* The last ref on @file, maybe not the the owner pid of statahead,
360 * because parent and child process can share the same file handle. */
361 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
362 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing was opened on the MDS, just free local state. */
364 if (inode->i_sb->s_root == file_dentry(file)) {
365 LUSTRE_FPRIVATE(file) = NULL;
366 ll_file_data_put(fd);
/* For regular files, surface any async write errors recorded on the
 * cl_object so close() can report them. */
370 if (!S_ISDIR(inode->i_mode)) {
371 if (lli->lli_clob != NULL)
372 lov_read_and_clear_async_rc(lli->lli_clob);
373 lli->lli_async_rc = 0;
376 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on close. */
378 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
379 libcfs_debug_dumplog();
/*
 * Send an intent-open RPC to the MDS for dentry @de, using the supplied
 * layout buffer (@lmm/@lmmsize) and intent @itp.  On success the inode is
 * (re)initialized from the reply and the returned lock data is attached.
 */
384 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
385 struct lookup_intent *itp)
387 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
388 struct dentry *parent = de->d_parent;
389 const char *name = NULL;
391 struct md_op_data *op_data;
392 struct ptlrpc_request *req = NULL;
396 LASSERT(parent != NULL);
397 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
399 /* if server supports open-by-fid, or file name is invalid, don't pack
400 * name in open request */
401 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
402 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
403 name = de->d_name.name;
404 len = de->d_name.len;
407 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
408 name, len, 0, LUSTRE_OPC_ANY, NULL);
410 RETURN(PTR_ERR(op_data));
411 op_data->op_data = lmm;
412 op_data->op_data_size = lmmsize;
414 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
415 &ll_md_blocking_ast, 0);
416 ll_finish_md_op_data(op_data);
418 /* reason for keep own exit path - don`t flood log
419 * with messages with -ESTALE errors.
/* If the open wasn't executed (or failed), drop any handle we got. */
421 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
422 it_open_error(DISP_OPEN_OPEN, itp))
424 ll_release_openhandle(de, itp);
428 if (it_disposition(itp, DISP_LOOKUP_NEG))
429 GOTO(out, rc = -ENOENT);
431 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
432 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
433 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and attach the granted lock. */
437 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
438 if (!rc && itp->it_lock_mode)
439 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
442 ptlrpc_req_finished(req);
443 ll_intent_drop_lock(itp);
445 /* We did open by fid, but by the time we got to the server,
446 * the object disappeared. If this is a create, we cannot really
447 * tell the userspace that the file it was trying to create
448 * does not exist. Instead let's return -ESTALE, and the VFS will
449 * retry the create with LOOKUP_REVAL that we are going to catch
450 * in ll_revalidate_dentry() and use lookup then.
452 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Fill an obd_client_handle from the MDT reply body attached to @it and
 * register it for open replay.  Returns md_set_open_replay_data()'s rc.
 */
458 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
459 struct obd_client_handle *och)
461 struct mdt_body *body;
463 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
464 och->och_fh = body->mbo_handle;
465 och->och_fid = body->mbo_fid1;
/* Lease handle doubles as the lock handle granted for this open. */
466 och->och_lease_handle.cookie = it->it_lock_handle;
467 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
468 och->och_flags = it->it_flags;
470 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish an open locally: optionally fill @och from the intent reply,
 * then attach @fd to the struct file and initialize readahead and the
 * cl-context bookkeeping.  @och may be NULL when no MDS handle is kept.
 */
473 static int ll_local_open(struct file *file, struct lookup_intent *it,
474 struct ll_file_data *fd, struct obd_client_handle *och)
476 struct inode *inode = file_inode(file);
479 LASSERT(!LUSTRE_FPRIVATE(file));
486 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
491 LUSTRE_FPRIVATE(file) = fd;
492 ll_readahead_init(inode, &fd->fd_ras);
/* Record only the access-mode bits of the open for later close logic. */
493 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
495 /* ll_cl_context initialize */
496 rwlock_init(&fd->fd_lock);
497 INIT_LIST_HEAD(&fd->fd_lccs);
502 /* Open a file, and (for the very first open) create objects on the OSTs at
503 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
504 * creation or open until ll_lov_setstripe() ioctl is called.
506 * If we already have the stripe MD locally then we don't request it in
507 * md_open(), by passing a lmm_size = 0.
509 * It is up to the application to ensure no other processes open this file
510 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
511 * used. We might be able to avoid races of that sort by getting lli_open_sem
512 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
513 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
515 int ll_file_open(struct inode *inode, struct file *file)
517 struct ll_inode_info *lli = ll_i2info(inode);
518 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
519 .it_flags = file->f_flags };
520 struct obd_client_handle **och_p = NULL;
521 __u64 *och_usecount = NULL;
522 struct ll_file_data *fd;
526 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
527 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent prepared during lookup may be stashed in private_data. */
529 it = file->private_data; /* XXX: compat macro */
530 file->private_data = NULL; /* prevent ll_local_open assertion */
532 fd = ll_file_data_get();
534 GOTO(out_nofiledata, rc = -ENOMEM);
537 if (S_ISDIR(inode->i_mode))
538 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open needed, just attach fd and return. */
540 if (inode->i_sb->s_root == file_dentry(file)) {
541 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent in oit. */
545 if (!it || !it->it_disposition) {
546 /* Convert f_flags into access mode. We cannot use file->f_mode,
547 * because everything but O_ACCMODE mask was stripped from
549 if ((oit.it_flags + 1) & O_ACCMODE)
551 if (file->f_flags & O_TRUNC)
552 oit.it_flags |= FMODE_WRITE;
554 /* kernel only call f_op->open in dentry_open. filp_open calls
555 * dentry_open after call to open_namei that checks permissions.
556 * Only nfsd_open call dentry_open directly without checking
557 * permissions and because of that this code below is safe. */
558 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
559 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561 /* We do not want O_EXCL here, presumably we opened the file
562 * already? XXX - NFS implications? */
563 oit.it_flags &= ~O_EXCL;
565 /* bug20584, if "it_flags" contains O_CREAT, the file will be
566 * created if necessary, then "IT_CREAT" should be set to keep
567 * consistent with it */
568 if (oit.it_flags & O_CREAT)
569 oit.it_op |= IT_CREAT;
575 /* Let's see if we have file open on MDS already. */
576 if (it->it_flags & FMODE_WRITE) {
577 och_p = &lli->lli_mds_write_och;
578 och_usecount = &lli->lli_open_fd_write_count;
579 } else if (it->it_flags & FMODE_EXEC) {
580 och_p = &lli->lli_mds_exec_och;
581 och_usecount = &lli->lli_open_fd_exec_count;
583 och_p = &lli->lli_mds_read_och;
584 och_usecount = &lli->lli_open_fd_read_count;
587 mutex_lock(&lli->lli_och_mutex);
588 if (*och_p) { /* Open handle is present */
589 if (it_disposition(it, DISP_OPEN_OPEN)) {
590 /* Well, there's extra open request that we do not need,
591 let's close it somehow. This will decref request. */
592 rc = it_open_error(DISP_OPEN_OPEN, it);
594 mutex_unlock(&lli->lli_och_mutex);
595 GOTO(out_openerr, rc);
598 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS handle; no och is passed to local open. */
602 rc = ll_local_open(file, it, fd, NULL);
605 mutex_unlock(&lli->lli_och_mutex);
606 GOTO(out_openerr, rc);
609 LASSERT(*och_usecount == 0);
610 if (!it->it_disposition) {
611 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
612 /* We cannot just request lock handle now, new ELC code
613 means that one of other OPEN locks for this file
614 could be cancelled, and since blocking ast handler
615 would attempt to grab och_mutex as well, that would
616 result in a deadlock */
617 mutex_unlock(&lli->lli_och_mutex);
619 * Normally called under two situations:
621 * 2. A race/condition on MDS resulting in no open
622 * handle to be returned from LOOKUP|OPEN request,
623 * for example if the target entry was a symlink.
625 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
626 * marked by a bit set in ll_iget_for_nfs. Clear the
627 * bit so that it's not confusing later callers.
629 * NB; when ldd is NULL, it must have come via normal
630 * lookup path only, since ll_iget_for_nfs always calls
633 if (ldd && ldd->lld_nfs_dentry) {
634 ldd->lld_nfs_dentry = 0;
635 it->it_flags |= MDS_OPEN_LOCK;
639 * Always specify MDS_OPEN_BY_FID because we don't want
640 * to get file with different fid.
642 it->it_flags |= MDS_OPEN_BY_FID;
643 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
646 GOTO(out_openerr, rc);
/* First opener in this mode: allocate the shared MDS handle. */
650 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
652 GOTO(out_och_free, rc = -ENOMEM);
656 /* md_intent_lock() didn't get a request ref if there was an
657 * open error, so don't do cleanup on the request here
659 /* XXX (green): Should not we bail out on any error here, not
660 * just open error? */
661 rc = it_open_error(DISP_OPEN_OPEN, it);
663 GOTO(out_och_free, rc);
665 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
666 "inode %p: disposition %x, status %d\n", inode,
667 it_disposition(it, ~0), it->it_status);
669 rc = ll_local_open(file, it, fd, *och_p);
671 GOTO(out_och_free, rc);
673 mutex_unlock(&lli->lli_och_mutex);
676 /* Must do this outside lli_och_mutex lock to prevent deadlock where
677 different kind of OPEN lock for this same inode gets cancelled
678 by ldlm_cancel_lru */
679 if (!S_ISREG(inode->i_mode))
680 GOTO(out_och_free, rc);
682 cl_lov_delay_create_clear(&file->f_flags);
683 GOTO(out_och_free, rc);
/* Error path: free the handle slot (OBD_FREE poisons it, so NULL it). */
687 if (och_p && *och_p) {
688 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
689 *och_p = NULL; /* OBD_FREE writes some magic there */
692 mutex_unlock(&lli->lli_och_mutex);
695 if (lli->lli_opendir_key == fd)
696 ll_deauthorize_statahead(inode, fd);
698 ll_file_data_put(fd);
700 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken for DISP_ENQ_OPEN_REF. */
704 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
705 ptlrpc_req_finished(it->it_request);
706 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (the application detects the broken lease later).
 * NOTE(review): the CANCELING branch body is outside this excerpt.
 */
712 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
713 struct ldlm_lock_desc *desc, void *data, int flag)
716 struct lustre_handle lockh;
720 case LDLM_CB_BLOCKING:
721 ldlm_lock2handle(lock, &lockh);
722 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
724 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
728 case LDLM_CB_CANCELING:
736 * When setting a lease on a file, we take ownership of the lli_mds_*_och
737 * and save it as fd->fd_och so as to force client to reopen the file even
738 * if it has an open lock in cache already.
740 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
741 struct lustre_handle *old_handle)
743 struct ll_inode_info *lli = ll_i2info(inode);
744 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
745 struct obd_client_handle **och_p;
750 /* Get the openhandle of the file */
751 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per struct file. */
752 if (fd->fd_lease_och != NULL)
753 GOTO(out_unlock, rc = -EBUSY);
755 if (fd->fd_och == NULL) {
756 if (file->f_mode & FMODE_WRITE) {
757 LASSERT(lli->lli_mds_write_och != NULL);
758 och_p = &lli->lli_mds_write_och;
759 och_usecount = &lli->lli_open_fd_write_count;
761 LASSERT(lli->lli_mds_read_och != NULL);
762 och_p = &lli->lli_mds_read_och;
763 och_usecount = &lli->lli_open_fd_read_count;
/* Handle shared with other opens: cannot take ownership. */
766 if (*och_usecount > 1)
767 GOTO(out_unlock, rc = -EBUSY);
/* Report the existing open handle so the MDT can match the owner. */
774 *old_handle = fd->fd_och->och_fh;
778 mutex_unlock(&lli->lli_och_mutex);
783 * Release ownership on lli_mds_*_och when putting back a file lease.
785 static int ll_lease_och_release(struct inode *inode, struct file *file)
787 struct ll_inode_info *lli = ll_i2info(inode);
788 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
789 struct obd_client_handle **och_p;
790 struct obd_client_handle *old_och = NULL;
795 mutex_lock(&lli->lli_och_mutex);
796 if (file->f_mode & FMODE_WRITE) {
797 och_p = &lli->lli_mds_write_och;
798 och_usecount = &lli->lli_open_fd_write_count;
800 och_p = &lli->lli_mds_read_och;
801 och_usecount = &lli->lli_open_fd_read_count;
804 /* The file may have been open by another process (broken lease) so
805 * *och_p is not NULL. In this case we should simply increase usecount
808 if (*och_p != NULL) {
809 old_och = fd->fd_och;
816 mutex_unlock(&lli->lli_och_mutex);
/* The displaced handle (if any) is closed outside the mutex. */
819 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
825 * Acquire a lease and open the file.
827 static struct obd_client_handle *
828 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
831 struct lookup_intent it = { .it_op = IT_OPEN };
832 struct ll_sb_info *sbi = ll_i2sbi(inode);
833 struct md_op_data *op_data;
834 struct ptlrpc_request *req = NULL;
835 struct lustre_handle old_handle = { 0 };
836 struct obd_client_handle *och = NULL;
/* Leases are exactly read or exactly write, never both or exec. */
841 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
842 RETURN(ERR_PTR(-EINVAL));
845 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
846 RETURN(ERR_PTR(-EPERM));
848 rc = ll_lease_och_acquire(inode, file, &old_handle);
855 RETURN(ERR_PTR(-ENOMEM));
857 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
858 LUSTRE_OPC_ANY, NULL);
860 GOTO(out, rc = PTR_ERR(op_data));
862 /* To tell the MDT this openhandle is from the same owner */
863 op_data->op_handle = old_handle;
865 it.it_flags = fmode | open_flags;
866 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
867 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
868 &ll_md_blocking_lease_ast,
869 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
870 * it can be cancelled which may mislead applications that the lease is
872 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
873 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
874 * doesn't deal with openhandle, so normal openhandle will be leaked. */
875 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
876 ll_finish_md_op_data(op_data);
877 ptlrpc_req_finished(req);
879 GOTO(out_release_it, rc);
881 if (it_disposition(&it, DISP_LOOKUP_NEG))
882 GOTO(out_release_it, rc = -ENOENT);
884 rc = it_open_error(DISP_OPEN_OPEN, &it);
886 GOTO(out_release_it, rc);
888 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
889 ll_och_fill(sbi->ll_md_exp, &it, och);
891 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
892 GOTO(out_close, rc = -EOPNOTSUPP);
894 /* already get lease, handle lease lock */
895 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* A lease must come with an OPEN-bit inodebits lock; anything else
 * is a protocol violation. */
896 if (it.it_lock_mode == 0 ||
897 it.it_lock_bits != MDS_INODELOCK_OPEN) {
898 /* open lock must return for lease */
899 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
900 PFID(ll_inode2fid(inode)), it.it_lock_mode,
902 GOTO(out_close, rc = -EPROTO);
905 ll_intent_release(&it);
909 /* Cancel open lock */
910 if (it.it_lock_mode != 0) {
911 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
914 och->och_lease_handle.cookie = 0ULL;
916 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
918 CERROR("%s: error closing file "DFID": %d\n",
919 ll_get_fsname(inode->i_sb, NULL, 0),
920 PFID(&ll_i2info(inode)->lli_fid), rc2);
921 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
923 ll_intent_release(&it);
931 * Check whether a layout swap can be done between two inodes.
933 * \param[in] inode1 First inode to check
934 * \param[in] inode2 Second inode to check
936 * \retval 0 on success, layout swap can be performed between both inodes
937 * \retval negative error code if requirements are not met
939 static int ll_check_swap_layouts_validity(struct inode *inode1,
940 struct inode *inode2)
/* Both must be regular files. */
942 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller must have write permission on both. */
945 if (inode_permission(inode1, MAY_WRITE) ||
946 inode_permission(inode2, MAY_WRITE))
/* Both must live in the same filesystem. */
949 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och on @inode with the MDS_CLOSE_LAYOUT_SWAP bias so the MDT
 * swaps layouts between @inode and @inode2 atomically with the close.
 */
955 static int ll_swap_layouts_close(struct obd_client_handle *och,
956 struct inode *inode, struct inode *inode2)
958 const struct lu_fid *fid1 = ll_inode2fid(inode);
959 const struct lu_fid *fid2;
963 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
964 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
966 rc = ll_check_swap_layouts_validity(inode, inode2);
968 GOTO(out_free_och, rc);
970 /* We now know that inode2 is a lustre inode */
971 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is meaningless. */
973 rc = lu_fid_cmp(fid1, fid2);
975 GOTO(out_free_och, rc = -EINVAL);
977 /* Close the file and {swap,merge} layouts between inode & inode2.
978 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
979 * because we still need it to pack l_remote_handle to MDT. */
980 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
983 och = NULL; /* freed in ll_close_inode_openhandle() */
993 * Release lease and close the file.
994 * It will check if the lease has ever broken.
996 static int ll_lease_close_intent(struct obd_client_handle *och,
998 bool *lease_broken, enum mds_op_bias bias,
1001 struct ldlm_lock *lock;
1002 bool cancelled = true;
/* Look up the lease lock to see whether it was already cancelled
 * (i.e. the lease was broken by a conflicting access). */
1006 lock = ldlm_handle2lock(&och->och_lease_handle);
1008 lock_res_and_lock(lock);
1009 cancelled = ldlm_is_cancel(lock);
1010 unlock_res_and_lock(lock);
1011 LDLM_LOCK_PUT(lock);
1014 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1015 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1017 if (lease_broken != NULL)
1018 *lease_broken = cancelled;
/* Intact lease with no intent: cancel it ourselves now. */
1020 if (!cancelled && !bias)
1021 ldlm_cli_cancel(&och->och_lease_handle, 0);
1023 if (cancelled) { /* no need to excute intent */
1028 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: ll_lease_close_intent() with no bias/data. */
1032 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1035 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1039 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1041 static int ll_lease_file_resync(struct obd_client_handle *och,
1042 struct inode *inode)
1044 struct ll_sb_info *sbi = ll_i2sbi(inode);
1045 struct md_op_data *op_data;
1046 __u64 data_version_unused;
1050 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1051 LUSTRE_OPC_ANY, NULL);
1052 if (IS_ERR(op_data))
1053 RETURN(PTR_ERR(op_data));
1055 /* before starting file resync, it's necessary to clean up page cache
1056 * in client memory, otherwise once the layout version is increased,
1057 * writing back cached data will be denied the OSTs. */
1058 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* Identify ourselves to the MDT via the lease handle. */
1062 op_data->op_handle = och->och_lease_handle;
1063 rc = md_file_resync(sbi->ll_md_exp, op_data);
1069 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided attributes cached in lli with OST attributes obtained
 * through cl_object_attr_get(): take the newest of each timestamp and adopt
 * the OST-reported size/blocks, all under the inode size lock.
 */
1073 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1075 struct ll_inode_info *lli = ll_i2info(inode);
1076 struct cl_object *obj = lli->lli_clob;
1077 struct cl_attr *attr = vvp_env_thread_attr(env);
1085 ll_inode_size_lock(inode);
1087 /* Merge timestamps the most recently obtained from MDS with
1088 * timestamps obtained from OSTs.
1090 * Do not overwrite atime of inode because it may be refreshed
1091 * by file_accessed() function. If the read was served by cache
1092 * data, there is no RPC to be sent so that atime may not be
1093 * transferred to OSTs at all. MDT only updates atime at close time
1094 * if it's at least 'mdd.*.atime_diff' older.
1095 * All in all, the atime in Lustre does not strictly comply with
1096 * POSIX. Solving this problem needs to send an RPC to MDT for each
1097 * read, this will hurt performance. */
1098 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1099 LTIME_S(inode->i_atime) = lli->lli_atime;
1100 lli->lli_update_atime = 0;
1102 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1103 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Work on local copies while comparing with the OST attributes. */
1105 atime = LTIME_S(inode->i_atime);
1106 mtime = LTIME_S(inode->i_mtime);
1107 ctime = LTIME_S(inode->i_ctime);
1109 cl_object_attr_lock(obj);
1110 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1113 rc = cl_object_attr_get(env, obj, attr);
1114 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for the caller. */
1117 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1119 if (atime < attr->cat_atime)
1120 atime = attr->cat_atime;
1122 if (ctime < attr->cat_ctime)
1123 ctime = attr->cat_ctime;
1125 if (mtime < attr->cat_mtime)
1126 mtime = attr->cat_mtime;
1128 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1129 PFID(&lli->lli_fid), attr->cat_size);
1131 i_size_write(inode, attr->cat_size);
1132 inode->i_blocks = attr->cat_blocks;
1134 LTIME_S(inode->i_atime) = atime;
1135 LTIME_S(inode->i_mtime) = mtime;
1136 LTIME_S(inode->i_ctime) = ctime;
1139 ll_inode_size_unlock(inode);
1145 * Set designated mirror for I/O.
1147 * So far only read, write, and truncated can support to issue I/O to
1148 * designated mirror.
1150 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1152 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1154 /* clear layout version for generic(non-resync) I/O in case it carries
1155 * stale layout version due to I/O restart */
1156 io->ci_layout_version = 0;
1158 /* FLR: disable non-delay for designated mirror I/O because obviously
1159 * only one mirror is available */
1160 if (fd->fd_designated_mirror > 0) {
1162 io->ci_designated_mirror = fd->fd_designated_mirror;
1163 io->ci_layout_version = fd->fd_layout_version;
1164 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1168 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1169 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether atime updates should be suppressed for this file,
 * checking open flags, inode flags, and mount options in turn.
 * NOTE(review): the per-condition return statements are elided here.
 */
1172 static bool file_is_noatime(const struct file *file)
1174 const struct vfsmount *mnt = file->f_path.mnt;
1175 const struct inode *inode = file_inode((struct file *)file);
1177 /* Adapted from file_accessed() and touch_atime().*/
1178 if (file->f_flags & O_NOATIME)
1181 if (inode->i_flags & S_NOATIME)
1184 if (IS_NOATIME(inode))
1187 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1190 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1193 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1199 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: set up the rw iocb,
 * locking policy (never / mandatory / maybe), atime suppression, optional
 * parallel I/O, and FLR mirror selection.
 */
1201 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1203 struct inode *inode = file_inode(file);
1204 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1206 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1207 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1208 io->u.ci_rw.rw_file = file;
1209 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1210 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1211 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1213 if (iot == CIT_WRITE) {
1214 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1215 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1216 file->f_flags & O_DIRECT ||
1219 io->ci_obj = ll_i2info(inode)->lli_clob;
1220 io->ci_lockreq = CILR_MAYBE;
/* nolock mount option disables DLM locking entirely for this file. */
1221 if (ll_file_nolock(file)) {
1222 io->ci_lockreq = CILR_NEVER;
1223 io->ci_no_srvlock = 1;
1224 } else if (file->f_flags & O_APPEND) {
1225 io->ci_lockreq = CILR_MANDATORY;
1227 io->ci_noatime = file_is_noatime(file);
/* Parallel I/O only when enabled on the superblock and not appending. */
1228 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1229 io->ci_pio = !io->u.ci_rw.rw_append;
1233 /* FLR: only use non-delay I/O for read as there is only one
1234 * avaliable mirror for write. */
1235 io->ci_ndelay = !(iot == CIT_WRITE);
1237 ll_io_set_mirror(io, file);
/*
 * Parallel-task worker for split (PIO) I/O: runs one sub-range of a
 * larger read/write described by the cl_io_pt in @ptask->pt_cbdata.
 * Builds its own cl_io from the saved iter/iocb, runs cl_io_loop(),
 * then accounts progress (cip_result, advanced iter, updated kiocb)
 * back into the shared cl_io_pt.
 * NOTE(review): interior lines were elided by extraction; error-path
 * labels and some declarations are not visible here.
 */
1240 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1242 struct cl_io_pt *pt = ptask->pt_cbdata;
1243 struct file *file = pt->cip_file;
1246 loff_t pos = pt->cip_pos;
1251 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1252 file_dentry(file)->d_name.name,
1253 pt->cip_iot == CIT_READ ? "read" : "write",
1254 pos, pos + pt->cip_count);
1256 env = cl_env_get(&refcheck);
1258 RETURN(PTR_ERR(env));
1260 io = vvp_env_thread_io(env);
1261 ll_io_init(io, file, pt->cip_iot);
1262 io->u.ci_rw.rw_iter = pt->cip_iter;
1263 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1264 io->ci_pio = 0; /* It's already in parallel task */
1266 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1267 pt->cip_count - pt->cip_result);
1269 struct vvp_io *vio = vvp_env_io(env);
1271 vio->vui_io_subtype = IO_NORMAL;
1272 vio->vui_fd = LUSTRE_FPRIVATE(file);
1274 ll_cl_add(file, env, io, LCC_RW);
1275 rc = cl_io_loop(env, io);
1276 ll_cl_remove(file, env);
1278 /* cl_io_rw_init() handled IO */
/* fault-injection hook for testing partial ptask failure */
1282 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1288 if (io->ci_nob > 0) {
1289 pt->cip_result += io->ci_nob;
1290 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1292 pt->cip_iocb.ki_pos = pos;
1293 #ifdef HAVE_KIOCB_KI_LEFT
1294 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1295 #elif defined(HAVE_KI_NBYTES)
1296 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1300 cl_io_fini(env, io);
1301 cl_env_put(env, &refcheck);
1303 pt->cip_need_restart = io->ci_need_restart;
1305 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1306 file_dentry(file)->d_name.name,
1307 pt->cip_iot == CIT_READ ? "read" : "write",
1308 pt->cip_result, rc);
1310 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for all llite reads and writes (normal and splice).
 * Initializes a cl_io, takes the per-inode range lock where required
 * (writes, and O_DIRECT reads — see LU-6227), runs cl_io_loop(), and
 * restarts the whole I/O while the layout changes underneath
 * (io->ci_need_restart), preserving the FLR mirror-retry count.
 * On the way out it updates *ppos, the caller's iov_iter/kiocb, and
 * the read/write byte statistics.
 * NOTE(review): extraction elided many interior lines (declarations of
 * pos/io/result/rc, loop construct, labels, case labels); treat the
 * visible statement sequence as incomplete.
 */
1314 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1315 struct file *file, enum cl_io_type iot,
1316 loff_t *ppos, size_t count)
1318 struct range_lock range;
1319 struct vvp_io *vio = vvp_env_io(env);
1320 struct inode *inode = file_inode(file);
1321 struct ll_inode_info *lli = ll_i2info(inode);
1322 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1327 unsigned retried = 0;
1328 bool restarted = false;
1332 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1333 file_dentry(file)->d_name.name,
1334 iot == CIT_READ ? "read" : "write", pos, pos + count);
1337 io = vvp_env_thread_io(env);
1338 ll_io_init(io, file, iot);
1339 if (args->via_io_subtype == IO_NORMAL) {
1340 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1341 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1343 if (args->via_io_subtype != IO_NORMAL || restarted)
1345 io->ci_ndelay_tried = retried;
1347 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1348 bool range_locked = false;
/* O_APPEND must lock to EOF since the final position is unknown */
1350 if (file->f_flags & O_APPEND)
1351 range_lock_init(&range, 0, LUSTRE_EOF);
1353 range_lock_init(&range, pos, pos + count - 1);
1355 vio->vui_fd = LUSTRE_FPRIVATE(file);
1356 vio->vui_io_subtype = args->via_io_subtype;
1358 switch (vio->vui_io_subtype) {
1360 /* Direct IO reads must also take range lock,
1361 * or multiple reads will try to work on the same pages
1362 * See LU-6227 for details. */
1363 if (((iot == CIT_WRITE) ||
1364 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1365 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1366 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1368 rc = range_lock(&lli->lli_write_tree, &range);
1372 range_locked = true;
1376 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1377 vio->u.splice.vui_flags = args->u.splice.via_flags;
1380 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1384 ll_cl_add(file, env, io, LCC_RW);
/* PIO writes take the inode lock here instead of inside cl_io */
1385 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1386 !lli->lli_inode_locked) {
1388 lli->lli_inode_locked = 1;
1390 rc = cl_io_loop(env, io);
1391 if (lli->lli_inode_locked) {
1392 lli->lli_inode_locked = 0;
1393 inode_unlock(inode);
1395 ll_cl_remove(file, env);
1398 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1400 range_unlock(&lli->lli_write_tree, &range);
1403 /* cl_io_rw_init() handled IO */
1407 if (io->ci_nob > 0) {
1408 result += io->ci_nob;
1409 count -= io->ci_nob;
1411 if (args->via_io_subtype == IO_NORMAL) {
1412 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1414 /* CLIO is too complicated. See LU-11069. */
1415 if (cl_io_is_append(io))
1416 pos = io->u.ci_rw.rw_iocb.ki_pos;
1420 args->u.normal.via_iocb->ki_pos = pos;
1421 #ifdef HAVE_KIOCB_KI_LEFT
1422 args->u.normal.via_iocb->ki_left = count;
1423 #elif defined(HAVE_KI_NBYTES)
1424 args->u.normal.via_iocb->ki_nbytes = count;
1428 pos = io->u.ci_rw.rw_range.cir_pos;
1432 cl_io_fini(env, io);
1435 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1436 file->f_path.dentry->d_name.name,
1437 iot, rc, result, io->ci_need_restart);
/* layout changed mid-I/O: restart remaining range with a fresh cl_io */
1439 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1441 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1442 file_dentry(file)->d_name.name,
1443 iot == CIT_READ ? "read" : "write",
1444 pos, pos + count, result, rc);
1445 /* preserve the tried count for FLR */
1446 retried = io->ci_ndelay_tried;
1451 if (iot == CIT_READ) {
1453 ll_stats_ops_tally(ll_i2sbi(inode),
1454 LPROC_LL_READ_BYTES, result);
1455 } else if (iot == CIT_WRITE) {
1457 ll_stats_ops_tally(ll_i2sbi(inode),
1458 LPROC_LL_WRITE_BYTES, result);
1459 fd->fd_write_failed = false;
1460 } else if (result == 0 && rc == 0) {
1463 fd->fd_write_failed = true;
1465 fd->fd_write_failed = false;
1466 } else if (rc != -ERESTARTSYS) {
1467 fd->fd_write_failed = true;
1471 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1472 file_dentry(file)->d_name.name,
1473 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1477 RETURN(result > 0 ? result : rc);
1481 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1482 * especially for small I/O.
1484 * To serve a read request, CLIO has to create and initialize a cl_io and
1485 * then request DLM lock. This has turned out to have significant overhead
1486 * and affects the performance of small I/O dramatically.
1488 * It's not necessary to create a cl_io for each I/O. Under the help of read
1489 * ahead, most of the pages being read are already in memory cache and we can
1490 * read those pages directly because if the pages exist, the corresponding DLM
1491 * lock must exist so that page content must be valid.
1493 * In fast read implementation, the llite speculatively finds and reads pages
1494 * in memory cache. There are three scenarios for fast read:
1495 * - If the page exists and is uptodate, kernel VM will provide the data and
1496 * CLIO won't be intervened;
1497 * - If the page was brought into memory by read ahead, it will be exported
1498 * and read ahead parameters will be updated;
1499 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1500 * it will go back and invoke normal read, i.e., a cl_io will be created
1501 * and DLM lock will be requested.
1503 * POSIX compliance: posix standard states that read is intended to be atomic.
1504 * Lustre read implementation is in line with Linux kernel read implementation
1505 * and neither of them complies with POSIX standard in this matter. Fast read
1506 * doesn't make the situation worse on single node but it may interleave write
1507 * results from multiple nodes due to short read handling in ll_file_aio_read().
1509 * \param env - lu_env
1510 * \param iocb - kiocb from kernel
1511 * \param iter - user space buffers where the data will be copied
1513 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve the read straight from the page cache via
 * generic_file_read_iter() without building a cl_io, relying on
 * ll_readpage() returning -ENODATA when a page is not cached so the
 * caller falls back to the normal CLIO read.  Disabled for O_DIRECT
 * and when the fast_read tunable is off.
 * NOTE(review): return statements between the checks were elided by
 * extraction.
 */
1516 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1520 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1523 /* NB: we can't do direct IO for fast read because it will need a lock
1524 * to make IO engine happy. */
1525 if (iocb->ki_filp->f_flags & O_DIRECT)
1528 result = generic_file_read_iter(iocb, iter);
1530 /* If the first page is not in cache, generic_file_aio_read() will be
1531 * returned with -ENODATA.
1532 * See corresponding code in ll_readpage(). */
1533 if (result == -ENODATA)
1537 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1538 LPROC_LL_READ_BYTES, result);
1544 * Read from a file (through the page cache).
/*
 * read_iter file operation: try the lockless fast-read path first;
 * if data remains (or fast read is unavailable) fall through to the
 * generic CLIO path via ll_file_io_generic().
 * NOTE(review): lines combining the fast-read result with rc2 were
 * elided by extraction.
 */
1546 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1549 struct vvp_io_args *args;
1554 result = ll_do_fast_read(iocb, to);
1555 if (result < 0 || iov_iter_count(to) == 0)
1558 env = cl_env_get(&refcheck);
1560 return PTR_ERR(env);
1562 args = ll_env_args(env, IO_NORMAL);
1563 args->u.normal.via_iter = to;
1564 args->u.normal.via_iocb = iocb;
1566 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1567 &iocb->ki_pos, iov_iter_count(to));
1570 else if (result == 0)
1573 cl_env_put(env, &refcheck);
1579 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1580 * If a page is already in the page cache and dirty (and some other things -
1581 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1582 * write to it without doing a full I/O, because Lustre already knows about it
1583 * and will write it out. This saves a lot of processing time.
1585 * All writes here are within one page, so exclusion is handled by the page
1586 * lock on the vm page. We do not do tiny writes for writes which touch
1587 * multiple pages because it's very unlikely multiple sequential pages are
1588 * already dirty.
1590 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1591 * and are unlikely to be to already dirty pages.
1593 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * Tiny-write fast path: for sub-page writes that stay within one page,
 * write through the page cache via __generic_file_write_iter().
 * ll_tiny_write_begin() returns -ENODATA when the page is not already
 * cached+dirty, signalling the caller to fall back to the normal write
 * path.  On success, tallies write bytes and marks data modified.
 * NOTE(review): the -ENODATA -> result = 0 conversion and the RETURN
 * were elided by extraction.
 */
1595 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1597 ssize_t count = iov_iter_count(iter);
1598 struct file *file = iocb->ki_filp;
1599 struct inode *inode = file_inode(file);
1604 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1605 * of function for why.
1607 if (count >= PAGE_SIZE ||
1608 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1611 result = __generic_file_write_iter(iocb, iter);
1613 /* If the page is not already dirty, ll_tiny_write_begin returns
1614 * -ENODATA. We continue on to normal write.
1616 if (result == -ENODATA)
1620 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1622 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1625 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1631 * Write to a file (through the page cache).
/*
 * write_iter file operation: attempt the tiny-write fast path first
 * (skipped for O_DIRECT/O_SYNC/O_APPEND), then run the remaining bytes
 * through the normal CLIO write and combine the two results.
 * NOTE(review): extraction elided some lines (result declaration,
 * out label, RETURN).
 */
1633 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1635 struct vvp_io_args *args;
1637 ssize_t rc_tiny = 0, rc_normal;
1642 /* NB: we can't do direct IO for tiny writes because they use the page
1643 * cache, we can't do sync writes because tiny writes can't flush
1644 * pages, and we can't do append writes because we can't guarantee the
1645 * required DLM locks are held to protect file size.
1647 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1648 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1649 rc_tiny = ll_do_tiny_write(iocb, from);
1651 /* In case of error, go on and try normal write - Only stop if tiny
1652 * write completed I/O.
1654 if (iov_iter_count(from) == 0)
1655 GOTO(out, rc_normal = rc_tiny);
1657 env = cl_env_get(&refcheck);
1659 return PTR_ERR(env);
1661 args = ll_env_args(env, IO_NORMAL);
1662 args->u.normal.via_iter = from;
1663 args->u.normal.via_iocb = iocb;
1665 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1666 &iocb->ki_pos, iov_iter_count(from));
1668 /* On success, combine bytes written. */
1669 if (rc_tiny >= 0 && rc_normal > 0)
1670 rc_normal += rc_tiny;
1671 /* On error, only return error from normal write if tiny write did not
1672 * write any bytes. Otherwise return bytes written by tiny write.
1674 else if (rc_tiny > 0)
1675 rc_normal = rc_tiny;
1677 cl_env_put(env, &refcheck);
1682 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1684 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, as the
 * kernel's __generic_file_aio_write_nolock() does: reject negative or
 * wrapping lengths, and verify each segment with access_ok(),
 * truncating *nr_segs at the first unreadable segment.
 * NOTE(review): the "continue" after the successful access_ok() check
 * and the surrounding error handling were elided by extraction — the
 * visible lines alone would invert the check; confirm against the
 * full source.
 */
1686 static int ll_file_get_iov_count(const struct iovec *iov,
1687 unsigned long *nr_segs, size_t *count)
1692 for (seg = 0; seg < *nr_segs; seg++) {
1693 const struct iovec *iv = &iov[seg];
1696 * If any segment has a negative length, or the cumulative
1697 * length ever wraps negative then return -EINVAL.
1700 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1702 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1707 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read compatibility shim for kernels without read_iter: validate
 * the iovec, wrap it in an iov_iter (handling both iov_iter_init()
 * calling conventions), and forward to ll_file_read_iter().
 */
1714 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1715 unsigned long nr_segs, loff_t pos)
1722 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1726 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1727 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1728 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1729 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1730 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1732 result = ll_file_read_iter(iocb, &to);
/*
 * Synchronous read entry point for kernels without ->read_iter():
 * wrap the user buffer in a one-segment iovec plus a sync kiocb,
 * forward to ll_file_aio_read(), and propagate the updated offset
 * back through *ppos.
 */
1737 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1740 struct iovec iov = { .iov_base = buf, .iov_len = count };
1745 init_sync_kiocb(&kiocb, file);
1746 kiocb.ki_pos = *ppos;
1747 #ifdef HAVE_KIOCB_KI_LEFT
1748 kiocb.ki_left = count;
1749 #elif defined(HAVE_KI_NBYTES)
/* Fix: the struct kiocb field is ki_nbytes, not i_nbytes — matches
 * the identical HAVE_KI_NBYTES branch in ll_file_write() below. */
1750 kiocb.ki_nbytes = count;
1753 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1754 *ppos = kiocb.ki_pos;
1760 * Write to a file (through the page cache).
/*
 * aio_write compatibility shim for kernels without write_iter:
 * validate the iovec, wrap it in an iov_iter (handling both
 * iov_iter_init() calling conventions), and forward to
 * ll_file_write_iter().
 */
1763 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1764 unsigned long nr_segs, loff_t pos)
1766 struct iov_iter from;
1771 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1775 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1776 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1777 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1778 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1779 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1781 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write entry point for kernels without ->write_iter():
 * wrap the user buffer in a one-segment iovec plus a sync kiocb,
 * forward to ll_file_aio_write(), and propagate the updated offset
 * back through *ppos.
 */
1786 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1787 size_t count, loff_t *ppos)
1789 struct iovec iov = { .iov_base = (void __user *)buf,
1796 init_sync_kiocb(&kiocb, file);
1797 kiocb.ki_pos = *ppos;
1798 #ifdef HAVE_KIOCB_KI_LEFT
1799 kiocb.ki_left = count;
1800 #elif defined(HAVE_KI_NBYTES)
1801 kiocb.ki_nbytes = count;
1804 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1805 *ppos = kiocb.ki_pos;
1809 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1812 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read file operation: run a CIT_READ through
 * ll_file_io_generic() with IO_SPLICE args carrying the destination
 * pipe and splice flags.
 */
1814 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1815 struct pipe_inode_info *pipe, size_t count,
1819 struct vvp_io_args *args;
1824 env = cl_env_get(&refcheck);
1826 RETURN(PTR_ERR(env));
1828 args = ll_env_args(env, IO_SPLICE);
1829 args->u.splice.via_pipe = pipe;
1830 args->u.splice.via_flags = flags;
1832 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1833 cl_env_put(env, &refcheck);
/*
 * Set the striping EA on @inode by (re)opening it by FID with the
 * given lov_user_md, under the inode size lock, then releasing the
 * resulting open handle.
 * NOTE(review): intent initialization fields and some error-path lines
 * were elided by extraction.
 */
1837 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1838 __u64 flags, struct lov_user_md *lum, int lum_size)
1840 struct lookup_intent oit = {
1842 .it_flags = flags | MDS_OPEN_BY_FID,
1847 ll_inode_size_lock(inode);
1848 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1850 GOTO(out_unlock, rc);
1852 ll_release_openhandle(dentry, &oit);
1855 ll_inode_size_unlock(inode);
1856 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping info) for @filename under @inode from
 * the MDS via md_getattr_name(), validate the magic, and byte-swap
 * the layout to host endianness on big-endian clients before it is
 * handed to userspace.  On success *lmmp/*lmm_size point into the
 * reply buffer, whose ptlrpc request is returned via @request for the
 * caller to release.
 * NOTE(review): several interior lines (lmmsize/rc declarations,
 * GOTO targets, swab argument tails) were elided by extraction.
 */
1861 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1862 struct lov_mds_md **lmmp, int *lmm_size,
1863 struct ptlrpc_request **request)
1865 struct ll_sb_info *sbi = ll_i2sbi(inode);
1866 struct mdt_body *body;
1867 struct lov_mds_md *lmm = NULL;
1868 struct ptlrpc_request *req = NULL;
1869 struct md_op_data *op_data;
1872 rc = ll_get_default_mdsize(sbi, &lmmsize);
1876 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1877 strlen(filename), lmmsize,
1878 LUSTRE_OPC_ANY, NULL);
1879 if (IS_ERR(op_data))
1880 RETURN(PTR_ERR(op_data));
1882 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1883 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1884 ll_finish_md_op_data(op_data);
1886 CDEBUG(D_INFO, "md_getattr_name failed "
1887 "on %s: rc %d\n", filename, rc);
1891 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1892 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1894 lmmsize = body->mbo_eadatasize;
1896 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1898 GOTO(out, rc = -ENODATA);
1901 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1902 LASSERT(lmm != NULL);
1904 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1905 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1906 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1907 GOTO(out, rc = -EPROTO);
1910 * This is coming from the MDS, so is probably in
1911 * little endian. We convert it to host endian before
1912 * passing it to userspace.
1914 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1917 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1918 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1919 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1920 if (le32_to_cpu(lmm->lmm_pattern) &
1921 LOV_PATTERN_F_RELEASED)
1925 /* if function called for directory - we should
1926 * avoid swabbing non-existent lsm objects */
1927 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1928 lustre_swab_lov_user_md_v1(
1929 (struct lov_user_md_v1 *)lmm);
1930 if (S_ISREG(body->mbo_mode))
1931 lustre_swab_lov_user_md_objects(
1932 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1934 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1935 lustre_swab_lov_user_md_v3(
1936 (struct lov_user_md_v3 *)lmm);
1937 if (S_ISREG(body->mbo_mode))
1938 lustre_swab_lov_user_md_objects(
1939 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1941 } else if (lmm->lmm_magic ==
1942 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1943 lustre_swab_lov_comp_md_v1(
1944 (struct lov_comp_md_v1 *)lmm);
1950 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only path that copies a raw
 * lov_user_md (+ one ost_data entry) from userspace and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
1955 static int ll_lov_setea(struct inode *inode, struct file *file,
1958 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1959 struct lov_user_md *lump;
1960 int lum_size = sizeof(struct lov_user_md) +
1961 sizeof(struct lov_user_ost_data);
/* setting raw EAs (including object IDs) requires admin capability */
1965 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1968 OBD_ALLOC_LARGE(lump, lum_size);
1972 if (copy_from_user(lump, arg, lum_size))
1973 GOTO(out_lump, rc = -EFAULT);
1975 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1977 cl_lov_delay_create_clear(&file->f_flags);
1980 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum
 * (at most @size bytes) via cl_object_getstripe().
 */
1984 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1991 env = cl_env_get(&refcheck);
1993 RETURN(PTR_ERR(env));
1995 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1996 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout in, apply it via
 * ll_lov_setstripe_ea_info(), refresh the layout generation, and echo
 * the resulting stripe info back to userspace.
 * NOTE(review): lum_size/gen declarations and some error-path lines
 * were elided by extraction.
 */
2003 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2004 struct lov_user_md *klum;
2006 __u64 flags = FMODE_WRITE;
2009 rc = ll_copy_user_md(lum, &klum);
2014 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2019 rc = put_user(0, &lum->lmm_stripe_count);
2023 rc = ll_layout_refresh(inode, &gen);
2027 rc = ll_file_getstripe(inode, arg, lum_size);
2029 cl_lov_delay_create_clear(&file->f_flags);
2032 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-based) extent lock on
 * the whole file.  Rejects gid 0 and nolock mounts, refuses a second
 * group lock on the same fd, instantiates all components of a
 * composite (PFL) layout first so the lock covers every OST object,
 * then records the lock in fd->fd_grouplock under lli_lock — backing
 * off if another thread raced us to it.
 * NOTE(review): extraction elided some lines (RETURN statements,
 * extent start, layout-intent extent argument).
 */
2037 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2039 struct ll_inode_info *lli = ll_i2info(inode);
2040 struct cl_object *obj = lli->lli_clob;
2041 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2042 struct ll_grouplock grouplock;
2047 CWARN("group id for group lock must not be 0\n");
2051 if (ll_file_nolock(file))
2052 RETURN(-EOPNOTSUPP);
2054 spin_lock(&lli->lli_lock);
2055 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2056 CWARN("group lock already existed with gid %lu\n",
2057 fd->fd_grouplock.lg_gid);
2058 spin_unlock(&lli->lli_lock);
2061 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2062 spin_unlock(&lli->lli_lock);
2065 * XXX: group lock needs to protect all OST objects while PFL
2066 * can add new OST objects during the IO, so we'd instantiate
2067 * all OST objects before getting its group lock.
2072 struct cl_layout cl = {
2073 .cl_is_composite = false,
2075 struct lu_extent ext = {
2077 .e_end = OBD_OBJECT_EOF,
2080 env = cl_env_get(&refcheck);
2082 RETURN(PTR_ERR(env));
2084 rc = cl_object_layout_get(env, obj, &cl);
2085 if (!rc && cl.cl_is_composite)
2086 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2089 cl_env_put(env, &refcheck);
2094 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2095 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won the race */
2099 spin_lock(&lli->lli_lock);
2100 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2101 spin_unlock(&lli->lli_lock);
2102 CERROR("another thread just won the race\n");
2103 cl_put_grouplock(&grouplock);
2107 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2108 fd->fd_grouplock = grouplock;
2109 spin_unlock(&lli->lli_lock);
2111 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock recorded in
 * fd->fd_grouplock after verifying one is held and its gid matches
 * @arg.  State is cleared under lli_lock; the lock itself is dropped
 * outside the spinlock via cl_put_grouplock().
 */
2115 static int ll_put_grouplock(struct inode *inode, struct file *file,
2118 struct ll_inode_info *lli = ll_i2info(inode);
2119 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2120 struct ll_grouplock grouplock;
2123 spin_lock(&lli->lli_lock);
2124 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2125 spin_unlock(&lli->lli_lock);
2126 CWARN("no group lock held\n");
2130 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2132 if (fd->fd_grouplock.lg_gid != arg) {
2133 CWARN("group lock %lu doesn't match current id %lu\n",
2134 arg, fd->fd_grouplock.lg_gid);
2135 spin_unlock(&lli->lli_lock);
/* take a local copy so the lock can be dropped outside lli_lock */
2139 grouplock = fd->fd_grouplock;
2140 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2141 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2142 spin_unlock(&lli->lli_lock);
2144 cl_put_grouplock(&grouplock);
2145 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2150 * Close inode open handle
2152 * \param dentry [in] dentry which contains the inode
2153 * \param it [in,out] intent which contains open info and result
2156 * \retval <0 failure
/*
 * Close the MDS open handle carried in @it (if any): allocate an
 * obd_client_handle, fill it from the intent, close it on the MDT,
 * and drop the DISP_ENQ_OPEN_REF request reference taken at open.
 * No-op for the filesystem root or when the intent holds no open.
 */
2158 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2160 struct inode *inode = dentry->d_inode;
2161 struct obd_client_handle *och;
2167 /* Root ? Do nothing. */
2168 if (dentry->d_inode->i_sb->s_root == dentry)
2171 /* No open handle to close? Move away */
2172 if (!it_disposition(it, DISP_OPEN_OPEN))
2175 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2177 OBD_ALLOC(och, sizeof(*och));
2179 GOTO(out, rc = -ENOMEM);
2181 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2183 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2185 /* this one is in place of ll_file_open */
2186 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2187 ptlrpc_req_finished(it->it_request);
2188 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2194 * Get size for inode for which FIEMAP mapping is requested.
2195 * Make the FIEMAP get_info call and returns the result.
2196 * \param fiemap kernel buffer to hold extents
2197 * \param num_bytes kernel buffer size
/*
 * Execute a FIEMAP request: validate/echo unsupported flags, flush
 * dirty data for FIEMAP_FLAG_SYNC, glimpse the size if unknown, and
 * forward the request to the OSTs via cl_object_fiemap().  An empty
 * file short-circuits with zero mapped extents.
 * NOTE(review): some lines (rc declarations, early returns, GOTO
 * targets) were elided by extraction.
 */
2199 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2205 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2208 /* Checks for fiemap flags */
2209 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2210 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2214 /* Check for FIEMAP_FLAG_SYNC */
2215 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2216 rc = filemap_fdatawrite(inode->i_mapping);
2221 env = cl_env_get(&refcheck);
2223 RETURN(PTR_ERR(env));
/* size not cached yet: glimpse it from the OSTs first */
2225 if (i_size_read(inode) == 0) {
2226 rc = ll_glimpse_size(inode);
2231 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2232 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2233 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2235 /* If filesize is 0, then there would be no objects for mapping */
2236 if (fmkey.lfik_oa.o_size == 0) {
2237 fiemap->fm_mapped_extents = 0;
2241 fmkey.lfik_fiemap = *fiemap;
2243 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2244 &fmkey, fiemap, &num_bytes);
2246 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Permission is CAP_DAC_READ_SEARCH or the user_fid2path mount flag.
 * The output buffer is sized from the user-supplied gf_pathlen, the
 * root FID is appended so the MDT can resolve fileset mounts, and the
 * result is copied back to userspace.
 * NOTE(review): some early returns/GOTO targets were elided by
 * extraction.
 */
2250 int ll_fid2path(struct inode *inode, void __user *arg)
2252 struct obd_export *exp = ll_i2mdexp(inode);
2253 const struct getinfo_fid2path __user *gfin = arg;
2255 struct getinfo_fid2path *gfout;
2261 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2262 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2265 /* Only need to get the buflen */
2266 if (get_user(pathlen, &gfin->gf_pathlen))
2269 if (pathlen > PATH_MAX)
2272 outsize = sizeof(*gfout) + pathlen;
2273 OBD_ALLOC(gfout, outsize);
2277 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2278 GOTO(gf_free, rc = -EFAULT);
2279 /* append root FID after gfout to let MDT know the root FID so that it
2280 * can lookup the correct path, this is mainly for fileset.
2281 * old server without fileset mount support will ignore this. */
2282 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2284 /* Call mdc_iocontrol */
2285 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2289 if (copy_to_user(arg, gfout, outsize))
2293 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version (and
 * layout version) from the OSTs, honoring the flush flags in
 * ioc->idv_flags.  A file with no cl_object is reported as version 0.
 * The io is retried while ci_need_restart is set (layout change).
 * NOTE(review): the restart loop construct and some declarations were
 * elided by extraction.
 */
2298 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2300 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2308 ioc->idv_version = 0;
2309 ioc->idv_layout_version = UINT_MAX;
2311 /* If no file object initialized, we consider its version is 0. */
2315 env = cl_env_get(&refcheck);
2317 RETURN(PTR_ERR(env));
2319 io = vvp_env_thread_io(env);
2321 io->u.ci_data_version.dv_data_version = 0;
2322 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2323 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2326 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2327 result = cl_io_loop(env, io);
2329 result = io->ci_result;
2331 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2332 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2334 cl_io_fini(env, io);
/* layout changed under us: redo the whole data-version io */
2336 if (unlikely(io->ci_need_restart))
2339 cl_env_put(env, &refcheck);
2345 * Read the data_version for inode.
2347 * This value is computed using stripe object version on OST.
2348 * Version is computed using server side locking.
2350 * @param flags if do sync on the OST side;
2352 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2353 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Convenience wrapper around ll_ioc_data_version(): return only the
 * data version for @inode, passing @flags (LL_DV_RD_FLUSH /
 * LL_DV_WR_FLUSH) through to control OST-side flushing.
 */
2355 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2357 struct ioc_data_version ioc = { .idv_flags = flags };
2360 rc = ll_ioc_data_version(inode, &ioc);
2362 *data_version = ioc.idv_version;
2368 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: free the OST objects of an archived file.  Takes a
 * write lease, flushes and records the data version (so the MDT can
 * verify nothing changed), merges [am]time into the inode, then
 * closes the lease open handle with MDS_HSM_RELEASE so the MDT drops
 * the objects atomically with the lease check.
 * NOTE(review): GOTO targets and the final RETURN were elided by
 * extraction.
 */
2370 int ll_hsm_release(struct inode *inode)
2373 struct obd_client_handle *och = NULL;
2374 __u64 data_version = 0;
2379 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2380 ll_get_fsname(inode->i_sb, NULL, 0),
2381 PFID(&ll_i2info(inode)->lli_fid));
2383 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2385 GOTO(out, rc = PTR_ERR(och));
2387 /* Grab latest data_version and [am]time values */
2388 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2392 env = cl_env_get(&refcheck);
2394 GOTO(out, rc = PTR_ERR(env));
2396 rc = ll_merge_attr(env, inode);
2397 cl_env_put(env, &refcheck);
2399 /* If error happen, we have the wrong size for a file.
2405 /* Release the file.
2406 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2407 * we still need it to pack l_remote_handle to MDT. */
2408 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2414 if (och != NULL && !IS_ERR(och)) /* close the file */
2415 ll_lease_close(och, inode, NULL);
/*
 * Working state for ll_swap_layouts(): the two inodes being swapped.
 * NOTE(review): extraction elided the remaining members (data
 * versions and check flags, judging by their use below) and the
 * closing brace.
 */
2420 struct ll_swap_stack {
2423 struct inode *inode1;
2424 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * two files on the MDT.  Orders the pair by FID to avoid deadlock,
 * optionally takes group locks on both to flush dirty cache, verifies
 * the caller-supplied data versions have not changed, then issues the
 * swap through md_op_data with mdc_swap_layouts flags.
 * NOTE(review): gid/dv/rc declarations, GOTO targets and labels were
 * elided by extraction.
 */
2429 static int ll_swap_layouts(struct file *file1, struct file *file2,
2430 struct lustre_swap_layouts *lsl)
2432 struct mdc_swap_layouts msl;
2433 struct md_op_data *op_data;
2436 struct ll_swap_stack *llss = NULL;
2439 OBD_ALLOC_PTR(llss);
2443 llss->inode1 = file_inode(file1);
2444 llss->inode2 = file_inode(file2);
2446 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2450 /* we use 2 bool because it is easier to swap than 2 bits */
2451 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2452 llss->check_dv1 = true;
2454 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2455 llss->check_dv2 = true;
2457 /* we cannot use lsl->sl_dvX directly because we may swap them */
2458 llss->dv1 = lsl->sl_dv1;
2459 llss->dv2 = lsl->sl_dv2;
2461 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2462 if (rc == 0) /* same file, done! */
2465 if (rc < 0) { /* sequentialize it */
2466 swap(llss->inode1, llss->inode2);
2468 swap(llss->dv1, llss->dv2);
2469 swap(llss->check_dv1, llss->check_dv2);
2473 if (gid != 0) { /* application asks to flush dirty cache */
2474 rc = ll_get_grouplock(llss->inode1, file1, gid);
2478 rc = ll_get_grouplock(llss->inode2, file2, gid);
2480 ll_put_grouplock(llss->inode1, file1, gid);
2485 /* ultimate check, before swapping the layouts we check if
2486 * dataversion has changed (if requested) */
2487 if (llss->check_dv1) {
2488 rc = ll_data_version(llss->inode1, &dv, 0);
2491 if (dv != llss->dv1)
2492 GOTO(putgl, rc = -EAGAIN);
2495 if (llss->check_dv2) {
2496 rc = ll_data_version(llss->inode2, &dv, 0);
2499 if (dv != llss->dv2)
2500 GOTO(putgl, rc = -EAGAIN);
2503 /* struct md_op_data is used to send the swap args to the mdt
2504 * only flags is missing, so we use struct mdc_swap_layouts
2505 * through the md_op_data->op_data */
2506 /* flags from user space have to be converted before they are sent to
2507 * server, no flag is sent today, they are only used on the client */
2510 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2511 0, LUSTRE_OPC_ANY, &msl);
2512 if (IS_ERR(op_data))
2513 GOTO(free, rc = PTR_ERR(op_data));
2515 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2516 sizeof(*op_data), op_data, NULL);
2517 ll_finish_md_op_data(op_data);
2524 ll_put_grouplock(llss->inode2, file2, gid);
2525 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDC.  Validates the
 * masks against HSM_FLAGS_MASK, restricts non-admin callers to
 * HSM_USER_MASK flags, and bounds-checks the archive id before
 * forwarding the request with obd_iocontrol().
 */
2535 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2537 struct md_op_data *op_data;
2541 /* Detect out-of range masks */
2542 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2545 /* Non-root users are forbidden to set or clear flags which are
2546 * NOT defined in HSM_USER_MASK. */
2547 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2548 !cfs_capable(CFS_CAP_SYS_ADMIN))
2551 /* Detect out-of range archive id */
2552 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2553 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2556 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2557 LUSTRE_OPC_ANY, hss);
2558 if (IS_ERR(op_data))
2559 RETURN(PTR_ERR(op_data));
2561 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2562 sizeof(*op_data), op_data, NULL);
2564 ll_finish_md_op_data(op_data);
/*
 * HSM import: register a file that already exists in the archive.
 * Marks it ARCHIVED|EXISTS|RELEASED via ll_hsm_state_set(), then
 * forces the archived attributes (mode, uid/gid, size, [am]time) onto
 * the inode with ll_setattr_raw().  Regular files only.
 * NOTE(review): allocation checks, inode_lock(), GOTO targets and
 * frees were elided by extraction.
 */
2569 static int ll_hsm_import(struct inode *inode, struct file *file,
2570 struct hsm_user_import *hui)
2572 struct hsm_state_set *hss = NULL;
2573 struct iattr *attr = NULL;
2577 if (!S_ISREG(inode->i_mode))
2583 GOTO(out, rc = -ENOMEM);
2585 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2586 hss->hss_archive_id = hui->hui_archive_id;
2587 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2588 rc = ll_hsm_state_set(inode, hss);
2592 OBD_ALLOC_PTR(attr);
2594 GOTO(out, rc = -ENOMEM);
/* impose the archived attributes; keep only permission bits of mode */
2596 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2597 attr->ia_mode |= S_IFREG;
2598 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2599 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2600 attr->ia_size = hui->hui_size;
2601 attr->ia_mtime.tv_sec = hui->hui_mtime;
2602 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2603 attr->ia_atime.tv_sec = hui->hui_atime;
2604 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2606 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2607 ATTR_UID | ATTR_GID |
2608 ATTR_MTIME | ATTR_MTIME_SET |
2609 ATTR_ATIME | ATTR_ATIME_SET;
2613 rc = ll_setattr_raw(file_dentry(file), attr, true);
2617 inode_unlock(inode);
2629 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2631 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2632 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime from
 * caller-supplied values (utimes(2) cannot set ctime).  Restricted to
 * CAP_SYS_ADMIN and regular files.
 *
 * \param[in] file	open file to modify
 * \param[in] lfu	the three timestamps to apply
 */
static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
	struct inode *inode = file_inode(file);
		/* apply all three timestamps verbatim (the *_SET bits) */
		.ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
			    ATTR_MTIME | ATTR_MTIME_SET |
			    ATTR_CTIME | ATTR_CTIME_SET,
		.tv_sec = lfu->lfu_atime_sec,
		.tv_nsec = lfu->lfu_atime_nsec,
		.tv_sec = lfu->lfu_mtime_sec,
		.tv_nsec = lfu->lfu_mtime_nsec,
		.tv_sec = lfu->lfu_ctime_sec,
		.tv_nsec = lfu->lfu_ctime_nsec,
	/* setting ctime arbitrarily is a privileged operation */
	if (!capable(CAP_SYS_ADMIN))
	if (!S_ISREG(inode->i_mode))
	rc = ll_setattr_raw(file_dentry(file), &ia, false);
	inode_unlock(inode);
/* Map the user-visible lockahead lock mode to the internal cl_lock_mode
 * (return statements for each case are elided in this excerpt). */
static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
	case MODE_READ_USER:
	case MODE_WRITE_USER:
/* printable names for enum lock_mode_user, used in debug output */
static const char *const user_lockname[] = LOCK_MODE_NAMES;
2685 /* Used to allow the upper layers of the client to request an LDLM lock
2686 * without doing an actual read or write.
2688 * Used for ladvise lockahead to manually request specific locks.
2690 * \param[in] file file this ladvise lock request is on
2691 * \param[in] ladvise ladvise struct describing this lock request
2693 * \retval 0 success, no detailed result available (sync requests
2694 * and requests sent to the server [not handled locally]
2695 * cannot return detailed results)
2696 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2697 * see definitions for details.
2698 * \retval negative negative errno on error
int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
	struct lu_env *env = NULL;
	struct cl_io *io = NULL;
	struct cl_lock *lock = NULL;
	struct cl_lock_descr *descr = NULL;
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	enum cl_lock_mode cl_mode;
	off_t start = ladvise->lla_start;
	off_t end = ladvise->lla_end;
	CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
	       "start=%llu, end=%llu\n", dentry->d_name.len,
	       dentry->d_name.name, dentry->d_inode,
	       user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
	cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
		GOTO(out, result = cl_mode);
	/* Get IO environment */
	result = cl_io_get(inode, &env, &io, &refcheck);
	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
	/* nothing to do for this io. This currently happens when
	 * stripe sub-object's are not yet created. */
		result = io->ci_result;
	} else if (result == 0) {
		lock = vvp_env_lock(env);
		descr = &lock->cll_descr;
		descr->cld_obj = io->ci_obj;
		/* Convert byte offsets to pages */
		descr->cld_start = cl_index(io->ci_obj, start);
		descr->cld_end = cl_index(io->ci_obj, end);
		descr->cld_mode = cl_mode;
		/* CEF_MUST is used because we do not want to convert a
		 * lockahead request to a lockless lock */
		descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
		/* LF_ASYNC means non-blocking: enqueue speculatively */
		if (ladvise->lla_peradvice_flags & LF_ASYNC)
			descr->cld_enq_flags |= CEF_SPECULATIVE;
		result = cl_lock_request(env, io, lock);
		/* On success, we need to release the lock */
			cl_lock_release(env, lock);
	cl_io_fini(env, io);
	cl_env_put(env, &refcheck);
	/* -ECANCELED indicates a matching lock with a different extent
	 * was already present, and -EEXIST indicates a matching lock
	 * on exactly the same extent was already present.
	 * We convert them to positive values for userspace to make
	 * recognizing true errors easier.
	 * Note we can only return these detailed results on async requests,
	 * as sync requests look the same as i/o requests for locking. */
	if (result == -ECANCELED)
		result = LLA_RESULT_DIFFERENT;
	else if (result == -EEXIST)
		result = LLA_RESULT_SAME;
/* printable names for enum lu_ladvise_type, used in debug output */
static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate one ladvise entry before acting on it: recognized advice
 * value, per-advice flag mask, lock mode (for lockahead) and a sane
 * start < end range.  Each failing check logs via CDEBUG; the error
 * assignments/returns between the visible checks are elided here.
 */
static int ll_ladvise_sanity(struct inode *inode,
			     struct llapi_lu_ladvise *ladvise)
	enum lu_ladvise_type advice = ladvise->lla_advice;
	/* Note the peradvice flags is a 32 bit field, so per advice flags must
	 * be in the first 32 bits of enum ladvise_flags */
	__u32 flags = ladvise->lla_peradvice_flags;
	/* 3 lines at 80 characters per line, should be plenty */
	if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
		CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
		       "last supported advice is %s (value '%d'): rc = %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0), advice,
		       ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
	/* Per-advice checks */
	case LU_LADVISE_LOCKNOEXPAND:
		if (flags & ~LF_LOCKNOEXPAND_MASK) {
			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
			       ladvise_names[advice], rc);
	case LU_LADVISE_LOCKAHEAD:
		/* Currently only READ and WRITE modes can be requested */
		if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
		    ladvise->lla_lockahead_mode == 0) {
			CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       ladvise->lla_lockahead_mode,
			       ladvise_names[advice], rc);
	case LU_LADVISE_WILLREAD:
	case LU_LADVISE_DONTNEED:
		/* Note fall through above - These checks apply to all advices
		 * except LOCKNOEXPAND */
		if (flags & ~LF_DEFAULT_MASK) {
			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
			       ladvise_names[advice], rc);
		if (ladvise->lla_start >= ladvise->lla_end) {
			CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
			       "for %s: rc = %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       ladvise->lla_start, ladvise->lla_end,
			       ladvise_names[advice], rc);
2855 * Give file access advices
2857 * The ladvise interface is similar to Linux fadvise() system call, except it
2858 * forwards the advices directly from Lustre client to server. The server side
2859 * codes will apply appropriate read-ahead and caching techniques for the
2860 * corresponding files.
2862 * A typical workload for ladvise is e.g. a bunch of different clients are
2863 * doing small random reads of a file, so prefetching pages into OSS cache
2864 * with big linear reads before the random IO is a net benefit. Fetching
2865 * all that data into each client cache with fadvise() may not be, due to
2866 * much more data being sent to the client.
/*
 * Forward one ladvise advice to the server by running a CIT_LADVISE
 * cl_io over the file's cl_object (see the block comment above for the
 * overall ladvise design).
 */
static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
		      struct llapi_lu_ladvise *ladvise)
	struct cl_ladvise_io *lio;
	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));
	io = vvp_env_thread_io(env);
	io->ci_obj = ll_i2info(inode)->lli_clob;
	/* initialize parameters for ladvise */
	lio = &io->u.ci_ladvise;
	lio->li_start = ladvise->lla_start;
	lio->li_end = ladvise->lla_end;
	lio->li_fid = ll_inode2fid(inode);
	lio->li_advice = ladvise->lla_advice;
	lio->li_flags = flags;
	if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
		rc = cl_io_loop(env, io);
	cl_io_fini(env, io);
	cl_env_put(env, &refcheck);
/* Toggle per-file-descriptor "no lock expansion" mode; LF_UNSET in
 * @flags clears it, otherwise it is enabled. */
static int ll_lock_noexpand(struct file *file, int flags)
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: report the inode's extended flags
 * and project id back to userspace in a struct fsxattr.
 * NOTE(review): the initial copy_from_user of the user's fsxattr appears
 * only partially here (size argument elided) — presumably it seeds the
 * struct before selected fields are overwritten; confirm in full source.
 */
int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
	struct fsxattr fsxattr;
	if (copy_from_user(&fsxattr,
			   (const struct fsxattr __user *)arg,
	fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
	fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
	if (copy_to_user((struct fsxattr __user *)arg,
			 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: apply user-supplied extended flags
 * and project id.  The change is sent to the MDT via md_setattr(), then
 * mirrored locally and pushed to the OSTs via cl_setattr_ost().
 */
int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct fsxattr fsxattr;
	struct cl_object *obj;
	/* only root could change project ID */
	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));
	if (copy_from_user(&fsxattr,
			   (const struct fsxattr __user *)arg,
		GOTO(out_fsxattr1, rc = -EFAULT);
	op_data->op_attr_flags = fsxattr.fsx_xflags;
	op_data->op_projid = fsxattr.fsx_projid;
	op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
	rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
	ptlrpc_req_finished(req);
	obj = ll_i2info(inode)->lli_clob;
	/* reflect the new flags on the local inode */
	inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
	OBD_ALLOC_PTR(attr);
		GOTO(out_fsxattr1, rc = -ENOMEM);
	attr->ia_valid = ATTR_ATTR_FLAG;
	/* propagate flag change to the OST objects as well */
	rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
	ll_finish_md_op_data(op_data);
/*
 * Release the lease held on @file, optionally with a close intent
 * (resync-done, layout merge, or layout split) carried in @ioc.  On
 * success the previous lease type (from the och's fmode) is returned.
 */
static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle *och = NULL;
	struct split_param sp;
	enum mds_op_bias bias = 0;
	struct file *layout_file = NULL;
	size_t data_size = 0;
	/* detach the lease handle from the fd under the och mutex */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och != NULL) {
		och = fd->fd_lease_och;
		fd->fd_lease_och = NULL;
	mutex_unlock(&lli->lli_och_mutex);
		GOTO(out, rc = -ENOLCK);
	fmode = och->och_flags;
	switch (ioc->lil_flags) {
	case LL_LEASE_RESYNC_DONE:
		if (ioc->lil_count > IOC_IDS_MAX)
			GOTO(out, rc = -EINVAL);
		/* copy the trailing mirror-id array from userspace */
		data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
		OBD_ALLOC(data, data_size);
			GOTO(out, rc = -ENOMEM);
		if (copy_from_user(data, (void __user *)arg, data_size))
			GOTO(out, rc = -EFAULT);
		bias = MDS_CLOSE_RESYNC_DONE;
	case LL_LEASE_LAYOUT_MERGE: {
		if (ioc->lil_count != 1)
			GOTO(out, rc = -EINVAL);
		arg += sizeof(*ioc);
		/* NOTE(review): &fd here looks like a local __u32 fd declared
		 * in this case block (declaration elided in this excerpt),
		 * shadowing the outer ll_file_data pointer — confirm */
		if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
			GOTO(out, rc = -EFAULT);
		layout_file = fget(fd);
			GOTO(out, rc = -EBADF);
		/* both files must be writable to merge layouts */
		if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
		    (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
			GOTO(out, rc = -EPERM);
		data = file_inode(layout_file);
		bias = MDS_CLOSE_LAYOUT_MERGE;
	case LL_LEASE_LAYOUT_SPLIT: {
		if (ioc->lil_count != 2)
			GOTO(out, rc = -EINVAL);
		arg += sizeof(*ioc);
		if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
			GOTO(out, rc = -EFAULT);
		arg += sizeof(__u32);
		if (copy_from_user(&mirror_id, (void __user *)arg,
			GOTO(out, rc = -EFAULT);
		layout_file = fget(fdv);
			GOTO(out, rc = -EBADF);
		sp.sp_inode = file_inode(layout_file);
		sp.sp_mirror_id = (__u16)mirror_id;
		bias = MDS_CLOSE_LAYOUT_SPLIT;
	/* without close intent */
	rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
	rc = ll_lease_och_release(inode, file);
	/* per-intent cleanup of resources acquired above */
	switch (ioc->lil_flags) {
	case LL_LEASE_RESYNC_DONE:
		OBD_FREE(data, data_size);
	case LL_LEASE_LAYOUT_MERGE:
	case LL_LEASE_LAYOUT_SPLIT:
	/* report the lease type that was just released */
	rc = ll_lease_type_from_fmode(fmode);
/*
 * Acquire (or, for LL_LEASE_UNLCK, release) a lease on @file.  The
 * requested mode must be compatible with the file's open mode.  With
 * LL_LEASE_RESYNC the open carries MDS_OPEN_RESYNC and a file resync
 * is started under the new lease.
 */
static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle *och = NULL;
	__u64 open_flags = 0;
	switch (ioc->lil_mode) {
	case LL_LEASE_WRLCK:
		/* a write lease needs a writable fd */
		if (!(file->f_mode & FMODE_WRITE))
		fmode = FMODE_WRITE;
	case LL_LEASE_RDLCK:
		if (!(file->f_mode & FMODE_READ))
	case LL_LEASE_UNLCK:
		RETURN(ll_file_unlock_lease(file, ioc, arg));
	CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
	/* apply for lease */
	if (ioc->lil_flags & LL_LEASE_RESYNC)
		open_flags = MDS_OPEN_RESYNC;
	och = ll_lease_open(inode, file, fmode, open_flags);
		RETURN(PTR_ERR(och));
	if (ioc->lil_flags & LL_LEASE_RESYNC) {
		rc = ll_lease_file_resync(och, inode);
			ll_lease_close(och, inode, NULL);
		rc = ll_layout_refresh(inode, &fd->fd_layout_version);
			ll_lease_close(och, inode, NULL);
	/* stash the new lease handle on the fd if no lease exists yet */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och == NULL) {
		fd->fd_lease_och = och;
	mutex_unlock(&lli->lli_och_mutex);
	/* impossible now that only excl is supported for now */
	ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular Lustre files.  Handles Lustre
 * ioctls (striping, HSM, leases, ladvise, FLR, group locks, ...) and a
 * few generic FS_IOC_* commands; unknown commands fall through to
 * obd_iocontrol() on the data export at the end.
 */
ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
	       PFID(ll_inode2fid(inode)), inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int __user *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 * not abused, and to handle any flag side effects. */
		if (get_user(flags, (int __user *) arg))
		if (cmd == LL_IOC_SETFLAGS) {
			/* disabling locking requires O_DIRECT I/O */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);
			fd->fd_flags |= flags;
			fd->fd_flags &= ~flags;
	case LL_IOC_LOV_SETSTRIPE:
	case LL_IOC_LOV_SETSTRIPE_NEW:
		RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
	case LL_IOC_LOV_SETEA:
		RETURN(ll_lov_setea(inode, file, (void __user *)arg));
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct lustre_swap_layouts lsl;
		if (copy_from_user(&lsl, (char __user *)arg,
				   sizeof(struct lustre_swap_layouts)))
		if ((file->f_flags & O_ACCMODE) == O_RDONLY)
		file2 = fget(lsl.sl_fd);
		/* O_WRONLY or O_RDWR */
		if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
			GOTO(out, rc = -EPERM);
		if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
			struct inode *inode2;
			struct ll_inode_info *lli;
			struct obd_client_handle *och = NULL;
			/* swap-and-close needs the lease held on this fd */
			lli = ll_i2info(inode);
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			mutex_unlock(&lli->lli_och_mutex);
				GOTO(out, rc = -ENOLCK);
			inode2 = file_inode(file2);
			rc = ll_swap_layouts_close(och, inode, inode2);
			rc = ll_swap_layouts(file, file2, &lsl);
	case LL_IOC_LOV_GETSTRIPE:
	case LL_IOC_LOV_GETSTRIPE_NEW:
		RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
	case FS_IOC_GETFLAGS:
	case FS_IOC_SETFLAGS:
		RETURN(ll_iocontrol(inode, file, cmd, arg));
	case FSFILT_IOC_GETVERSION:
	case FS_IOC_GETVERSION:
		RETURN(put_user(inode->i_generation, (int __user *)arg));
	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field. */
	case FS_IOC_SETVERSION:
	case LL_IOC_GROUP_LOCK:
		RETURN(ll_get_grouplock(inode, file, arg));
	case LL_IOC_GROUP_UNLOCK:
		RETURN(ll_put_grouplock(inode, file, arg));
	case IOC_OBD_STATFS:
		RETURN(ll_obd_statfs(inode, (void __user *)arg));
	case LL_IOC_FLUSHCTX:
		RETURN(ll_flush_ctx(inode));
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
	case LL_IOC_GETPARENT:
		RETURN(ll_getparent(file, (struct getparent __user *)arg));
	case OBD_IOC_FID2PATH:
		RETURN(ll_fid2path(inode, (void __user *)arg));
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version idv;
		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
		/* only the flush flags are honoured from userspace */
		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
		rc = ll_ioc_data_version(inode, &idv);
		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
	case LL_IOC_GET_MDTIDX: {
		mdtidx = ll_get_mdt_idx(inode);
		if (put_user((int)mdtidx, (int __user *)arg))
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		RETURN(ll_get_obd_name(inode, cmd, arg));
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data *op_data;
		struct hsm_user_state *hus;
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			RETURN(PTR_ERR(op_data));
		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
		ll_finish_md_op_data(op_data);
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set *hss;
		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
		rc = ll_hsm_state_set(inode, hss);
	case LL_IOC_HSM_ACTION: {
		struct md_op_data *op_data;
		struct hsm_current_action *hca;
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			RETURN(PTR_ERR(op_data));
		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
		ll_finish_md_op_data(op_data);
	case LL_IOC_SET_LEASE_OLD: {
		/* legacy interface: mode passed directly in arg */
		struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
		RETURN(ll_file_set_lease(file, &ioc, 0));
	case LL_IOC_SET_LEASE: {
		struct ll_ioc_lease ioc;
		if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
		RETURN(ll_file_set_lease(file, &ioc, arg));
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;
			/* report the lease only if its lock is not cancelled */
			lock = ldlm_handle2lock(&och->och_lease_handle);
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					fmode = och->och_flags;
				unlock_res_and_lock(lock);
				LDLM_LOCK_PUT(lock);
		mutex_unlock(&lli->lli_och_mutex);
		RETURN(ll_lease_type_from_fmode(fmode));
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;
		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
		rc = ll_hsm_import(inode, file, hui);
	case LL_IOC_FUTIMES_3: {
		struct ll_futimes_3 lfu;
		if (copy_from_user(&lfu,
				   (const struct ll_futimes_3 __user *)arg,
		RETURN(ll_file_futimes_3(file, &lfu));
	case LL_IOC_LADVISE: {
		struct llapi_ladvise_hdr *k_ladvise_hdr;
		struct llapi_ladvise_hdr __user *u_ladvise_hdr;
		int alloc_size = sizeof(*k_ladvise_hdr);
		u_ladvise_hdr = (void __user *)arg;
		/* first pass: copy just the header to learn lah_count */
		OBD_ALLOC_PTR(k_ladvise_hdr);
		if (k_ladvise_hdr == NULL)
		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
			GOTO(out_ladvise, rc = -EFAULT);
		if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
		    k_ladvise_hdr->lah_count < 1)
			GOTO(out_ladvise, rc = -EINVAL);
		num_advise = k_ladvise_hdr->lah_count;
		if (num_advise >= LAH_COUNT_MAX)
			GOTO(out_ladvise, rc = -EFBIG);
		/* second pass: reallocate for header plus advice array */
		OBD_FREE_PTR(k_ladvise_hdr);
		alloc_size = offsetof(typeof(*k_ladvise_hdr),
				      lah_advise[num_advise]);
		OBD_ALLOC(k_ladvise_hdr, alloc_size);
		if (k_ladvise_hdr == NULL)
		/* TODO: submit multiple advices to one server in a single RPC */
		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
			GOTO(out_ladvise, rc = -EFAULT);
		for (i = 0; i < num_advise; i++) {
			struct llapi_lu_ladvise *k_ladvise =
					&k_ladvise_hdr->lah_advise[i];
			struct llapi_lu_ladvise __user *u_ladvise =
					&u_ladvise_hdr->lah_advise[i];
			rc = ll_ladvise_sanity(inode, k_ladvise);
				GOTO(out_ladvise, rc);
			switch (k_ladvise->lla_advice) {
			case LU_LADVISE_LOCKNOEXPAND:
				rc = ll_lock_noexpand(file,
					       k_ladvise->lla_peradvice_flags);
				GOTO(out_ladvise, rc);
			case LU_LADVISE_LOCKAHEAD:
				rc = ll_file_lock_ahead(file, k_ladvise);
					GOTO(out_ladvise, rc);
				/* write per-advice result back to userspace */
					     &u_ladvise->lla_lockahead_result))
					GOTO(out_ladvise, rc = -EFAULT);
				rc = ll_ladvise(inode, file,
						k_ladvise_hdr->lah_flags,
					GOTO(out_ladvise, rc);
		OBD_FREE(k_ladvise_hdr, alloc_size);
	case LL_IOC_FLR_SET_MIRROR: {
		/* mirror I/O must be direct to avoid polluting page cache */
		if (!(file->f_flags & O_DIRECT))
		fd->fd_designated_mirror = (__u32)arg;
	case LL_IOC_FSGETXATTR:
		RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
	case LL_IOC_FSSETXATTR:
		RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
		RETURN(put_user(PAGE_SIZE, (int __user *)arg));
		/* unknown commands go to the data (OST) export */
		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void __user *)arg));
3563 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Commit a computed seek position: reject negative offsets (unless the
 * fd allows unsigned offsets) and offsets beyond @maxsize, otherwise
 * update f_pos and reset f_version (error returns elided here). */
static inline loff_t
llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
	if (offset > maxsize)
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
/*
 * Local fallback for kernels without generic_file_llseek_size(): compute
 * the new file offset for SEEK_SET/CUR/END (and the SEEK_DATA/SEEK_HOLE
 * style cases visible below) bounded by @maxsize, with @eof as the
 * current end of file.
 */
generic_file_llseek_size(struct file *file, loff_t offset, int origin,
			 loff_t maxsize, loff_t eof)
	struct inode *inode = file_inode(file);
	/* Here we special-case the lseek(fd, 0, SEEK_CUR)
	 * position-querying operation. Avoid rewriting the "same"
	 * f_pos value back to the file because a concurrent read(),
	 * write() or lseek() might have altered it */
	/* f_lock protects against read/modify/write race with other
	 * SEEK_CURs. Note that parallel writes and reads behave */
	offset = llseek_execute(file, file->f_pos + offset, maxsize);
	inode_unlock(inode);
	/* In the generic case the entire file is data, so as long as
	 * offset isn't at the end of the file then the offset is data. */
	/* There is a virtual hole at the end of the file, so as long as
	 * offset isn't i_size or larger, return i_size. */
	return llseek_execute(file, offset, maxsize);
/*
 * llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide
 * size must first be fetched with ll_glimpse_size(), then the generic
 * size-bounded llseek computes the final position.
 */
static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
	struct inode *inode = file_inode(file);
	loff_t retval, eof = 0;
	/* provisional target, for the debug trace only */
	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
			   (origin == SEEK_CUR) ? file->f_pos : 0);
	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
	       PFID(ll_inode2fid(inode)), inode, retval, retval,
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
		/* refresh i_size from the OSTs before using it */
		retval = ll_glimpse_size(inode);
		eof = i_size_read(inode);
	retval = ll_generic_file_llseek_size(file, offset, origin,
					     ll_file_maxbytes(inode), eof);
/*
 * flush handler (close(2) path): report — and clear — any async
 * writeback error recorded for this inode, unless the application has
 * already been told about the failure on this fd.
 */
static int ll_flush(struct file *file, fl_owner_t id)
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	LASSERT(!S_ISDIR(inode->i_mode));
	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	rc = lli->lli_async_rc;
	lli->lli_async_rc = 0;
	if (lli->lli_clob != NULL) {
		err = lov_read_and_clear_async_rc(lli->lli_clob);
	/* The application has been told write failure already.
	 * Do not report failure again. */
	if (fd->fd_write_failed)
	return rc ? -EIO : 0;
3682 * Called to make sure a portion of file has been written out.
3683 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3685 * Return how many pages have been written.
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
	struct cl_fsync_io *fio;
	/* reject unknown fsync modes up front */
	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
	env = cl_env_get(&refcheck);
		RETURN(PTR_ERR(env));
	io = vvp_env_thread_io(env);
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;
	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_start = start;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;
	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
		result = io->ci_result;
	/* on success the return value is the page count written */
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_put(env, &refcheck);
3730 * When dentry is provided (the 'else' case), file_dentry() may be
3731 * null and dentry must be used directly rather than pulled from
3732 * file_dentry() as is done otherwise.
/*
 * fsync handler; three signatures depending on kernel version (range
 * based, 2-arg, or legacy dentry form).  Flushes dirty pages, reports
 * recorded async write errors, syncs metadata via md_fsync() and data
 * via cl_sync_file_range(CL_FSYNC_ALL).
 */
#ifdef HAVE_FILE_FSYNC_4ARGS
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
	struct dentry *dentry = file_dentry(file);
#elif defined(HAVE_FILE_FSYNC_2ARGS)
int ll_fsync(struct file *file, int datasync)
	struct dentry *dentry = file_dentry(file);
	loff_t end = LLONG_MAX;
int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
	loff_t end = LLONG_MAX;
	struct inode *inode = dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
#ifdef HAVE_FILE_FSYNC_4ARGS
	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	/* avoid double-locking if the caller already holds the inode lock */
	lock_inode = !lli->lli_inode_locked;
	/* fsync's caller has already called _fdata{sync,write}, we want
	 * that IO to finish before calling the osc and mdc sync methods */
	rc = filemap_fdatawait(inode->i_mapping);
	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
	if (lli->lli_clob != NULL) {
		err = lov_read_and_clear_async_rc(lli->lli_clob);
	/* sync metadata on the MDT */
	err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
	ptlrpc_req_finished(req);
	if (S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
		/* remember write status so ll_flush() won't report it twice */
			fd->fd_write_failed = true;
			fd->fd_write_failed = false;
#ifdef HAVE_FILE_FSYNC_4ARGS
	inode_unlock(inode);
/*
 * flock/fcntl lock handler: translate the kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDS, then mirror the result into the local
 * lock bookkeeping (locks_lock_file_wait() and friends).
 */
ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
	struct inode *inode = file_inode(file);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_FLOCK,
		.ei_cb_cp = ldlm_flock_completion_ast,
		.ei_cbdata = file_lock,
	struct md_op_data *op_data;
	struct lustre_handle lockh = { 0 };
	union ldlm_policy_data flock = { { 0 } };
	int fl_type = file_lock->fl_type;
	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
	       PFID(ll_inode2fid(inode)), file_lock);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
	if (file_lock->fl_flags & FL_FLOCK) {
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
		/* flocks are whole-file locks */
		flock.l_flock.end = OFFSET_MAX;
		/* For flocks owner is determined by the local file desctiptor*/
		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
	} else if (file_lock->fl_flags & FL_POSIX) {
		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
		flock.l_flock.start = file_lock->fl_start;
		flock.l_flock.end = file_lock->fl_end;
	flock.l_flock.pid = file_lock->fl_pid;
	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
		/* F_RDLCK maps to a shared (PR) DLM lock */
		einfo.ei_mode = LCK_PR;
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		/* F_WRLCK maps to an exclusive (PW) DLM lock */
		einfo.ei_mode = LCK_PW;
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
		/* non-blocking set request */
		flags = LDLM_FL_BLOCK_NOWAIT;
		/* F_GETLK-style query: test only, take no lock */
		flags = LDLM_FL_TEST_LOCK;
		CERROR("unknown fcntl lock command: %d\n", cmd);
	/* Save the old mode so that if the mode in the lock changes we
	 * can decrement the appropriate reader or writer refcount. */
	file_lock->fl_type = einfo.ei_mode;
	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));
	CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
	       "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
	       flock.l_flock.pid, flags, einfo.ei_mode,
	       flock.l_flock.start, flock.l_flock.end);
	rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
	/* Restore the file lock type if not TEST lock. */
	if (!(flags & LDLM_FL_TEST_LOCK))
		file_lock->fl_type = fl_type;
#ifdef HAVE_LOCKS_LOCK_FILE_WAIT
	if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2 = locks_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2 = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2 = posix_lock_file_wait(file, file_lock);
#endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
	/* local bookkeeping failed: back out the server-side lock */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
	ll_finish_md_op_data(op_data);
/*
 * Look up @name under @parent on the MDS and return its FID; when
 * @inode is non-NULL the child inode is instantiated from the reply
 * as well.
 *
 * \param[in]  parent	directory to search in
 * \param[in]  name	entry name (need not be NUL-terminated)
 * \param[in]  namelen	length of @name
 * \param[out] fid	FID of the entry
 * \param[out] inode	optionally, the instantiated child inode
 */
int ll_get_fid_by_name(struct inode *parent, const char *name,
		       int namelen, struct lu_fid *fid,
		       struct inode **inode)
	struct md_op_data *op_data = NULL;
	struct mdt_body *body;
	struct ptlrpc_request *req;
	op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));
	/* only FID and type are needed from the getattr-by-name */
	op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
	rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		GOTO(out_req, rc = -EFAULT);
	*fid = body->mbo_fid1;
	rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
	ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx
 * ("lfs migrate -m" path): find the child inode, take a write lease for
 * regular files, then issue a rename RPC with CLI_MIGRATE set.
 *
 * NOTE(review): this excerpt omits many lines (rc checks, braces, the
 * migrate_attempted/retry plumbing); comments describe only visible code.
 */
3989 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3990 const char *name, int namelen)
3992 struct dentry *dchild = NULL;
3993 struct inode *child_inode = NULL;
3994 struct md_op_data *op_data;
3995 struct ptlrpc_request *request = NULL;
3996 struct obd_client_handle *och = NULL;
3998 struct mdt_body *body;
4000 __u64 data_version = 0;
4003 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
4004 name, PFID(ll_inode2fid(parent)), mdtidx);
4006 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4007 0, LUSTRE_OPC_ANY, NULL);
4008 if (IS_ERR(op_data))
4009 RETURN(PTR_ERR(op_data));
4011 /* Get child FID first */
/* Prefer the dcache: if the child dentry is cached, grab its inode. */
4012 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4015 dchild = d_lookup(file_dentry(file), &qstr);
4016 if (dchild != NULL) {
4017 if (dchild->d_inode != NULL)
4018 child_inode = igrab(dchild->d_inode);
/* dcache miss: fall back to an MDS lookup by name. */
4022 if (child_inode == NULL) {
4023 rc = ll_get_fid_by_name(parent, name, namelen,
4024 &op_data->op_fid3, &child_inode);
4029 if (child_inode == NULL)
4030 GOTO(out_free, rc = -EINVAL);
4033 * lfs migrate command needs to be blocked on the client
4034 * by checking the migrate FID against the FID of the
/* Refuse to migrate the filesystem root. */
4037 if (child_inode == parent->i_sb->s_root->d_inode)
4038 GOTO(out_iput, rc = -EINVAL);
4040 inode_lock(child_inode);
4041 op_data->op_fid3 = *ll_inode2fid(child_inode);
4042 if (!fid_is_sane(&op_data->op_fid3)) {
4043 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4044 ll_get_fsname(parent->i_sb, NULL, 0), name,
4045 PFID(&op_data->op_fid3));
4046 GOTO(out_unlock, rc = -EINVAL);
/* Nothing to do if the file already lives on the target MDT. */
4049 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4051 GOTO(out_unlock, rc);
4054 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4055 PFID(&op_data->op_fid3), mdtidx);
4056 GOTO(out_unlock, rc = 0);
/*
 * For regular files take a write lease and record the data version so
 * the MDT can verify the data was not modified during migration.
 */
4059 if (S_ISREG(child_inode->i_mode)) {
4060 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4064 GOTO(out_unlock, rc);
4067 rc = ll_data_version(child_inode, &data_version,
4070 GOTO(out_close, rc);
4072 op_data->op_handle = och->och_fh;
4073 op_data->op_data = och->och_mod;
4074 op_data->op_data_version = data_version;
4075 op_data->op_lease_handle = och->och_lease_handle;
4076 op_data->op_bias |= MDS_RENAME_MIGRATE;
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
4079 op_data->op_mds = mdtidx;
4080 op_data->op_cli_flags = CLI_MIGRATE;
4081 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4082 namelen, name, namelen, &request);
4084 LASSERT(request != NULL);
4085 ll_update_times(request, parent);
4087 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4088 LASSERT(body != NULL);
4090 /* If the server does release layout lock, then we cleanup
4091 * the client och here, otherwise release it in out_close: */
4093 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4094 obd_mod_put(och->och_mod);
4095 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
/* Poison the handle so the later lease close becomes a no-op. */
4097 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4103 if (request != NULL) {
4104 ptlrpc_req_finished(request);
4108 /* Try again if the file layout has changed. */
4109 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4113 if (och != NULL) /* close the file */
4114 ll_lease_close(och, child_inode, NULL);
/* On success drop the local nlink; the entry moved to another MDT. */
4116 clear_nlink(child_inode);
4118 inode_unlock(child_inode);
4122 ll_finish_md_op_data(op_data);
/*
 * flock/lock handler for "-o noflock" mounts; body not visible in this
 * excerpt — presumably returns -ENOSYS (see ll_file_operations_noflock).
 */
4127 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4135 * test if some locks matching bits and l_req_mode are acquired
4136 * - bits can be in different locks
4137 * - if found clear the common lock bits in *bits
4138 * - the bits not found, are kept in *bits
4140 * \param bits [IN] searched lock bits [IN]
4141 * \param l_req_mode [IN] searched lock mode
4142 * \retval boolean, true iff all bits are found
4144 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4146 struct lustre_handle lockh;
4147 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match CR|CW|PR|PW. */
4148 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4149 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4158 fid = &ll_i2info(inode)->lli_fid;
4159 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4160 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks. */
4162 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit separately until all are accounted for. */
4163 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4164 policy.l_inodebits.bits = *bits & (1 << i);
4165 if (policy.l_inodebits.bits == 0)
4168 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4169 &policy, mode, &lockh)) {
4170 struct ldlm_lock *lock;
4172 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
4175 ~(lock->l_policy_data.l_inodebits.bits);
4176 LDLM_LOCK_PUT(lock);
4178 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MDS inodebits lock
 * covering @bits on @inode; returns the matched mode (0 if none).
 * Unlike ll_have_md_lock() this does NOT use LDLM_FL_TEST_LOCK, so the
 * caller owns a lock reference in @lockh on success.
 */
4185 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4186 struct lustre_handle *lockh, __u64 flags,
4187 enum ldlm_mode mode)
4189 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4194 fid = &ll_i2info(inode)->lli_fid;
4195 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4197 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4198 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate the
 * common -ENOENT (unlinked) case into success where appropriate and log
 * unexpected errors.
 */
4203 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4205 /* Already unlinked. Just update nlink and return success */
4206 if (rc == -ENOENT) {
4208 /* If it is striped directory, and there is bad stripe
4209 * Let's revalidate the dentry again, instead of returning
4211 if (S_ISDIR(inode->i_mode) &&
4212 ll_i2info(inode)->lli_lsm_md != NULL)
4215 /* This path cannot be hit for regular files unless in
4216 * case of obscure races, so no need to to validate
4218 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4220 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected under root squash etc.: log quietly. */
4221 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4222 "%s: revalidate FID "DFID" error: rc = %d\n",
4223 ll_get_fsname(inode->i_sb, NULL, 0),
4224 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDS using an intent
 * lock of type @op (IT_GETATTR/IT_LOOKUP). Refreshes attributes, drops
 * the dentry if the file was unlinked, and releases intent locks.
 */
4230 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4232 struct inode *inode = dentry->d_inode;
4233 struct obd_export *exp = ll_i2mdexp(inode);
4234 struct lookup_intent oit = {
4237 struct ptlrpc_request *req = NULL;
4238 struct md_op_data *op_data;
4242 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4243 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4245 /* Call getattr by fid, so do not provide name at all. */
4246 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4247 LUSTRE_OPC_ANY, NULL);
4248 if (IS_ERR(op_data))
4249 RETURN(PTR_ERR(op_data));
4251 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4252 ll_finish_md_op_data(op_data);
4254 rc = ll_inode_revalidate_fini(inode, rc);
4258 rc = ll_revalidate_it_finish(req, &oit, dentry);
4260 ll_intent_release(&oit);
4264 /* Unlinked? Unhash dentry, so it is not picked up later by
4265 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4266 * here to preserve get_cwd functionality on 2.6.
4268 if (!dentry->d_inode->i_nlink) {
4269 ll_lock_dcache(inode);
4270 d_lustre_invalidate(dentry, 0);
4271 ll_unlock_dcache(inode);
4274 ll_lookup_finish_locks(&oit, dentry);
4276 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (nlink,
 * blocks, size, times) into the master inode via md_merge_attr().
 */
4281 static int ll_merge_md_attr(struct inode *inode)
4283 struct cl_attr attr = { 0 };
4286 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4287 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4288 &attr, ll_md_blocking_ast);
/* Apply the merged attributes to the VFS inode and the llite info. */
4292 set_nlink(inode, attr.cat_nlink);
4293 inode->i_blocks = attr.cat_blocks;
4294 i_size_write(inode, attr.cat_size);
4296 ll_i2info(inode)->lli_atime = attr.cat_atime;
4297 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4298 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Squash a device number so it survives the 32-bit compat stat path:
 * truncate major/minor to 8 bits each so old_valid_dev() accepts it.
 */
4303 static inline dev_t ll_compat_encode_dev(dev_t dev)
4305 /* The compat_sys_*stat*() syscalls will fail unless the
4306 * device majors and minors are both less than 256. Note that
4307 * the value returned here will be passed through
4308 * old_encode_dev() in cp_compat_stat(). And so we are not
4309 * trying to return a valid compat (u16) device number, just
4310 * one that will pass the old_valid_dev() check. */
4312 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for llite: revalidate the inode with the MDS, glimpse the
 * size from OSTs for regular files (unless a restore is in progress),
 * merge stripe attributes for striped directories, then fill *stat.
 * Two signatures are compiled depending on the kernel's inode_operations
 * getattr prototype.
 */
4315 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4316 int ll_getattr(const struct path *path, struct kstat *stat,
4317 u32 request_mask, unsigned int flags)
4319 struct dentry *de = path->dentry;
4321 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4324 struct inode *inode = de->d_inode;
4325 struct ll_sb_info *sbi = ll_i2sbi(inode);
4326 struct ll_inode_info *lli = ll_i2info(inode);
4329 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4331 rc = ll_inode_revalidate(de, IT_GETATTR);
4335 if (S_ISREG(inode->i_mode)) {
4336 /* In case of restore, the MDT has the right size and has
4337 * already send it back without granting the layout lock,
4338 * inode is up-to-date so glimpse is useless.
4339 * Also to glimpse we need the layout, in case of a running
4340 * restore the MDT holds the layout lock so the glimpse will
4341 * block up to the end of restore (getattr will block)
4343 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4344 rc = ll_glimpse_size(inode);
4349 /* If object isn't regular a file then don't validate size. */
4350 if (S_ISDIR(inode->i_mode) &&
4351 lli->lli_lsm_md != NULL) {
4352 rc = ll_merge_md_attr(inode);
/* Non-regular, non-striped: just copy cached times into the inode. */
4357 LTIME_S(inode->i_atime) = lli->lli_atime;
4358 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4359 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4362 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit clients need squashed ino/dev numbers. */
4364 if (ll_need_32bit_api(sbi)) {
4365 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4366 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4367 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4369 stat->ino = inode->i_ino;
4370 stat->dev = inode->i_sb->s_dev;
4371 stat->rdev = inode->i_rdev;
4374 stat->mode = inode->i_mode;
4375 stat->uid = inode->i_uid;
4376 stat->gid = inode->i_gid;
4377 stat->atime = inode->i_atime;
4378 stat->mtime = inode->i_mtime;
4379 stat->ctime = inode->i_ctime;
/* Prefer the tunable stat blksize if the admin set one. */
4380 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4382 stat->nlink = inode->i_nlink;
4383 stat->size = i_size_read(inode);
4384 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * Lustre struct fiemap, run ll_do_fiemap(), and copy the mapped extents
 * back to the user buffer.
 *
 * NOTE(review): only the FIRST extent is copied in from userspace
 * (sizeof(struct fiemap_extent), not extent_count * size) — this matches
 * upstream Lustre, which uses extent 0 as the continuation cookie; verify
 * before "fixing".
 */
4389 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4390 __u64 start, __u64 len)
4394 struct fiemap *fiemap;
4395 unsigned int extent_count = fieinfo->fi_extents_max;
4397 num_bytes = sizeof(*fiemap) + (extent_count *
4398 sizeof(struct fiemap_extent));
4399 OBD_ALLOC_LARGE(fiemap, num_bytes);
4404 fiemap->fm_flags = fieinfo->fi_flags;
4405 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4406 fiemap->fm_start = start;
4407 fiemap->fm_length = len;
4408 if (extent_count > 0 &&
4409 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4410 sizeof(struct fiemap_extent)) != 0)
4411 GOTO(out, rc = -EFAULT);
4413 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Propagate the result flags/counts and the mapped extents back. */
4415 fieinfo->fi_flags = fiemap->fm_flags;
4416 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4417 if (extent_count > 0 &&
4418 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4419 fiemap->fm_mapped_extents *
4420 sizeof(struct fiemap_extent)) != 0)
4421 GOTO(out, rc = -EFAULT);
4423 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL under
 * lli_lock. The VFS releases the reference after the permission check.
 */
4427 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4429 struct ll_inode_info *lli = ll_i2info(inode);
4430 struct posix_acl *acl = NULL;
4433 spin_lock(&lli->lli_lock);
4434 /* VFS' acl_permission_check->check_acl will release the refcount */
4435 acl = posix_acl_dup(lli->lli_posix_acl);
4436 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): serialize @acl to a POSIX ACL xattr and send it to the
 * MDS via md_setxattr(); a NULL @acl removes the xattr. Updates the
 * local ACL cache on success.
 */
4441 #ifdef HAVE_IOP_SET_ACL
4442 #ifdef CONFIG_FS_POSIX_ACL
4443 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4445 struct ll_sb_info *sbi = ll_i2sbi(inode);
4446 struct ptlrpc_request *req = NULL;
4447 const char *name = NULL;
4449 size_t value_size = 0;
4454 case ACL_TYPE_ACCESS:
4455 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* Access ACL may imply a mode change; let the VFS recompute it. */
4457 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4460 case ACL_TYPE_DEFAULT:
4461 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs are only meaningful on directories. */
4462 if (!S_ISDIR(inode->i_mode))
4463 rc = acl ? -EACCES : 0;
4474 value_size = posix_acl_xattr_size(acl->a_count);
4475 value = kmalloc(value_size, GFP_NOFS);
4477 GOTO(out, rc = -ENOMEM);
4479 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4481 GOTO(out_value, rc);
/* NULL value means remove (OBD_MD_FLXATTRRM) rather than set. */
4484 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4485 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4486 name, value, value_size, 0, 0, &req);
4488 ptlrpc_req_finished(req);
/* Keep the VFS ACL cache coherent with what the MDS now holds. */
4493 forget_cached_acl(inode, type);
4495 set_cached_acl(inode, type, acl);
4498 #endif /* CONFIG_FS_POSIX_ACL */
4499 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL checker passed to generic_permission() on kernels whose
 * generic_permission() takes a check_acl callback. Bails out in RCU
 * walk mode (cannot block) and otherwise checks the access ACL.
 */
4501 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4503 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4504 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4506 ll_check_acl(struct inode *inode, int mask)
4509 # ifdef CONFIG_FS_POSIX_ACL
4510 struct posix_acl *acl;
4514 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* RCU-walk: ll_get_acl() may block, so punt to ref-walk. */
4515 if (flags & IPERM_FLAG_RCU)
4518 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4523 rc = posix_acl_permission(inode, acl, mask);
4524 posix_acl_release(acl);
4527 # else /* !CONFIG_FS_POSIX_ACL */
4529 # endif /* CONFIG_FS_POSIX_ACL */
4531 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission(): revalidate the root inode on first access, apply root
 * squash by temporarily overriding credentials (fsuid/fsgid and FS
 * capabilities), then defer to generic_permission()/ll_check_acl.
 * Three signatures are compiled depending on kernel version.
 */
4533 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4534 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4536 # ifdef HAVE_INODE_PERMISION_2ARGS
4537 int ll_inode_permission(struct inode *inode, int mask)
4539 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4544 struct ll_sb_info *sbi;
4545 struct root_squash_info *squash;
4546 struct cred *cred = NULL;
4547 const struct cred *old_cred = NULL;
4549 bool squash_id = false;
/* Cannot block (prepare_creds, RPCs) during RCU path walk. */
4552 #ifdef MAY_NOT_BLOCK
4553 if (mask & MAY_NOT_BLOCK)
4555 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4556 if (flags & IPERM_FLAG_RCU)
4560 /* as root inode are NOT getting validated in lookup operation,
4561 * need to do it before permission check. */
4563 if (inode == inode->i_sb->s_root->d_inode) {
4564 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4569 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4570 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4572 /* squash fsuid/fsgid if needed */
4573 sbi = ll_i2sbi(inode);
4574 squash = &sbi->ll_squash;
/* Squash only applies to root (fsuid 0) when not exempted. */
4575 if (unlikely(squash->rsi_uid != 0 &&
4576 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4577 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4581 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4582 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4583 squash->rsi_uid, squash->rsi_gid);
4585 /* update current process's credentials
4586 * and FS capability */
4587 cred = prepare_creds();
4591 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4592 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4593 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4594 if ((1 << cap) & CFS_CAP_FS_MASK)
4595 cap_lower(cred->cap_effective, cap);
4597 old_cred = override_creds(cred);
4600 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4601 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4602 /* restore current process's credentials and FS capability */
4604 revert_creds(old_cred);
4611 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the kernel falls back to
 * local-only (per-client) POSIX/flock locking. */
4612 struct file_operations ll_file_operations = {
4613 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4614 # ifdef HAVE_SYNC_READ_WRITE
4615 .read = new_sync_read,
4616 .write = new_sync_write,
4618 .read_iter = ll_file_read_iter,
4619 .write_iter = ll_file_write_iter,
4620 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4621 .read = ll_file_read,
4622 .aio_read = ll_file_aio_read,
4623 .write = ll_file_write,
4624 .aio_write = ll_file_aio_write,
4625 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4626 .unlocked_ioctl = ll_file_ioctl,
4627 .open = ll_file_open,
4628 .release = ll_file_release,
4629 .mmap = ll_file_mmap,
4630 .llseek = ll_file_seek,
4631 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock": cluster-coherent flock/POSIX locks
 * routed through ll_file_flock (DLM-backed). */
4636 struct file_operations ll_file_operations_flock = {
4637 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4638 # ifdef HAVE_SYNC_READ_WRITE
4639 .read = new_sync_read,
4640 .write = new_sync_write,
4641 # endif /* HAVE_SYNC_READ_WRITE */
4642 .read_iter = ll_file_read_iter,
4643 .write_iter = ll_file_write_iter,
4644 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4645 .read = ll_file_read,
4646 .aio_read = ll_file_aio_read,
4647 .write = ll_file_write,
4648 .aio_write = ll_file_aio_write,
4649 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4650 .unlocked_ioctl = ll_file_ioctl,
4651 .open = ll_file_open,
4652 .release = ll_file_release,
4653 .mmap = ll_file_mmap,
4654 .llseek = ll_file_seek,
4655 .splice_read = ll_file_splice_read,
4658 .flock = ll_file_flock,
4659 .lock = ll_file_flock
4662 /* These are for -o noflock - to return ENOSYS on flock calls */
4663 struct file_operations ll_file_operations_noflock = {
4664 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4665 # ifdef HAVE_SYNC_READ_WRITE
4666 .read = new_sync_read,
4667 .write = new_sync_write,
4668 # endif /* HAVE_SYNC_READ_WRITE */
4669 .read_iter = ll_file_read_iter,
4670 .write_iter = ll_file_write_iter,
4671 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4672 .read = ll_file_read,
4673 .aio_read = ll_file_aio_read,
4674 .write = ll_file_write,
4675 .aio_write = ll_file_aio_write,
4676 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4677 .unlocked_ioctl = ll_file_ioctl,
4678 .open = ll_file_open,
4679 .release = ll_file_release,
4680 .mmap = ll_file_mmap,
4681 .llseek = ll_file_seek,
4682 .splice_read = ll_file_splice_read,
/* Both hooks point at ll_file_noflock, which rejects lock requests. */
4685 .flock = ll_file_noflock,
4686 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL hooks are compiled in
 * only on kernels that still route them through inode_operations. */
4689 struct inode_operations ll_file_inode_operations = {
4690 .setattr = ll_setattr,
4691 .getattr = ll_getattr,
4692 .permission = ll_inode_permission,
4693 #ifdef HAVE_IOP_XATTR
4694 .setxattr = ll_setxattr,
4695 .getxattr = ll_getxattr,
4696 .removexattr = ll_removexattr,
4698 .listxattr = ll_listxattr,
4699 .fiemap = ll_fiemap,
4700 #ifdef HAVE_IOP_GET_ACL
4701 .get_acl = ll_get_acl,
4703 #ifdef HAVE_IOP_SET_ACL
4704 .set_acl = ll_set_acl,
/*
 * Push a layout configuration change down to the cl_object stack
 * (cl_conf_set). For OBJECT_CONF_SET, allow DLM matching on the layout
 * lock only AFTER the layout has been applied, and record the new
 * layout generation.
 */
4708 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4710 struct ll_inode_info *lli = ll_i2info(inode);
4711 struct cl_object *obj = lli->lli_clob;
4720 env = cl_env_get(&refcheck);
4722 RETURN(PTR_ERR(env));
4724 rc = cl_conf_set(env, lli->lli_clob, conf);
4728 if (conf->coc_opc == OBJECT_CONF_SET) {
4729 struct ldlm_lock *lock = conf->coc_lock;
4730 struct cl_layout cl = {
4734 LASSERT(lock != NULL);
4735 LASSERT(ldlm_has_layout(lock));
4737 /* it can only be allowed to match after layout is
4738 * applied to inode otherwise false layout would be
4739 * seen. Applying layout shoud happen before dropping
4740 * the intent lock. */
4741 ldlm_lock_allow_match(lock);
4743 rc = cl_object_layout_get(env, obj, &cl);
4748 DFID": layout version change: %u -> %u\n",
4749 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4751 ll_layout_version_set(lli, cl.cl_layout_gen);
4755 cl_env_put(env, &refcheck);
4760 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4761 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4764 struct ll_sb_info *sbi = ll_i2sbi(inode);
4765 struct ptlrpc_request *req;
4766 struct mdt_body *body;
4773 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4774 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4775 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already cached in the lock's LVB: nothing to fetch. */
4777 if (lock->l_lvb_data != NULL)
4780 /* if layout lock was granted right away, the layout is returned
4781 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4782 * blocked and then granted via completion ast, we have to fetch
4783 * layout here. Please note that we can't use the LVB buffer in
4784 * completion AST because it doesn't have a large enough buffer */
4785 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA (layout) from the MDT by getxattr. */
4787 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4788 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4792 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4794 GOTO(out, rc = -EPROTO);
4796 lmmsize = body->mbo_eadatasize;
4797 if (lmmsize == 0) /* empty layout */
4800 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4802 GOTO(out, rc = -EFAULT);
4804 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4805 if (lvbdata == NULL)
4806 GOTO(out, rc = -ENOMEM);
/* Install the fetched layout as the lock's LVB, unless another
 * thread raced us and already did so. */
4808 memcpy(lvbdata, lmm, lmmsize);
4809 lock_res_and_lock(lock);
4810 if (unlikely(lock->l_lvb_data == NULL)) {
4811 lock->l_lvb_type = LVB_T_LAYOUT;
4812 lock->l_lvb_data = lvbdata;
4813 lock->l_lvb_len = lmmsize;
4816 unlock_res_and_lock(lock);
/* Raced: free our copy, the other thread's LVB stands. */
4819 OBD_FREE_LARGE(lvbdata, lmmsize);
4824 ptlrpc_req_finished(req);
4829 * Apply the layout to the inode. Layout lock is held and will be released
4832 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4833 struct inode *inode)
4835 struct ll_inode_info *lli = ll_i2info(inode);
4836 struct ll_sb_info *sbi = ll_i2sbi(inode);
4837 struct ldlm_lock *lock;
4838 struct cl_object_conf conf;
4841 bool wait_layout = false;
4844 LASSERT(lustre_handle_is_used(lockh));
4846 lock = ldlm_handle2lock(lockh);
4847 LASSERT(lock != NULL);
4848 LASSERT(ldlm_has_layout(lock));
4850 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4851 PFID(&lli->lli_fid), inode);
4853 /* in case this is a caching lock and reinstate with new inode */
4854 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4856 lock_res_and_lock(lock);
4857 lvb_ready = ldlm_is_lvb_ready(lock);
4858 unlock_res_and_lock(lock);
4860 /* checking lvb_ready is racy but this is okay. The worst case is
4861 * that multi processes may configure the file on the same time. */
/* Ensure the layout blob is in the lock's LVB before applying it. */
4865 rc = ll_layout_fetch(inode, lock);
4869 /* for layout lock, lmm is stored in lock's lvb.
4870 * lvb_data is immutable if the lock is held so it's safe to access it
4873 * set layout to file. Unlikely this will fail as old layout was
4874 * surely eliminated */
4875 memset(&conf, 0, sizeof conf);
4876 conf.coc_opc = OBJECT_CONF_SET;
4877 conf.coc_inode = inode;
4878 conf.coc_lock = lock;
4879 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4880 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4881 rc = ll_layout_conf(inode, &conf);
4883 /* refresh layout failed, need to wait */
4884 wait_layout = rc == -EBUSY;
/* Done with the lock: drop our reference and the enqueue ref. */
4887 LDLM_LOCK_PUT(lock);
4888 ldlm_lock_decref(lockh, mode);
4890 /* wait for IO to complete if it's still being used. */
4892 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4893 ll_get_fsname(inode->i_sb, NULL, 0),
4894 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout
 * drains, then the caller retries the refresh. */
4896 memset(&conf, 0, sizeof conf);
4897 conf.coc_opc = OBJECT_CONF_WAIT;
4898 conf.coc_inode = inode;
4899 rc = ll_layout_conf(inode, &conf);
4903 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4904 ll_get_fsname(inode->i_sb, NULL, 0),
4905 PFID(&lli->lli_fid), rc);
4911 * Issue layout intent RPC to MDS.
4912 * \param inode [in] file inode
4913 * \param intent [in] layout intent
4915 * \retval 0 on success
4916 * \retval < 0 error code
4918 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4920 struct ll_inode_info *lli = ll_i2info(inode);
4921 struct ll_sb_info *sbi = ll_i2sbi(inode);
4922 struct md_op_data *op_data;
4923 struct lookup_intent it;
4924 struct ptlrpc_request *req;
4928 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4929 0, 0, LUSTRE_OPC_ANY, NULL);
4930 if (IS_ERR(op_data))
4931 RETURN(PTR_ERR(op_data));
/* The layout_intent structure rides in op_data as opaque data. */
4933 op_data->op_data = intent;
4934 op_data->op_data_size = sizeof(*intent);
4936 memset(&it, 0, sizeof(it));
4937 it.it_op = IT_LAYOUT;
/* Write/truncate intents need FMODE_WRITE so the MDS instantiates
 * the layout for writing. */
4938 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4939 intent->li_opc == LAYOUT_INTENT_TRUNC)
4940 it.it_flags = FMODE_WRITE;
4942 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4943 ll_get_fsname(inode->i_sb, NULL, 0),
4944 PFID(&lli->lli_fid), inode);
4946 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4947 &ll_md_blocking_ast, 0);
4948 if (it.it_request != NULL)
4949 ptlrpc_req_finished(it.it_request);
4950 it.it_request = NULL;
4952 ll_finish_md_op_data(op_data);
4954 /* set lock data in case this is a new lock */
4956 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4958 ll_intent_drop_lock(&it);
4964 * This function checks if there exists a LAYOUT lock on the client side,
4965 * or enqueues it if it doesn't have one in cache.
4967 * This function will not hold layout lock so it may be revoked any time after
4968 * this function returns. Any operations depend on layout should be redone
4971 * This function should be called before lov_io_init() to get an uptodate
4972 * layout version, the caller should save the version number and after IO
4973 * is finished, this function should be called again to verify that layout
4974 * is not changed during IO time.
4976 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4978 struct ll_inode_info *lli = ll_i2info(inode);
4979 struct ll_sb_info *sbi = ll_i2sbi(inode);
4980 struct lustre_handle lockh;
4981 struct layout_intent intent = {
4982 .li_opc = LAYOUT_INTENT_ACCESS,
4984 enum ldlm_mode mode;
/* Fast path: layout locks disabled, or a valid generation cached. */
4988 *gen = ll_layout_version_get(lli);
4989 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4993 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4994 LASSERT(S_ISREG(inode->i_mode));
4996 /* take layout lock mutex to enqueue layout lock exclusively. */
4997 mutex_lock(&lli->lli_layout_mutex);
5000 /* mostly layout lock is caching on the local side, so try to
5001 * match it before grabbing layout lock mutex. */
5002 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5003 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5004 if (mode != 0) { /* hit cached lock */
5005 rc = ll_layout_lock_set(&lockh, mode, inode);
/* No cached lock: enqueue a new layout lock via intent RPC. */
5011 rc = ll_layout_intent(inode, &intent);
5017 *gen = ll_layout_version_get(lli);
5018 mutex_unlock(&lli->lli_layout_mutex);
5024 * Issue layout intent RPC indicating where in a file an IO is about to write.
5026 * \param[in] inode file inode.
5027 * \param[in] ext write range with start offset of fille in bytes where
5028 * an IO is about to write, and exclusive end offset in
5031 * \retval 0 on success
5032 * \retval < 0 error code
5034 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5035 struct lu_extent *ext)
5037 struct layout_intent intent = {
5039 .li_extent.e_start = ext->e_start,
5040 .li_extent.e_end = ext->e_end,
5045 rc = ll_layout_intent(inode, &intent);
5051 * This function send a restore request to the MDT
5053 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5055 struct hsm_user_request *hur;
5059 len = sizeof(struct hsm_user_request) +
5060 sizeof(struct hsm_user_item);
5061 OBD_ALLOC(hur, len);
5065 hur->hur_request.hr_action = HUA_RESTORE;
5066 hur->hur_request.hr_archive_id = 0;
5067 hur->hur_request.hr_flags = 0;
5068 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5069 sizeof(hur->hur_user_item[0].hui_fid));
5070 hur->hur_user_item[0].hui_extent.offset = offset;
5071 hur->hur_user_item[0].hui_extent.length = length;
5072 hur->hur_request.hr_itemcount = 1;
5073 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,