4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/*
 * Allocate a per-open-file ll_file_data from its dedicated slab cache.
 * NOTE(review): the allocation-failure check and return statement are not
 * visible in this excerpt -- lines appear to be elided.
 */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
/* GFP_NOFS: may run in filesystem context, so avoid fs re-entry on reclaim */
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab cache it was allocated from. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
/* Initialize op_data for an arbitrary MD op on this inode (no name). */
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the in-core attributes so the MDT sees the final state. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle this CLOSE applies to. */
107 op_data->op_handle = och->och_fh;
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * NOTE(review): several lines (switch statement head, break statements,
 * closing braces, RETURN) are elided in this excerpt; comments below only
 * describe the visible code.
 */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Sanity: we need a live MDC export to send the close RPC through. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* fallthrough into the SPLIT/SWAP handling below */
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
/* SPLIT: data is a struct split_param (victim inode + mirror id) */
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
/* MERGE/SWAP: data is the victim inode itself */
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
/* Ship the resynced mirror ids to the MDT along with the close. */
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
/* data is the expected data version; MDT releases only if unchanged */
191 op_data->op_bias |= MDS_HSM_RELEASE;
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
198 LASSERT(data == NULL);
202 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error */
203 if (rc != 0 && rc != -EINTR)
204 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
205 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If a bias was requested, check whether the MDT actually executed it. */
207 if (rc == 0 && op_data->op_bias & bias) {
208 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
215 ll_finish_md_op_data(op_data);
/* Handle is dead now; poison it so stray reuse is detectable. */
219 md_clear_open_replay_data(md_exp, och);
220 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
223 ptlrpc_req_finished(req); /* This is close request */
/*
 * Close the MDS open handle of the given mode (read/write/exec) for @inode,
 * unless other file descriptors still reference it.
 */
227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
229 struct ll_inode_info *lli = ll_i2info(inode);
230 struct obd_client_handle **och_p;
231 struct obd_client_handle *och;
/* Pick the handle slot and use count that match the open mode. */
236 if (fmode & FMODE_WRITE) {
237 och_p = &lli->lli_mds_write_och;
238 och_usecount = &lli->lli_open_fd_write_count;
239 } else if (fmode & FMODE_EXEC) {
240 och_p = &lli->lli_mds_exec_och;
241 och_usecount = &lli->lli_open_fd_exec_count;
243 LASSERT(fmode & FMODE_READ);
244 och_p = &lli->lli_mds_read_och;
245 och_usecount = &lli->lli_open_fd_read_count;
248 mutex_lock(&lli->lli_och_mutex);
249 if (*och_usecount > 0) {
250 /* There are still users of this handle, so skip
252 mutex_unlock(&lli->lli_och_mutex);
258 mutex_unlock(&lli->lli_och_mutex);
261 /* There might be a race and this handle may already
263 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock/lease, decrement the open
 * counts, and close the MDS handle unless a cached OPEN lock lets us
 * skip talking to the MDS.  Frees @file's ll_file_data in all cases.
 */
269 static int ll_md_close(struct inode *inode, struct file *file)
271 union ldlm_policy_data policy = {
272 .l_inodebits = { MDS_INODELOCK_OPEN },
274 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
276 struct ll_inode_info *lli = ll_i2info(inode);
277 struct lustre_handle lockh;
278 enum ldlm_mode lockmode;
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
/* fd_och holds an open handle acquired for a lease; close it too. */
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 mutex_lock(&lli->lli_och_mutex);
307 if (fd->fd_omode & FMODE_WRITE) {
309 LASSERT(lli->lli_open_fd_write_count);
310 lli->lli_open_fd_write_count--;
311 } else if (fd->fd_omode & FMODE_EXEC) {
313 LASSERT(lli->lli_open_fd_exec_count);
314 lli->lli_open_fd_exec_count--;
317 LASSERT(lli->lli_open_fd_read_count);
318 lli->lli_open_fd_read_count--;
320 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close RPC now. */
322 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
323 LDLM_IBITS, &policy, lockmode, &lockh))
324 rc = ll_md_real_close(inode, fd->fd_omode);
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
333 /* While this returns an error code, fput() the caller does not, so we need
334 * to make every effort to clean up all of our state here. Also, applications
335 * rarely check close errors and even if an error is returned they will not
336 * re-try the close call.
338 int ll_file_release(struct inode *inode, struct file *file)
340 struct ll_file_data *fd;
341 struct ll_sb_info *sbi = ll_i2sbi(inode);
342 struct ll_inode_info *lli = ll_i2info(inode);
346 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
347 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
349 if (inode->i_sb->s_root != file_dentry(file))
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354 /* The last ref on @file, maybe not the owner pid of statahead,
355 * because parent and child process can share the same file handle. */
356 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
357 ll_deauthorize_statahead(inode, fd);
/* Root dentry never has an MDS open handle; just drop our private data. */
359 if (inode->i_sb->s_root == file_dentry(file)) {
360 LUSTRE_FPRIVATE(file) = NULL;
361 ll_file_data_put(fd);
/* For regular files, surface any async write errors recorded on the
 * cl_object so close() can report them. */
365 if (!S_ISDIR(inode->i_mode)) {
366 if (lli->lli_clob != NULL)
367 lov_read_and_clear_async_rc(lli->lli_clob);
368 lli->lli_async_rc = 0;
371 rc = ll_md_close(inode, file);
373 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
374 libcfs_debug_dumplog();
/*
 * Send an IT_OPEN intent to the MDS for @de and fill the dentry's inode
 * from the reply.  @lmm/@lmmsize optionally carry a striping request.
 */
379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
380 struct lookup_intent *itp)
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keep own exit path - don`t flood log
414 * with messages with -ESTALE errors.
/* The MDS may have opened the file even when the intent failed
 * overall; release that handle so it does not leak. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Success: instantiate/refresh the inode and stash the lock data. */
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
440 /* We did open by fid, but by the time we got to the server,
441 * the object disappeared. If this is a create, we cannot really
442 * tell the userspace that the file it was trying to create
443 * does not exist. Instead let's return -ESTALE, and the VFS will
444 * retry the create with LOOKUP_REVAL that we are going to catch
445 * in ll_revalidate_dentry() and use lookup then.
447 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Populate @och from the MDT reply carried in @it and register it for
 * open replay.  Returns the result of md_set_open_replay_data().
 */
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 struct obd_client_handle *och)
456 struct mdt_body *body;
458 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
459 och->och_fh = body->mbo_handle;
460 och->och_fid = body->mbo_fid1;
461 och->och_lease_handle.cookie = it->it_lock_handle;
462 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
463 och->och_flags = it->it_flags;
465 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: fill @och (if given) from the
 * intent, attach @fd to the file and initialize its readahead/CLIO state.
 */
468 static int ll_local_open(struct file *file, struct lookup_intent *it,
469 struct ll_file_data *fd, struct obd_client_handle *och)
471 struct inode *inode = file_inode(file);
474 LASSERT(!LUSTRE_FPRIVATE(file));
481 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
486 LUSTRE_FPRIVATE(file) = fd;
487 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits; other it_flags are open flags. */
488 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
490 /* ll_cl_context initialize */
491 rwlock_init(&fd->fd_lock);
492 INIT_LIST_HEAD(&fd->fd_lccs);
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * NOTE(review): this excerpt elides a number of lines (error paths, loop
 * restart, closing braces); comments below describe only the visible code.
 */
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
522 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed here by the lookup path. */
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_openerr, rc = -ENOMEM);
532 if (S_ISDIR(inode->i_mode))
533 ll_authorize_statahead(inode, fd);
/* Opening the fs root needs no MDS open handle. */
535 if (inode->i_sb->s_root == file_dentry(file)) {
536 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN (oit). */
540 if (!it || !it->it_disposition) {
541 /* Convert f_flags into access mode. We cannot use file->f_mode,
542 * because everything but O_ACCMODE mask was stripped from
544 if ((oit.it_flags + 1) & O_ACCMODE)
546 if (file->f_flags & O_TRUNC)
547 oit.it_flags |= FMODE_WRITE;
549 /* kernel only call f_op->open in dentry_open. filp_open calls
550 * dentry_open after call to open_namei that checks permissions.
551 * Only nfsd_open call dentry_open directly without checking
552 * permissions and because of that this code below is safe. */
553 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
554 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556 /* We do not want O_EXCL here, presumably we opened the file
557 * already? XXX - NFS implications? */
558 oit.it_flags &= ~O_EXCL;
560 /* bug20584, if "it_flags" contains O_CREAT, the file will be
561 * created if necessary, then "IT_CREAT" should be set to keep
562 * consistent with it */
563 if (oit.it_flags & O_CREAT)
564 oit.it_op |= IT_CREAT;
570 /* Let's see if we have file open on MDS already. */
571 if (it->it_flags & FMODE_WRITE) {
572 och_p = &lli->lli_mds_write_och;
573 och_usecount = &lli->lli_open_fd_write_count;
574 } else if (it->it_flags & FMODE_EXEC) {
575 och_p = &lli->lli_mds_exec_och;
576 och_usecount = &lli->lli_open_fd_exec_count;
578 och_p = &lli->lli_mds_read_och;
579 och_usecount = &lli->lli_open_fd_read_count;
582 mutex_lock(&lli->lli_och_mutex);
583 if (*och_p) { /* Open handle is present */
584 if (it_disposition(it, DISP_OPEN_OPEN)) {
585 /* Well, there's extra open request that we do not need,
586 let's close it somehow. This will decref request. */
587 rc = it_open_error(DISP_OPEN_OPEN, it);
589 mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 ll_release_openhandle(file_dentry(file), it);
/* Reuse the existing MDS handle; NULL och means "share *och_p". */
597 rc = ll_local_open(file, it, fd, NULL);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
604 LASSERT(*och_usecount == 0);
605 if (!it->it_disposition) {
606 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
607 /* We cannot just request lock handle now, new ELC code
608 means that one of other OPEN locks for this file
609 could be cancelled, and since blocking ast handler
610 would attempt to grab och_mutex as well, that would
611 result in a deadlock */
612 mutex_unlock(&lli->lli_och_mutex);
614 * Normally called under two situations:
616 * 2. A race/condition on MDS resulting in no open
617 * handle to be returned from LOOKUP|OPEN request,
618 * for example if the target entry was a symlink.
620 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
621 * marked by a bit set in ll_iget_for_nfs. Clear the
622 * bit so that it's not confusing later callers.
624 * NB; when ldd is NULL, it must have come via normal
625 * lookup path only, since ll_iget_for_nfs always calls
628 if (ldd && ldd->lld_nfs_dentry) {
629 ldd->lld_nfs_dentry = 0;
630 it->it_flags |= MDS_OPEN_LOCK;
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
641 GOTO(out_openerr, rc);
645 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
647 GOTO(out_och_free, rc = -ENOMEM);
651 /* md_intent_lock() didn't get a request ref if there was an
652 * open error, so don't do cleanup on the request here
654 /* XXX (green): Should not we bail out on any error here, not
655 * just open error? */
656 rc = it_open_error(DISP_OPEN_OPEN, it);
658 GOTO(out_och_free, rc);
660 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
661 "inode %p: disposition %x, status %d\n", inode,
662 it_disposition(it, ~0), it->it_status);
664 rc = ll_local_open(file, it, fd, *och_p);
666 GOTO(out_och_free, rc);
668 mutex_unlock(&lli->lli_och_mutex);
671 /* Must do this outside lli_och_mutex lock to prevent deadlock where
672 different kind of OPEN lock for this same inode gets cancelled
673 by ldlm_cancel_lru */
674 if (!S_ISREG(inode->i_mode))
675 GOTO(out_och_free, rc);
677 cl_lov_delay_create_clear(&file->f_flags);
678 GOTO(out_och_free, rc);
/* Error path: undo handle allocation and fd/statahead setup. */
682 if (och_p && *och_p) {
683 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
684 *och_p = NULL; /* OBD_FREE writes some magic there */
687 mutex_unlock(&lli->lli_och_mutex);
690 if (lli->lli_opendir_key == fd)
691 ll_deauthorize_statahead(inode, fd);
693 ll_file_data_put(fd);
695 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the open-request reference taken by the intent, if any. */
698 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
699 ptlrpc_req_finished(it->it_request);
700 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on conflict, asynchronously cancel the
 * lease lock (which is how a lease gets "broken").  Unlike the normal
 * ll_md_blocking_ast, it performs no openhandle cleanup.
 */
706 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
707 struct ldlm_lock_desc *desc, void *data, int flag)
710 struct lustre_handle lockh;
714 case LDLM_CB_BLOCKING:
715 ldlm_lock2handle(lock, &lockh);
716 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
718 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
722 case LDLM_CB_CANCELING:
730 * When setting a lease on a file, we take ownership of the lli_mds_*_och
731 * and save it as fd->fd_och so as to force client to reopen the file even
732 * if it has an open lock in cache already.
734 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
735 struct lustre_handle *old_handle)
737 struct ll_inode_info *lli = ll_i2info(inode);
738 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
739 struct obd_client_handle **och_p;
744 /* Get the openhandle of the file */
745 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor at a time. */
746 if (fd->fd_lease_och != NULL)
747 GOTO(out_unlock, rc = -EBUSY);
749 if (fd->fd_och == NULL) {
750 if (file->f_mode & FMODE_WRITE) {
751 LASSERT(lli->lli_mds_write_och != NULL);
752 och_p = &lli->lli_mds_write_och;
753 och_usecount = &lli->lli_open_fd_write_count;
755 LASSERT(lli->lli_mds_read_och != NULL);
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
/* Cannot take ownership while other fds share this handle. */
760 if (*och_usecount > 1)
761 GOTO(out_unlock, rc = -EBUSY);
/* Hand the old open handle back so the MDT can match owners. */
768 *old_handle = fd->fd_och->och_fh;
772 mutex_unlock(&lli->lli_och_mutex);
777 * Release ownership on lli_mds_*_och when putting back a file lease.
779 static int ll_lease_och_release(struct inode *inode, struct file *file)
781 struct ll_inode_info *lli = ll_i2info(inode);
782 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
783 struct obd_client_handle **och_p;
784 struct obd_client_handle *old_och = NULL;
789 mutex_lock(&lli->lli_och_mutex);
790 if (file->f_mode & FMODE_WRITE) {
791 och_p = &lli->lli_mds_write_och;
792 och_usecount = &lli->lli_open_fd_write_count;
794 och_p = &lli->lli_mds_read_och;
795 och_usecount = &lli->lli_open_fd_read_count;
798 /* The file may have been open by another process (broken lease) so
799 * *och_p is not NULL. In this case we should simply increase usecount
802 if (*och_p != NULL) {
/* Slot already occupied: our fd_och becomes redundant, close it. */
803 old_och = fd->fd_och;
810 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
813 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
819 * Acquire a lease and open the file.
821 static struct obd_client_handle *
822 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
825 struct lookup_intent it = { .it_op = IT_OPEN };
826 struct ll_sb_info *sbi = ll_i2sbi(inode);
827 struct md_op_data *op_data;
828 struct ptlrpc_request *req = NULL;
829 struct lustre_handle old_handle = { 0 };
830 struct obd_client_handle *och = NULL;
/* A lease is either read or write, never both and never exec. */
835 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
836 RETURN(ERR_PTR(-EINVAL));
839 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
840 RETURN(ERR_PTR(-EPERM));
842 rc = ll_lease_och_acquire(inode, file, &old_handle);
849 RETURN(ERR_PTR(-ENOMEM));
851 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
852 LUSTRE_OPC_ANY, NULL);
854 GOTO(out, rc = PTR_ERR(op_data));
856 /* To tell the MDT this openhandle is from the same owner */
857 op_data->op_handle = old_handle;
859 it.it_flags = fmode | open_flags;
860 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
861 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
862 &ll_md_blocking_lease_ast,
863 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
864 * it can be cancelled which may mislead applications that the lease is
866 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
867 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
868 * doesn't deal with openhandle, so normal openhandle will be leaked. */
869 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
870 ll_finish_md_op_data(op_data);
871 ptlrpc_req_finished(req);
873 GOTO(out_release_it, rc);
875 if (it_disposition(&it, DISP_LOOKUP_NEG))
876 GOTO(out_release_it, rc = -ENOENT);
878 rc = it_open_error(DISP_OPEN_OPEN, &it);
880 GOTO(out_release_it, rc);
882 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
883 ll_och_fill(sbi->ll_md_exp, &it, och);
885 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
886 GOTO(out_close, rc = -EOPNOTSUPP);
888 /* already get lease, handle lease lock */
889 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
890 if (it.it_lock_mode == 0 ||
891 it.it_lock_bits != MDS_INODELOCK_OPEN) {
892 /* open lock must return for lease */
893 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
894 PFID(ll_inode2fid(inode)), it.it_lock_mode,
896 GOTO(out_close, rc = -EPROTO);
899 ll_intent_release(&it);
903 /* Cancel open lock */
904 if (it.it_lock_mode != 0) {
905 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
908 och->och_lease_handle.cookie = 0ULL;
910 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
912 CERROR("%s: error closing file "DFID": %d\n",
913 ll_get_fsname(inode->i_sb, NULL, 0),
914 PFID(&ll_i2info(inode)->lli_fid), rc2);
915 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
917 ll_intent_release(&it);
925 * Check whether a layout swap can be done between two inodes.
927 * \param[in] inode1 First inode to check
928 * \param[in] inode2 Second inode to check
930 * \retval 0 on success, layout swap can be performed between both inodes
931 * \retval negative error code if requirements are not met
933 static int ll_check_swap_layouts_validity(struct inode *inode1,
934 struct inode *inode2)
/* Only regular files have layouts to swap. */
936 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller must be allowed to write both files. */
939 if (inode_permission(inode1, MAY_WRITE) ||
940 inode_permission(inode2, MAY_WRITE))
/* Both inodes must live on the same Lustre mount. */
943 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with a MDS_CLOSE_LAYOUT_SWAP bias, swapping layouts between
 * @inode and @inode2 atomically with the close on the MDT.
 */
949 static int ll_swap_layouts_close(struct obd_client_handle *och,
950 struct inode *inode, struct inode *inode2)
952 const struct lu_fid *fid1 = ll_inode2fid(inode);
953 const struct lu_fid *fid2;
957 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
958 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
960 rc = ll_check_swap_layouts_validity(inode, inode2);
962 GOTO(out_free_och, rc);
964 /* We now know that inode2 is a lustre inode */
965 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
967 rc = lu_fid_cmp(fid1, fid2);
969 GOTO(out_free_och, rc = -EINVAL);
971 /* Close the file and {swap,merge} layouts between inode & inode2.
972 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
973 * because we still need it to pack l_remote_handle to MDT. */
974 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
977 och = NULL; /* freed in ll_close_inode_openhandle() */
987 * Release lease and close the file.
988 * It will check if the lease has ever broken.
990 static int ll_lease_close_intent(struct obd_client_handle *och,
992 bool *lease_broken, enum mds_op_bias bias,
995 struct ldlm_lock *lock;
996 bool cancelled = true;
/* A lease is "broken" iff its lease lock has been cancelled. */
1000 lock = ldlm_handle2lock(&och->och_lease_handle);
1002 lock_res_and_lock(lock);
1003 cancelled = ldlm_is_cancel(lock);
1004 unlock_res_and_lock(lock);
1005 LDLM_LOCK_PUT(lock);
1008 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1009 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1011 if (lease_broken != NULL)
1012 *lease_broken = cancelled;
/* Lease still intact and no intent requested: just cancel the lock. */
1014 if (!cancelled && !bias)
1015 ldlm_cli_cancel(&och->och_lease_handle, 0);
1017 if (cancelled) { /* no need to execute intent */
1022 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease close: no intent/bias, just report whether it was broken. */
1026 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1029 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1033 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1035 static int ll_lease_file_resync(struct obd_client_handle *och,
1036 struct inode *inode)
1038 struct ll_sb_info *sbi = ll_i2sbi(inode);
1039 struct md_op_data *op_data;
1040 __u64 data_version_unused;
1044 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1045 LUSTRE_OPC_ANY, NULL);
1046 if (IS_ERR(op_data))
1047 RETURN(PTR_ERR(op_data));
1049 /* before starting file resync, it's necessary to clean up page cache
1050 * in client memory, otherwise once the layout version is increased,
1051 * writing back cached data will be denied the OSTs. */
1052 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* The lease handle proves to the MDT that we own the resync lease. */
1056 op_data->op_handle = och->och_lease_handle;
1057 rc = md_file_resync(sbi->ll_md_exp, op_data);
1063 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with size/blocks/timestamps obtained
 * from the OSTs (via the cl_object), updating the VFS inode under the
 * inode size lock.
 */
1067 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1069 struct ll_inode_info *lli = ll_i2info(inode);
1070 struct cl_object *obj = lli->lli_clob;
1071 struct cl_attr *attr = vvp_env_thread_attr(env);
1079 ll_inode_size_lock(inode);
1081 /* Merge timestamps the most recently obtained from MDS with
1082 * timestamps obtained from OSTs.
1084 * Do not overwrite atime of inode because it may be refreshed
1085 * by file_accessed() function. If the read was served by cache
1086 * data, there is no RPC to be sent so that atime may not be
1087 * transferred to OSTs at all. MDT only updates atime at close time
1088 * if it's at least 'mdd.*.atime_diff' older.
1089 * All in all, the atime in Lustre does not strictly comply with
1090 * POSIX. Solving this problem needs to send an RPC to MDT for each
1091 * read, this will hurt performance. */
1092 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1093 LTIME_S(inode->i_atime) = lli->lli_atime;
1094 lli->lli_update_atime = 0;
1096 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1097 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Work on local copies; the inode is only updated on success below. */
1099 atime = LTIME_S(inode->i_atime);
1100 mtime = LTIME_S(inode->i_mtime);
1101 ctime = LTIME_S(inode->i_ctime);
1103 cl_object_attr_lock(obj);
1104 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1107 rc = cl_object_attr_get(env, obj, attr);
1108 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for the caller. */
1111 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep the newest of the MDS and OST timestamps. */
1113 if (atime < attr->cat_atime)
1114 atime = attr->cat_atime;
1116 if (ctime < attr->cat_ctime)
1117 ctime = attr->cat_ctime;
1119 if (mtime < attr->cat_mtime)
1120 mtime = attr->cat_mtime;
1122 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1123 PFID(&lli->lli_fid), attr->cat_size);
1125 i_size_write(inode, attr->cat_size);
1126 inode->i_blocks = attr->cat_blocks;
1128 LTIME_S(inode->i_atime) = atime;
1129 LTIME_S(inode->i_mtime) = mtime;
1130 LTIME_S(inode->i_ctime) = ctime;
1133 ll_inode_size_unlock(inode);
1139 * Set designated mirror for I/O.
1141 * So far only read, write, and truncated can support to issue I/O to
1142 * designated mirror.
1144 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 /* clear layout version for generic(non-resync) I/O in case it carries
1149 * stale layout version due to I/O restart */
1150 io->ci_layout_version = 0;
1152 /* FLR: disable non-delay for designated mirror I/O because obviously
1153 * only one mirror is available */
1154 if (fd->fd_designated_mirror > 0) {
/* Pin the I/O to the mirror/layout chosen via the fd (resync path). */
1156 io->ci_designated_mirror = fd->fd_designated_mirror;
1157 io->ci_layout_version = fd->fd_layout_version;
1158 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1162 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1163 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Return true if atime updates should be suppressed for @file, checking
 * the file, inode, mount and superblock noatime flags in turn.
 */
1166 static bool file_is_noatime(const struct file *file)
1168 const struct vfsmount *mnt = file->f_path.mnt;
1169 const struct inode *inode = file_inode((struct file *)file);
1171 /* Adapted from file_accessed() and touch_atime().*/
1172 if (file->f_flags & O_NOATIME)
1175 if (inode->i_flags & S_NOATIME)
1178 if (IS_NOATIME(inode))
1181 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1184 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1187 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Forward declaration: parallel-task worker used by ll_io_init below. */
1193 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: seed the rw sub-state,
 * choose the lock policy and set noatime/parallel-IO/mirror options.
 */
1195 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1197 struct inode *inode = file_inode(file);
1198 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1200 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1201 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1202 io->u.ci_rw.rw_file = file;
1203 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1204 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1205 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1207 if (iot == CIT_WRITE) {
1208 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1209 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1210 file->f_flags & O_DIRECT ||
1213 io->ci_obj = ll_i2info(inode)->lli_clob;
1214 io->ci_lockreq = CILR_MAYBE;
/* With nolock mounts skip DLM locking entirely; O_APPEND needs it. */
1215 if (ll_file_nolock(file)) {
1216 io->ci_lockreq = CILR_NEVER;
1217 io->ci_no_srvlock = 1;
1218 } else if (file->f_flags & O_APPEND) {
1219 io->ci_lockreq = CILR_MANDATORY;
1221 io->ci_noatime = file_is_noatime(file);
/* Parallel I/O only when enabled on the mount and not appending. */
1222 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1223 io->ci_pio = !io->u.ci_rw.rw_append;
1227 /* FLR: only use non-delay I/O for read as there is only one
1228 * available mirror for write. */
1229 io->ci_ndelay = !(iot == CIT_WRITE);
1231 ll_io_set_mirror(io, file);
/*
 * ll_file_io_ptask(): worker callback for parallel (split) file I/O.
 *
 * Runs one chunk of a read or write described by the cl_io_pt carried
 * in the ptask: builds a fresh cl_io for the chunk (with ci_pio
 * cleared, since this IS the parallel task), runs the cl_io loop, then
 * folds the byte count back into the shared cl_io_pt state and advances
 * its iov_iter/kiocb so a restart resumes where this chunk stopped.
 *
 * Returns 0 if any bytes were transferred, otherwise the cl_io error.
 */
1234 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1236 struct cl_io_pt *pt = ptask->pt_cbdata;
1237 struct file *file = pt->cip_file;
1240 loff_t pos = pt->cip_pos;
1245 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1246 file_dentry(file)->d_name.name,
1247 pt->cip_iot == CIT_READ ? "read" : "write",
1248 pos, pos + pt->cip_count);
1250 env = cl_env_get(&refcheck);
1252 RETURN(PTR_ERR(env));
1254 io = vvp_env_thread_io(env);
1255 ll_io_init(io, file, pt->cip_iot);
1256 io->u.ci_rw.rw_iter = pt->cip_iter;
1257 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1258 io->ci_pio = 0; /* It's already in parallel task */
1260 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1261 pt->cip_count - pt->cip_result);
1263 struct vvp_io *vio = vvp_env_io(env);
1265 vio->vui_io_subtype = IO_NORMAL;
1266 vio->vui_fd = LUSTRE_FPRIVATE(file);
1268 ll_cl_add(file, env, io, LCC_RW);
1269 rc = cl_io_loop(env, io);
1270 ll_cl_remove(file, env);
1272 /* cl_io_rw_init() handled IO */
     /* fault-injection hook for testing the ptask failure path */
1276 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1282 if (io->ci_nob > 0) {
1283 pt->cip_result += io->ci_nob;
1284 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1286 pt->cip_iocb.ki_pos = pos;
1287 #ifdef HAVE_KIOCB_KI_LEFT
1288 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1289 #elif defined(HAVE_KI_NBYTES)
1290 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1294 cl_io_fini(env, io);
1295 cl_env_put(env, &refcheck);
1297 pt->cip_need_restart = io->ci_need_restart;
1299 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1300 file_dentry(file)->d_name.name,
1301 pt->cip_iot == CIT_READ ? "read" : "write",
1302 pt->cip_result, rc);
1304 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * ll_file_io_generic(): common engine for all llite read/write paths
 * (normal iter I/O and splice).
 *
 * For each attempt: initialize a cl_io, take the per-inode range lock
 * when required (all writes; direct-IO reads — see LU-6227), run the
 * cl_io loop, then advance position/iter by the bytes transferred.
 * The loop restarts when the cl_io layer requests it (e.g. layout
 * change or FLR mirror retry), carrying ci_ndelay_tried across
 * attempts. On completion, per-mount read/write statistics are updated
 * and fd_write_failed is tracked for fsync error reporting.
 *
 * Returns bytes transferred if any, otherwise the cl_io error code.
 */
1308 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1309 struct file *file, enum cl_io_type iot,
1310 loff_t *ppos, size_t count)
1312 struct range_lock range;
1313 struct vvp_io *vio = vvp_env_io(env);
1314 struct inode *inode = file_inode(file);
1315 struct ll_inode_info *lli = ll_i2info(inode);
1316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1321 unsigned retried = 0;
1322 bool restarted = false;
1326 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1327 file_dentry(file)->d_name.name,
1328 iot == CIT_READ ? "read" : "write", pos, pos + count);
1331 io = vvp_env_thread_io(env);
1332 ll_io_init(io, file, iot);
1333 if (args->via_io_subtype == IO_NORMAL) {
1334 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1335 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1337 if (args->via_io_subtype != IO_NORMAL || restarted)
1339 io->ci_ndelay_tried = retried;
1341 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1342 bool range_locked = false;
     /* O_APPEND must lock to EOF since the final write offset is
      * unknown until the size is sampled under lock */
1344 if (file->f_flags & O_APPEND)
1345 range_lock_init(&range, 0, LUSTRE_EOF);
1347 range_lock_init(&range, pos, pos + count - 1);
1349 vio->vui_fd = LUSTRE_FPRIVATE(file);
1350 vio->vui_io_subtype = args->via_io_subtype;
1352 switch (vio->vui_io_subtype) {
1354 /* Direct IO reads must also take range lock,
1355 * or multiple reads will try to work on the same pages
1356 * See LU-6227 for details. */
1357 if (((iot == CIT_WRITE) ||
1358 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1359 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1360 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1362 rc = range_lock(&lli->lli_write_tree, &range);
1366 range_locked = true;
1370 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1371 vio->u.splice.vui_flags = args->u.splice.via_flags;
1374 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1378 ll_cl_add(file, env, io, LCC_RW);
     /* parallel writes take the inode lock here (outside the cl_io
      * loop) so remove-suid handling is serialized */
1379 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1380 !lli->lli_inode_locked) {
1382 lli->lli_inode_locked = 1;
1384 rc = cl_io_loop(env, io);
1385 if (lli->lli_inode_locked) {
1386 lli->lli_inode_locked = 0;
1387 inode_unlock(inode);
1389 ll_cl_remove(file, env);
1392 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1394 range_unlock(&lli->lli_write_tree, &range);
1397 /* cl_io_rw_init() handled IO */
1401 if (io->ci_nob > 0) {
1402 result += io->ci_nob;
1403 count -= io->ci_nob;
1405 if (args->via_io_subtype == IO_NORMAL) {
1406 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1408 args->u.normal.via_iocb->ki_pos = pos;
1409 #ifdef HAVE_KIOCB_KI_LEFT
1410 args->u.normal.via_iocb->ki_left = count;
1411 #elif defined(HAVE_KI_NBYTES)
1412 args->u.normal.via_iocb->ki_nbytes = count;
1416 pos = io->u.ci_rw.rw_range.cir_pos;
1420 cl_io_fini(env, io);
1423 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1424 file->f_path.dentry->d_name.name,
1425 iot, rc, result, io->ci_need_restart);
1427 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1429 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1430 file_dentry(file)->d_name.name,
1431 iot == CIT_READ ? "read" : "write",
1432 pos, pos + count, result, rc);
1433 /* preserve the tried count for FLR */
1434 retried = io->ci_ndelay_tried;
1439 if (iot == CIT_READ) {
1441 ll_stats_ops_tally(ll_i2sbi(inode),
1442 LPROC_LL_READ_BYTES, result);
1443 } else if (iot == CIT_WRITE) {
1445 ll_stats_ops_tally(ll_i2sbi(inode),
1446 LPROC_LL_WRITE_BYTES, result);
1447 fd->fd_write_failed = false;
1448 } else if (result == 0 && rc == 0) {
1451 fd->fd_write_failed = true;
1453 fd->fd_write_failed = false;
1454 } else if (rc != -ERESTARTSYS) {
1455 fd->fd_write_failed = true;
1459 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1460 file_dentry(file)->d_name.name,
1461 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1465 RETURN(result > 0 ? result : rc);
1469 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1470 * especially for small I/O.
1472 * To serve a read request, CLIO has to create and initialize a cl_io and
1473 * then request DLM lock. This has turned out to have significant overhead
1474 * and affects the performance of small I/O dramatically.
1476 * It's not necessary to create a cl_io for each I/O. Under the help of read
1477 * ahead, most of the pages being read are already in memory cache and we can
1478 * read those pages directly because if the pages exist, the corresponding DLM
1479 * lock must exist so that page content must be valid.
1481 * In fast read implementation, the llite speculatively finds and reads pages
1482 * in memory cache. There are three scenarios for fast read:
1483 * - If the page exists and is uptodate, kernel VM will provide the data and
1484 * CLIO won't be intervened;
1485 * - If the page was brought into memory by read ahead, it will be exported
1486 * and read ahead parameters will be updated;
1487 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1488 * it will go back and invoke normal read, i.e., a cl_io will be created
1489 * and DLM lock will be requested.
1491 * POSIX compliance: posix standard states that read is intended to be atomic.
1492 * Lustre read implementation is in line with Linux kernel read implementation
1493 * and neither of them complies with POSIX standard in this matter. Fast read
1494 * doesn't make the situation worse on single node but it may interleave write
1495 * results from multiple nodes due to short read handling in ll_file_aio_read().
1497 * \param env - lu_env
1498 * \param iocb - kiocb from kernel
1499 * \param iter - user space buffers where the data will be copied
1501 * \retval - number of bytes have been read, or error code if error occurred.
/* See the block comment above for the full fast-read design notes. */
1504 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
     /* fast read is a per-mount tunable; bail out if disabled */
1508 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1511 /* NB: we can't do direct IO for fast read because it will need a lock
1512 * to make IO engine happy. */
1513 if (iocb->ki_filp->f_flags & O_DIRECT)
     /* serve what we can straight from the page cache */
1516 result = generic_file_read_iter(iocb, iter);
1518 /* If the first page is not in cache, generic_file_aio_read() will be
1519 * returned with -ENODATA.
1520 * See corresponding code in ll_readpage(). */
1521 if (result == -ENODATA)
1525 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1526 LPROC_LL_READ_BYTES, result);
1532 * Read from a file (through the page cache).
/*
 * Read from a file (through the page cache).
 *
 * First tries the lock-free fast-read path; if data remains (fast read
 * was short or skipped) falls back to the full cl_io path via
 * ll_file_io_generic() and combines the results.
 */
1534 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1537 struct vvp_io_args *args;
1542 result = ll_do_fast_read(iocb, to);
1543 if (result < 0 || iov_iter_count(to) == 0)
1546 env = cl_env_get(&refcheck);
1548 return PTR_ERR(env);
1550 args = ll_env_args(env, IO_NORMAL);
1551 args->u.normal.via_iter = to;
1552 args->u.normal.via_iocb = iocb;
1554 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1555 &iocb->ki_pos, iov_iter_count(to));
1558 else if (result == 0)
1561 cl_env_put(env, &refcheck);
1567 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1568 * If a page is already in the page cache and dirty (and some other things -
1569 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1570 * write to it without doing a full I/O, because Lustre already knows about it
1571 * and will write it out. This saves a lot of processing time.
1573 * All writes here are within one page, so exclusion is handled by the page
1574 * lock on the vm page. Exception is appending, which requires locking the
1575 * full file to handle size issues. We do not do tiny writes for writes which
1576 * touch multiple pages because it's very unlikely multiple sequential pages
1577 * are already dirty.
1579 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1580 * and are unlikely to be to already dirty pages.
1582 * Attribute updates are important here, we do it in ll_tiny_write_end.
/* See the block comment above for the tiny-write design rationale. */
1584 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1586 ssize_t count = iov_iter_count(iter);
1587 struct file *file = iocb->ki_filp;
1588 struct inode *inode = file_inode(file);
1589 struct ll_inode_info *lli = ll_i2info(inode);
1590 struct range_lock range;
1592 bool append = false;
1596 /* NB: we can't do direct IO for tiny writes because they use the page
1597 * cache, and we can't do sync writes because tiny writes can't flush
1600 if (file->f_flags & (O_DIRECT | O_SYNC))
1603 /* It is relatively unlikely we will overwrite a full dirty page, so
1604 * limit tiny writes to < PAGE_SIZE
1606 if (count >= PAGE_SIZE)
1609 /* For append writes, we must take the range lock to protect size
1610 * and also move pos to current size before writing.
1612 if (file->f_flags & O_APPEND) {
1617 range_lock_init(&range, 0, LUSTRE_EOF);
1618 result = range_lock(&lli->lli_write_tree, &range);
     /* refresh the cached size from cl attributes before sampling it */
1621 env = cl_env_get(&refcheck);
1623 GOTO(out, result = PTR_ERR(env));
1624 ll_merge_attr(env, inode);
1625 cl_env_put(env, &refcheck);
1626 iocb->ki_pos = i_size_read(inode);
1629 /* Does this write touch multiple pages?
1631 * This partly duplicates the PAGE_SIZE check above, but must come
1632 * after range locking for append writes because it depends on the
1633 * write position (ki_pos).
1635 if ((iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1638 result = __generic_file_write_iter(iocb, iter);
1640 /* If the page is not already dirty, ll_tiny_write_begin returns
1641 * -ENODATA. We continue on to normal write.
1643 if (result == -ENODATA)
1647 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1649 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1654 range_unlock(&lli->lli_write_tree, &range);
1656 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1662 * Write to a file (through the page cache).
/*
 * Write to a file (through the page cache).
 *
 * Tries the tiny-write fast path first, then falls through to the
 * normal cl_io write for whatever the tiny write did not consume,
 * combining the byte counts on success.
 */
1664 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1666 struct vvp_io_args *args;
1668 ssize_t rc_tiny, rc_normal;
1673 rc_tiny = ll_do_tiny_write(iocb, from);
1675 /* In case of error, go on and try normal write - Only stop if tiny
1676 * write completed I/O.
1678 if (iov_iter_count(from) == 0)
1679 GOTO(out, rc_normal = rc_tiny);
1681 env = cl_env_get(&refcheck);
1683 return PTR_ERR(env);
1685 args = ll_env_args(env, IO_NORMAL);
1686 args->u.normal.via_iter = from;
1687 args->u.normal.via_iocb = iocb;
1689 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1690 &iocb->ki_pos, iov_iter_count(from));
1692 /* On success, combine bytes written. */
1693 if (rc_tiny >= 0 && rc_normal > 0)
1694 rc_normal += rc_tiny;
1695 /* On error, only return error from normal write if tiny write did not
1696 * write any bytes. Otherwise return bytes written by tiny write.
1698 else if (rc_tiny > 0)
1699 rc_normal = rc_tiny;
1701 cl_env_put(env, &refcheck);
1706 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1708 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count.
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock);
 * on a bad segment, *nr_segs is truncated to the usable prefix.
 */
1710 static int ll_file_get_iov_count(const struct iovec *iov,
1711 unsigned long *nr_segs, size_t *count)
1716 for (seg = 0; seg < *nr_segs; seg++) {
1717 const struct iovec *iv = &iov[seg];
1720 * If any segment has a negative length, or the cumulative
1721 * length ever wraps negative then return -EINVAL.
1724 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1726 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1731 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio-style read entry for kernels without ->read_iter: validate the
 * iovec, wrap it in an iov_iter (API differs by kernel version) and
 * delegate to ll_file_read_iter().
 */
1738 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1739 unsigned long nr_segs, loff_t pos)
1746 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1750 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1751 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1752 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1753 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1754 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1756 result = ll_file_read_iter(iocb, &to);
1761 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1764 struct iovec iov = { .iov_base = buf, .iov_len = count };
1769 init_sync_kiocb(&kiocb, file);
1770 kiocb.ki_pos = *ppos;
1771 #ifdef HAVE_KIOCB_KI_LEFT
1772 kiocb.ki_left = count;
1773 #elif defined(HAVE_KI_NBYTES)
1774 kiocb.i_nbytes = count;
1777 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1778 *ppos = kiocb.ki_pos;
1784 * Write to a file (through the page cache).
/*
 * aio-style write entry for kernels without ->write_iter: validate the
 * iovec, wrap it in an iov_iter (API differs by kernel version) and
 * delegate to ll_file_write_iter().
 */
1787 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1788 unsigned long nr_segs, loff_t pos)
1790 struct iov_iter from;
1795 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1799 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1800 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1801 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1802 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1803 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1805 result = ll_file_write_iter(iocb, &from);
/*
 * Synchronous write(2) compatibility path for kernels without
 * ->write_iter: wrap the user buffer in a sync kiocb + iovec and
 * delegate to ll_file_aio_write(), propagating the new position.
 */
1810 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1811 size_t count, loff_t *ppos)
1813 struct iovec iov = { .iov_base = (void __user *)buf,
1820 init_sync_kiocb(&kiocb, file);
1821 kiocb.ki_pos = *ppos;
1822 #ifdef HAVE_KIOCB_KI_LEFT
1823 kiocb.ki_left = count;
1824 #elif defined(HAVE_KI_NBYTES)
1825 kiocb.ki_nbytes = count;
1828 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1829 *ppos = kiocb.ki_pos;
1833 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1836 * Send file content (through pagecache) somewhere with helper
/*
 * Send file content (through pagecache) somewhere with helper:
 * splice-read entry; runs a CIT_READ cl_io with the IO_SPLICE subtype
 * so the pages are fed into the given pipe.
 */
1838 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1839 struct pipe_inode_info *pipe, size_t count,
1843 struct vvp_io_args *args;
1848 env = cl_env_get(&refcheck);
1850 RETURN(PTR_ERR(env));
1852 args = ll_env_args(env, IO_SPLICE);
1853 args->u.splice.via_pipe = pipe;
1854 args->u.splice.via_flags = flags;
1856 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1857 cl_env_put(env, &refcheck);
/*
 * ll_lov_setstripe_ea_info(): apply new striping (LOV EA) to a file by
 * re-opening it by FID with the layout attached to the open intent,
 * then immediately releasing the open handle. The inode size lock is
 * held across the open so the layout swap is not racy vs. size updates.
 */
1861 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1862 __u64 flags, struct lov_user_md *lum, int lum_size)
1864 struct lookup_intent oit = {
1866 .it_flags = flags | MDS_OPEN_BY_FID,
1871 ll_inode_size_lock(inode);
1872 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1874 GOTO(out_unlock, rc);
1876 ll_release_openhandle(dentry, &oit);
1879 ll_inode_size_unlock(inode);
1880 ll_intent_release(&oit);
/*
 * ll_lov_getstripe_ea_info(): fetch the LOV metadata (striping EA) of
 * @filename from the MDS via md_getattr_name().
 *
 * Validates the returned magic (V1/V3/COMP_V1) and, on little-endian
 * servers talking to a big-endian client, byte-swaps the layout to host
 * endianness before handing it to the caller. On success *lmmp points
 * into the reply buffer, so the caller must keep (and later release)
 * *request.
 */
1885 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1886 struct lov_mds_md **lmmp, int *lmm_size,
1887 struct ptlrpc_request **request)
1889 struct ll_sb_info *sbi = ll_i2sbi(inode);
1890 struct mdt_body *body;
1891 struct lov_mds_md *lmm = NULL;
1892 struct ptlrpc_request *req = NULL;
1893 struct md_op_data *op_data;
1896 rc = ll_get_default_mdsize(sbi, &lmmsize);
1900 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1901 strlen(filename), lmmsize,
1902 LUSTRE_OPC_ANY, NULL);
1903 if (IS_ERR(op_data))
1904 RETURN(PTR_ERR(op_data));
1906 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1907 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1908 ll_finish_md_op_data(op_data);
1910 CDEBUG(D_INFO, "md_getattr_name failed "
1911 "on %s: rc %d\n", filename, rc);
1915 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1916 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1918 lmmsize = body->mbo_eadatasize;
1920 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1922 GOTO(out, rc = -ENODATA);
1925 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1926 LASSERT(lmm != NULL);
1928 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1929 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1930 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1931 GOTO(out, rc = -EPROTO);
1934 * This is coming from the MDS, so is probably in
1935 * little endian. We convert it to host endian before
1936 * passing it to userspace.
1938 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1941 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1942 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1943 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1944 if (le32_to_cpu(lmm->lmm_pattern) &
1945 LOV_PATTERN_F_RELEASED)
1949 /* if function called for directory - we should
1950 * avoid swabbing non-existent lsm objects */
1951 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1952 lustre_swab_lov_user_md_v1(
1953 (struct lov_user_md_v1 *)lmm);
1954 if (S_ISREG(body->mbo_mode))
1955 lustre_swab_lov_user_md_objects(
1956 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1958 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1959 lustre_swab_lov_user_md_v3(
1960 (struct lov_user_md_v3 *)lmm);
1961 if (S_ISREG(body->mbo_mode))
1962 lustre_swab_lov_user_md_objects(
1963 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1965 } else if (lmm->lmm_magic ==
1966 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1967 lustre_swab_lov_comp_md_v1(
1968 (struct lov_comp_md_v1 *)lmm);
1974 *lmm_size = lmmsize;
/*
 * ll_lov_setea(): admin-only (CAP_SYS_ADMIN) ioctl helper that copies a
 * user-supplied LOV EA (with object list, hence MDS_OPEN_HAS_OBJS) and
 * applies it via ll_lov_setstripe_ea_info().
 */
1979 static int ll_lov_setea(struct inode *inode, struct file *file,
1982 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1983 struct lov_user_md *lump;
1984 int lum_size = sizeof(struct lov_user_md) +
1985 sizeof(struct lov_user_ost_data);
1989 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1992 OBD_ALLOC_LARGE(lump, lum_size);
1996 if (copy_from_user(lump, arg, lum_size))
1997 GOTO(out_lump, rc = -EFAULT);
1999 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
     /* clear the delay-create flag regardless of the outcome */
2001 cl_lov_delay_create_clear(&file->f_flags);
2004 OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_file_getstripe(): copy the file's striping information to the
 * user buffer @lum (at most @size bytes) via cl_object_getstripe().
 */
2008 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2015 env = cl_env_get(&refcheck);
2017 RETURN(PTR_ERR(env));
2019 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2020 cl_env_put(env, &refcheck);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE handler. Copies the user
 * layout, applies it, refreshes the layout generation, and echoes the
 * resulting striping back to userspace (stripe_count zeroed first so
 * the getstripe copy-back sizes correctly).
 */
2024 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2027 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2028 struct lov_user_md *klum;
2030 __u64 flags = FMODE_WRITE;
2033 rc = ll_copy_user_md(lum, &klum);
2038 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2043 rc = put_user(0, &lum->lmm_stripe_count);
2047 rc = ll_layout_refresh(inode, &gen);
2051 rc = ll_file_getstripe(inode, arg, lum_size);
2053 cl_lov_delay_create_clear(&file->f_flags);
2056 OBD_FREE(klum, lum_size);
/*
 * ll_get_grouplock(): acquire a group lock (gid = @arg) on the whole
 * file for this file descriptor.
 *
 * Rejects gid 0, nolock mounts, and a second group lock on the same fd.
 * For PFL layouts all OST objects are instantiated up front (via a
 * layout write intent) because the group lock must cover every object
 * and new ones cannot be added while it is held. The lli_lock spinlock
 * guards the fd_flags/fd_grouplock race between concurrent callers.
 */
2061 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2063 struct ll_inode_info *lli = ll_i2info(inode);
2064 struct cl_object *obj = lli->lli_clob;
2065 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2066 struct ll_grouplock grouplock;
2071 CWARN("group id for group lock must not be 0\n");
2075 if (ll_file_nolock(file))
2076 RETURN(-EOPNOTSUPP);
2078 spin_lock(&lli->lli_lock);
2079 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2080 CWARN("group lock already existed with gid %lu\n",
2081 fd->fd_grouplock.lg_gid);
2082 spin_unlock(&lli->lli_lock);
2085 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2086 spin_unlock(&lli->lli_lock);
2089 * XXX: group lock needs to protect all OST objects while PFL
2090 * can add new OST objects during the IO, so we'd instantiate
2091 * all OST objects before getting its group lock.
2096 struct cl_layout cl = {
2097 .cl_is_composite = false,
2099 struct lu_extent ext = {
2101 .e_end = OBD_OBJECT_EOF,
2104 env = cl_env_get(&refcheck);
2106 RETURN(PTR_ERR(env));
2108 rc = cl_object_layout_get(env, obj, &cl);
2109 if (!rc && cl.cl_is_composite)
2110 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2113 cl_env_put(env, &refcheck);
2118 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2119 arg, (file->f_flags & O_NONBLOCK), &grouplock);
     /* re-check under the lock: another thread may have won the race
      * while we were enqueuing */
2123 spin_lock(&lli->lli_lock);
2124 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2125 spin_unlock(&lli->lli_lock);
2126 CERROR("another thread just won the race\n");
2127 cl_put_grouplock(&grouplock);
2131 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2132 fd->fd_grouplock = grouplock;
2133 spin_unlock(&lli->lli_lock);
2135 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock(): release the group lock held by this file
 * descriptor, verifying that one is held and that its gid matches
 * @arg. State is cleared under lli_lock; the actual DLM release
 * (cl_put_grouplock) happens after dropping the spinlock.
 */
2139 static int ll_put_grouplock(struct inode *inode, struct file *file,
2142 struct ll_inode_info *lli = ll_i2info(inode);
2143 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2144 struct ll_grouplock grouplock;
2147 spin_lock(&lli->lli_lock);
2148 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2149 spin_unlock(&lli->lli_lock);
2150 CWARN("no group lock held\n");
2154 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2156 if (fd->fd_grouplock.lg_gid != arg) {
2157 CWARN("group lock %lu doesn't match current id %lu\n",
2158 arg, fd->fd_grouplock.lg_gid);
2159 spin_unlock(&lli->lli_lock);
2163 grouplock = fd->fd_grouplock;
2164 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2165 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2166 spin_unlock(&lli->lli_lock);
2168 cl_put_grouplock(&grouplock);
2169 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2174 * Close inode open handle
2176 * \param dentry [in] dentry which contains the inode
2177 * \param it [in,out] intent which contains open info and result
2180 * \retval <0 failure
/* See the doc comment above: closes the MDS open handle carried in @it. */
2182 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2184 struct inode *inode = dentry->d_inode;
2185 struct obd_client_handle *och;
2191 /* Root ? Do nothing. */
2192 if (dentry->d_inode->i_sb->s_root == dentry)
2195 /* No open handle to close? Move away */
2196 if (!it_disposition(it, DISP_OPEN_OPEN))
2199 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2201 OBD_ALLOC(och, sizeof(*och));
2203 GOTO(out, rc = -ENOMEM);
     /* populate the client handle from the intent's open reply, then
      * send the close to the MDS */
2205 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2207 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2209 /* this one is in place of ll_file_open */
2210 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2211 ptlrpc_req_finished(it->it_request);
2212 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2218 * Get size for inode for which FIEMAP mapping is requested.
2219 * Make the FIEMAP get_info call and returns the result.
2220 * \param fiemap kernel buffer to hold extents
2221 * \param num_bytes kernel buffer size
/* See the doc comment above: run the FIEMAP get_info call for @inode. */
2223 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2229 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2232 /* Checks for fiemap flags */
2233 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2234 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2238 /* Check for FIEMAP_FLAG_SYNC */
2239 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2240 rc = filemap_fdatawrite(inode->i_mapping);
2245 env = cl_env_get(&refcheck);
2247 RETURN(PTR_ERR(env));
     /* a zero cached size may just mean we never glimpsed; refresh it */
2249 if (i_size_read(inode) == 0) {
2250 rc = ll_glimpse_size(inode);
2255 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2256 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2257 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2259 /* If filesize is 0, then there would be no objects for mapping */
2260 if (fmkey.lfik_oa.o_size == 0) {
2261 fiemap->fm_mapped_extents = 0;
2265 fmkey.lfik_fiemap = *fiemap;
2267 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2268 &fmkey, fiemap, &num_bytes);
2270 cl_env_put(env, &refcheck);
/*
 * ll_fid2path(): OBD_IOC_FID2PATH ioctl — resolve a FID to a pathname
 * via the MDC. Requires CAP_DAC_READ_SEARCH unless the mount allows
 * unprivileged fid2path. The user-supplied gf_pathlen bounds the
 * output buffer (capped at PATH_MAX).
 */
2274 int ll_fid2path(struct inode *inode, void __user *arg)
2276 struct obd_export *exp = ll_i2mdexp(inode);
2277 const struct getinfo_fid2path __user *gfin = arg;
2279 struct getinfo_fid2path *gfout;
2285 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2286 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2289 /* Only need to get the buflen */
2290 if (get_user(pathlen, &gfin->gf_pathlen))
2293 if (pathlen > PATH_MAX)
2296 outsize = sizeof(*gfout) + pathlen;
2297 OBD_ALLOC(gfout, outsize);
2301 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2302 GOTO(gf_free, rc = -EFAULT);
2303 /* append root FID after gfout to let MDT know the root FID so that it
2304 * can lookup the correct path, this is mainly for fileset.
2305 * old server without fileset mount support will ignore this. */
2306 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2308 /* Call mdc_iocontrol */
2309 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2313 if (copy_to_user(arg, gfout, outsize))
2317 OBD_FREE(gfout, outsize);
/*
 * ll_ioc_data_version(): fetch the file's data version (and layout
 * version) by running a CIT_DATA_VERSION cl_io against the clob.
 * The whole io is retried if the cl layer signals ci_need_restart.
 */
2322 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2324 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2332 ioc->idv_version = 0;
2333 ioc->idv_layout_version = UINT_MAX;
2335 /* If no file object initialized, we consider its version is 0. */
2339 env = cl_env_get(&refcheck);
2341 RETURN(PTR_ERR(env));
2343 io = vvp_env_thread_io(env);
2345 io->u.ci_data_version.dv_data_version = 0;
2346 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2347 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2350 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2351 result = cl_io_loop(env, io);
2353 result = io->ci_result;
2355 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2356 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2358 cl_io_fini(env, io);
2360 if (unlikely(io->ci_need_restart))
2363 cl_env_put(env, &refcheck);
2369 * Read the data_version for inode.
2371 * This value is computed using stripe object version on OST.
2372 * Version is computed using server side locking.
2374 * @param flags if do sync on the OST side;
2376 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2377 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* See the doc comment above: thin wrapper over ll_ioc_data_version(). */
2379 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2381 struct ioc_data_version ioc = { .idv_flags = flags };
2384 rc = ll_ioc_data_version(inode, &ioc);
2386 *data_version = ioc.idv_version;
2392 * Trigger a HSM release request for the provided inode.
/*
 * ll_hsm_release(): release the file's data to its HSM copy.
 * Takes a write lease opened with MDS_OPEN_RELEASE, flushes and samples
 * the data version, merges attributes, then closes the handle with
 * MDS_HSM_RELEASE so the MDT can atomically verify the version and
 * drop the OST objects.
 */
2394 int ll_hsm_release(struct inode *inode)
2397 struct obd_client_handle *och = NULL;
2398 __u64 data_version = 0;
2403 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2404 ll_get_fsname(inode->i_sb, NULL, 0),
2405 PFID(&ll_i2info(inode)->lli_fid));
2407 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2409 GOTO(out, rc = PTR_ERR(och));
2411 /* Grab latest data_version and [am]time values */
2412 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2416 env = cl_env_get(&refcheck);
2418 GOTO(out, rc = PTR_ERR(env));
2420 rc = ll_merge_attr(env, inode);
2421 cl_env_put(env, &refcheck);
2423 /* If error happen, we have the wrong size for a file.
2429 /* Release the file.
2430 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2431 * we still need it to pack l_remote_handle to MDT. */
2432 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2438 if (och != NULL && !IS_ERR(och)) /* close the file */
2439 ll_lease_close(och, inode, NULL);
/* Per-call working state for ll_swap_layouts(); holds the (possibly
 * FID-ordered) pair of inodes being swapped. */
2444 struct ll_swap_stack {
2447 struct inode *inode1;
2448 struct inode *inode2;
/*
 * ll_swap_layouts(): LL_IOC_LOV_SWAP_LAYOUTS — atomically exchange the
 * layouts of two files on the MDT.
 *
 * Inodes are ordered by FID so lock acquisition is deterministic. If a
 * group id is supplied, both files are group-locked to flush dirty
 * cache first. Optional data-version checks (-EAGAIN on mismatch) let
 * the caller detect concurrent modification before the swap is sent
 * via obd_iocontrol to the MDC.
 */
2453 static int ll_swap_layouts(struct file *file1, struct file *file2,
2454 struct lustre_swap_layouts *lsl)
2456 struct mdc_swap_layouts msl;
2457 struct md_op_data *op_data;
2460 struct ll_swap_stack *llss = NULL;
2463 OBD_ALLOC_PTR(llss);
2467 llss->inode1 = file_inode(file1);
2468 llss->inode2 = file_inode(file2);
2470 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2474 /* we use 2 bool because it is easier to swap than 2 bits */
2475 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2476 llss->check_dv1 = true;
2478 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2479 llss->check_dv2 = true;
2481 /* we cannot use lsl->sl_dvX directly because we may swap them */
2482 llss->dv1 = lsl->sl_dv1;
2483 llss->dv2 = lsl->sl_dv2;
2485 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2486 if (rc == 0) /* same file, done! */
2489 if (rc < 0) { /* sequentialize it */
2490 swap(llss->inode1, llss->inode2);
2492 swap(llss->dv1, llss->dv2);
2493 swap(llss->check_dv1, llss->check_dv2);
2497 if (gid != 0) { /* application asks to flush dirty cache */
2498 rc = ll_get_grouplock(llss->inode1, file1, gid);
2502 rc = ll_get_grouplock(llss->inode2, file2, gid);
2504 ll_put_grouplock(llss->inode1, file1, gid);
2509 /* ultimate check, before swapping the layouts we check if
2510 * dataversion has changed (if requested) */
2511 if (llss->check_dv1) {
2512 rc = ll_data_version(llss->inode1, &dv, 0);
2515 if (dv != llss->dv1)
2516 GOTO(putgl, rc = -EAGAIN);
2519 if (llss->check_dv2) {
2520 rc = ll_data_version(llss->inode2, &dv, 0);
2523 if (dv != llss->dv2)
2524 GOTO(putgl, rc = -EAGAIN);
2527 /* struct md_op_data is used to send the swap args to the mdt
2528 * only flags is missing, so we use struct mdc_swap_layouts
2529 * through the md_op_data->op_data */
2530 /* flags from user space have to be converted before they are send to
2531 * server, no flag is sent today, they are only used on the client */
2534 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2535 0, LUSTRE_OPC_ANY, &msl);
2536 if (IS_ERR(op_data))
2537 GOTO(free, rc = PTR_ERR(op_data));
2539 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2540 sizeof(*op_data), op_data, NULL);
2541 ll_finish_md_op_data(op_data);
     /* drop group locks in reverse acquisition order */
2548 ll_put_grouplock(llss->inode2, file2, gid);
2549 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * ll_hsm_state_set(): set/clear HSM state flags on a file via the MDC.
 * Validates the masks against HSM_FLAGS_MASK, restricts non-admin
 * callers to HSM_USER_MASK bits, and bounds the archive id.
 */
2559 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2561 struct md_op_data *op_data;
2565 /* Detect out-of range masks */
2566 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2569 /* Non-root users are forbidden to set or clear flags which are
2570 * NOT defined in HSM_USER_MASK. */
2571 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2572 !cfs_capable(CFS_CAP_SYS_ADMIN))
2575 /* Detect out-of range archive id */
2576 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2577 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2580 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2581 LUSTRE_OPC_ANY, hss);
2582 if (IS_ERR(op_data))
2583 RETURN(PTR_ERR(op_data));
2585 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2586 sizeof(*op_data), op_data, NULL);
2588 ll_finish_md_op_data(op_data);
/*
 * ll_hsm_import(): mark an existing (regular) file as an HSM-released
 * copy of archived data: set ARCHIVED|EXISTS|RELEASED state with the
 * given archive id, then force the mode/ownership/size/times from the
 * hsm_user_import descriptor via ll_setattr_raw().
 */
2593 static int ll_hsm_import(struct inode *inode, struct file *file,
2594 struct hsm_user_import *hui)
2596 struct hsm_state_set *hss = NULL;
2597 struct iattr *attr = NULL;
2601 if (!S_ISREG(inode->i_mode))
2607 GOTO(out, rc = -ENOMEM);
2609 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2610 hss->hss_archive_id = hui->hui_archive_id;
2611 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2612 rc = ll_hsm_state_set(inode, hss);
2616 OBD_ALLOC_PTR(attr);
2618 GOTO(out, rc = -ENOMEM);
     /* only permission bits from the import; force regular-file type */
2620 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2621 attr->ia_mode |= S_IFREG;
2622 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2623 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2624 attr->ia_size = hui->hui_size;
2625 attr->ia_mtime.tv_sec = hui->hui_mtime;
2626 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2627 attr->ia_atime.tv_sec = hui->hui_atime;
2628 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2630 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2631 ATTR_UID | ATTR_GID |
2632 ATTR_MTIME | ATTR_MTIME_SET |
2633 ATTR_ATIME | ATTR_ATIME_SET;
2637 rc = ll_setattr_raw(file_dentry(file), attr, true);
2641 inode_unlock(inode);
/* Translate a kernel fmode_t into the Lustre lease-type bitmask returned
 * to userspace: FMODE_READ -> LL_LEASE_RDLCK, FMODE_WRITE -> LL_LEASE_WRLCK
 * (both may be set). */
2653 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2655 	return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2656 		((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 backend: set atime/mtime/ctime on a regular file from
 * the ll_futimes_3 payload. Requires CAP_SYS_ADMIN because ctime is not
 * normally settable from userspace. Applies the change via
 * ll_setattr_raw() under inode_lock.
 * NOTE(review): excerpt is elided — the struct iattr initializer is only
 * partially visible (the .ia_atime/.ia_mtime/.ia_ctime field designators
 * and error RETURNs fall in the gaps). */
2659 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2661 	struct inode *inode = file_inode(file);
2663 		.ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2664 			    ATTR_MTIME | ATTR_MTIME_SET |
2665 			    ATTR_CTIME | ATTR_CTIME_SET,
2667 			.tv_sec = lfu->lfu_atime_sec,
2668 			.tv_nsec = lfu->lfu_atime_nsec,
2671 			.tv_sec = lfu->lfu_mtime_sec,
2672 			.tv_nsec = lfu->lfu_mtime_nsec,
2675 			.tv_sec = lfu->lfu_ctime_sec,
2676 			.tv_nsec = lfu->lfu_ctime_nsec,
2682 	if (!capable(CAP_SYS_ADMIN))
2685 	if (!S_ISREG(inode->i_mode))
2689 	rc = ll_setattr_raw(file_dentry(file), &ia, false);
2690 	inode_unlock(inode);
/* Map the userspace lockahead mode (MODE_READ_USER/MODE_WRITE_USER) to the
 * kernel cl_lock_mode used by the CLIO layer.
 * NOTE(review): the return statements for each case (and the default/error
 * path) are elided from this view. */
2695 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2698 	case MODE_READ_USER:
2700 	case MODE_WRITE_USER:
/* Printable names for the userspace lock modes, used in CDEBUG tracing. */
2707 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2709 /* Used to allow the upper layers of the client to request an LDLM lock
2710 * without doing an actual read or write.
2712 * Used for ladvise lockahead to manually request specific locks.
2714 * \param[in] file file this ladvise lock request is on
2715 * \param[in] ladvise ladvise struct describing this lock request
2717 * \retval 0 success, no detailed result available (sync requests
2718 * and requests sent to the server [not handled locally]
2719 * cannot return detailed results)
2720 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2721 * see definitions for details.
2722 * \retval negative negative errno on error
/* See the doxygen comment above: request an LDLM extent lock without doing
 * I/O (ladvise lockahead). Builds a CIT_MISC io, fills a cl_lock_descr
 * covering [lla_start, lla_end] converted to page indices, and enqueues it
 * with CEF_MUST so it cannot be downgraded to a lockless lock.
 * NOTE(review): excerpt is elided — variable declarations (result,
 * refcheck), several error checks and the final RETURN are not visible. */
2724 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2726 	struct lu_env *env = NULL;
2727 	struct cl_io *io = NULL;
2728 	struct cl_lock *lock = NULL;
2729 	struct cl_lock_descr *descr = NULL;
2730 	struct dentry *dentry = file->f_path.dentry;
2731 	struct inode *inode = dentry->d_inode;
2732 	enum cl_lock_mode cl_mode;
2733 	off_t start = ladvise->lla_start;
2734 	off_t end = ladvise->lla_end;
2740 	CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2741 	       "start=%llu, end=%llu\n", dentry->d_name.len,
2742 	       dentry->d_name.name, dentry->d_inode,
2743 	       user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
	/* a negative cl_mode from the conversion is propagated as the error */
2746 	cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2748 		GOTO(out, result = cl_mode);
2750 	/* Get IO environment */
2751 	result = cl_io_get(inode, &env, &io, &refcheck);
2755 	result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2758 		 * nothing to do for this io. This currently happens when
2759 		 * stripe sub-object's are not yet created.
2761 		result = io->ci_result;
2762 	} else if (result == 0) {
2763 		lock = vvp_env_lock(env);
2764 		descr = &lock->cll_descr;
2766 		descr->cld_obj = io->ci_obj;
2767 		/* Convert byte offsets to pages */
2768 		descr->cld_start = cl_index(io->ci_obj, start);
2769 		descr->cld_end = cl_index(io->ci_obj, end);
2770 		descr->cld_mode = cl_mode;
2771 		/* CEF_MUST is used because we do not want to convert a
2772 		 * lockahead request to a lockless lock */
2773 		descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
		/* async (LF_ASYNC) requests are speculative: do not wait */
2776 		if (ladvise->lla_peradvice_flags & LF_ASYNC)
2777 			descr->cld_enq_flags |= CEF_SPECULATIVE;
2779 		result = cl_lock_request(env, io, lock);
2781 		/* On success, we need to release the lock */
2783 			cl_lock_release(env, lock);
2785 	cl_io_fini(env, io);
2786 	cl_env_put(env, &refcheck);
2788 	/* -ECANCELED indicates a matching lock with a different extent
2789 	 * was already present, and -EEXIST indicates a matching lock
2790 	 * on exactly the same extent was already present.
2791 	 * We convert them to positive values for userspace to make
2792 	 * recognizing true errors easier.
2793 	 * Note we can only return these detailed results on async requests,
2794 	 * as sync requests look the same as i/o requests for locking. */
2795 	if (result == -ECANCELED)
2796 		result = LLA_RESULT_DIFFERENT;
2797 	else if (result == -EEXIST)
2798 		result = LLA_RESULT_SAME;
/* Printable names for the ladvise advice types, used in sanity-check
 * CDEBUG messages below. */
2803 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate one llapi_lu_ladvise record before it is acted upon: advice type
 * in range, per-advice flags restricted to the allowed mask, lockahead mode
 * valid, and lla_start < lla_end for extent-based advices.
 * Returns 0 on success; on failure logs the reason and (in the elided
 * lines) returns a negative errno.
 * NOTE(review): the rc assignments, break statements and final RETURN are
 * elided from this view — the intentional switch fall-through from
 * LOCKAHEAD into the common checks is documented by the comment below. */
2805 static int ll_ladvise_sanity(struct inode *inode,
2806 			     struct llapi_lu_ladvise *ladvise)
2808 	enum lu_ladvise_type advice = ladvise->lla_advice;
2809 	/* Note the peradvice flags is a 32 bit field, so per advice flags must
2810 	 * be in the first 32 bits of enum ladvise_flags */
2811 	__u32 flags = ladvise->lla_peradvice_flags;
2812 	/* 3 lines at 80 characters per line, should be plenty */
2815 	if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2817 		CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2818 		       "last supported advice is %s (value '%d'): rc = %d\n",
2819 		       ll_get_fsname(inode->i_sb, NULL, 0), advice,
2820 		       ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2824 	/* Per-advice checks */
2826 	case LU_LADVISE_LOCKNOEXPAND:
2827 		if (flags & ~LF_LOCKNOEXPAND_MASK) {
2829 			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2831 			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
2832 			       ladvise_names[advice], rc);
2836 	case LU_LADVISE_LOCKAHEAD:
2837 		/* Currently only READ and WRITE modes can be requested */
2838 		if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2839 		    ladvise->lla_lockahead_mode == 0) {
2841 			CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2843 			       ll_get_fsname(inode->i_sb, NULL, 0),
2844 			       ladvise->lla_lockahead_mode,
2845 			       ladvise_names[advice], rc);
2848 	case LU_LADVISE_WILLREAD:
2849 	case LU_LADVISE_DONTNEED:
2851 		/* Note fall through above - These checks apply to all advices
2852 		 * except LOCKNOEXPAND */
2853 		if (flags & ~LF_DEFAULT_MASK) {
2855 			CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2857 			       ll_get_fsname(inode->i_sb, NULL, 0), flags,
2858 			       ladvise_names[advice], rc);
2861 		if (ladvise->lla_start >= ladvise->lla_end) {
2863 			CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2864 			       "for %s: rc = %d\n",
2865 			       ll_get_fsname(inode->i_sb, NULL, 0),
2866 			       ladvise->lla_start, ladvise->lla_end,
2867 			       ladvise_names[advice], rc);
2879 * Give file access advices
2881 * The ladvise interface is similar to Linux fadvise() system call, except it
2882 * forwards the advices directly from Lustre client to server. The server side
2883 * codes will apply appropriate read-ahead and caching techniques for the
2884 * corresponding files.
2886 * A typical workload for ladvise is e.g. a bunch of different clients are
2887 * doing small random reads of a file, so prefetching pages into OSS cache
2888 * with big linear reads before the random IO is a net benefit. Fetching
2889 * all that data into each client cache with fadvise() may not be, due to
2890 * much more data being sent to the client.
/* Forward one access-pattern advice (see doxygen comment above) to the
 * server by running a CIT_LADVISE io through the CLIO stack.
 * NOTE(review): excerpt is elided — declarations of env/io/rc/refcheck,
 * some error handling, and the final RETURN are not visible. */
2892 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2893 		      struct llapi_lu_ladvise *ladvise)
2897 	struct cl_ladvise_io *lio;
2902 	env = cl_env_get(&refcheck);
2904 		RETURN(PTR_ERR(env));
2906 	io = vvp_env_thread_io(env);
2907 	io->ci_obj = ll_i2info(inode)->lli_clob;
2909 	/* initialize parameters for ladvise */
2910 	lio = &io->u.ci_ladvise;
2911 	lio->li_start = ladvise->lla_start;
2912 	lio->li_end = ladvise->lla_end;
2913 	lio->li_fid = ll_inode2fid(inode);
2914 	lio->li_advice = ladvise->lla_advice;
2915 	lio->li_flags = flags;
	/* cl_io_init() == 0 means the layers accepted the io; run the loop */
2917 	if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2918 		rc = cl_io_loop(env, io);
2922 	cl_io_fini(env, io);
2923 	cl_env_put(env, &refcheck);
/* LU_LADVISE_LOCKNOEXPAND handler: record in the per-open file data
 * whether DLM lock expansion should be suppressed for this file handle.
 * LF_UNSET clears the flag; otherwise it is set. */
2927 static int ll_lock_noexpand(struct file *file, int flags)
2929 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2931 	fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: return the inode's extended flags and
 * project id to userspace in a struct fsxattr.
 * NOTE(review): excerpt is elided — the -EFAULT RETURNs after the failed
 * copies and the success RETURN are not visible. The initial
 * copy_from_user appears to pre-load the caller's fsxattr before the
 * fields are overwritten — confirm against the full source. */
2936 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2939 	struct fsxattr fsxattr;
2941 	if (copy_from_user(&fsxattr,
2942 			   (const struct fsxattr __user *)arg,
2946 	fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2947 	fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2948 	if (copy_to_user((struct fsxattr __user *)arg,
2949 			 &fsxattr, sizeof(fsxattr)))
/* FS_IOC_FSSETXATTR-style handler: set extended flags and project id.
 * Root-only (project id change is privileged). Sends the change to the
 * MDT via md_setattr, then — when the file has a data object — mirrors
 * the flag change to the OSTs via cl_setattr_ost.
 * NOTE(review): excerpt is elided — rc declaration, the attr free, the
 * out_fsxattr1 label placement and final RETURN are not visible. */
2955 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2959 	struct md_op_data *op_data;
2960 	struct ptlrpc_request *req = NULL;
2962 	struct fsxattr fsxattr;
2963 	struct cl_object *obj;
2965 	/* only root could change project ID */
2966 	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2969 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2970 				     LUSTRE_OPC_ANY, NULL);
2971 	if (IS_ERR(op_data))
2972 		RETURN(PTR_ERR(op_data));
2974 	if (copy_from_user(&fsxattr,
2975 			   (const struct fsxattr __user *)arg,
2977 		GOTO(out_fsxattr1, rc = -EFAULT);
	/* Step 1: update flags + project id on the MDT */
2979 	op_data->op_attr_flags = fsxattr.fsx_xflags;
2980 	op_data->op_projid = fsxattr.fsx_projid;
2981 	op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2982 	rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2984 	ptlrpc_req_finished(req);
	/* Step 2: if a data object exists, propagate the flags to the OSTs */
2986 	obj = ll_i2info(inode)->lli_clob;
2990 		inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2991 		OBD_ALLOC_PTR(attr);
2993 			GOTO(out_fsxattr1, rc = -ENOMEM);
2994 		attr->ia_valid = ATTR_ATTR_FLAG;
2995 		rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3000 	ll_finish_md_op_data(op_data);
/* LL_LEASE_UNLCK path of the lease ioctl: release the lease held on this
 * file descriptor, optionally carrying a close intent (resync-done,
 * layout merge, or layout split) that is executed atomically with the
 * lease close on the MDT.
 * Returns (via the elided tail) the lease type that was held, or a
 * negative errno.
 * NOTE(review): excerpt is elided — declarations of rc/fmode/
 * lease_broken/data, several closing braces, fput(layout_file) and the
 * final RETURN are not visible in this view. */
3004 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3007 	struct inode *inode = file_inode(file);
3008 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3009 	struct ll_inode_info *lli = ll_i2info(inode);
3010 	struct obd_client_handle *och = NULL;
3011 	struct split_param sp;
3014 	enum mds_op_bias bias = 0;
3015 	struct file *layout_file = NULL;
3017 	size_t data_size = 0;
	/* Detach the lease handle from the fd under lli_och_mutex so that
	 * only one closer can win */
3021 	mutex_lock(&lli->lli_och_mutex);
3022 	if (fd->fd_lease_och != NULL) {
3023 		och = fd->fd_lease_och;
3024 		fd->fd_lease_och = NULL;
3026 	mutex_unlock(&lli->lli_och_mutex);
3029 		GOTO(out, rc = -ENOLCK);
3031 	fmode = och->och_flags;
3033 	switch (ioc->lil_flags) {
3034 	case LL_LEASE_RESYNC_DONE:
		/* close intent: report resync completion, passing the array
		 * of mirror ids copied from userspace */
3035 		if (ioc->lil_count > IOC_IDS_MAX)
3036 			GOTO(out, rc = -EINVAL);
3038 		data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3039 		OBD_ALLOC(data, data_size);
3041 			GOTO(out, rc = -ENOMEM);
3043 		if (copy_from_user(data, (void __user *)arg, data_size))
3044 			GOTO(out, rc = -EFAULT);
3046 		bias = MDS_CLOSE_RESYNC_DONE;
3048 	case LL_LEASE_LAYOUT_MERGE: {
		/* close intent: merge the layout of a victim file (given by
		 * fd following the ioc header in the userspace buffer) */
3051 		if (ioc->lil_count != 1)
3052 			GOTO(out, rc = -EINVAL);
3054 		arg += sizeof(*ioc);
3055 		if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3056 			GOTO(out, rc = -EFAULT);
3058 		layout_file = fget(fd);
3060 			GOTO(out, rc = -EBADF);
		/* both files must be writable for a merge */
3062 		if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3063 		    (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3064 			GOTO(out, rc = -EPERM);
3066 		data = file_inode(layout_file);
3067 		bias = MDS_CLOSE_LAYOUT_MERGE;
3070 	case LL_LEASE_LAYOUT_SPLIT: {
		/* close intent: split one mirror (fdv + mirror_id follow the
		 * ioc header) out into the victim file */
3074 		if (ioc->lil_count != 2)
3075 			GOTO(out, rc = -EINVAL);
3077 		arg += sizeof(*ioc);
3078 		if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3079 			GOTO(out, rc = -EFAULT);
3081 		arg += sizeof(__u32);
3082 		if (copy_from_user(&mirror_id, (void __user *)arg,
3084 			GOTO(out, rc = -EFAULT);
3086 		layout_file = fget(fdv);
3088 			GOTO(out, rc = -EBADF);
3090 		sp.sp_inode = file_inode(layout_file);
3091 		sp.sp_mirror_id = (__u16)mirror_id;
3093 		bias = MDS_CLOSE_LAYOUT_SPLIT;
3097 		/* without close intent */
3101 	rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3105 	rc = ll_lease_och_release(inode, file);
	/* per-intent cleanup of resources acquired in the switch above */
3114 	switch (ioc->lil_flags) {
3115 	case LL_LEASE_RESYNC_DONE:
3117 			OBD_FREE(data, data_size);
3119 	case LL_LEASE_LAYOUT_MERGE:
3120 	case LL_LEASE_LAYOUT_SPLIT:
3127 		rc = ll_lease_type_from_fmode(fmode);
/* Acquire (or, for LL_LEASE_UNLCK, release via ll_file_unlock_lease) a
 * lease on this file descriptor. Checks the fd's open mode against the
 * requested lease mode, opens the lease on the MDT, optionally starts a
 * mirror resync, and stores the handle in fd->fd_lease_och if no lease
 * was already attached.
 * NOTE(review): excerpt is elided — fmode/rc/lease_broken declarations,
 * -EPERM/-EINVAL RETURNs, several break statements, the out: label and
 * final RETURN are not visible in this view. */
3131 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3134 	struct inode *inode = file_inode(file);
3135 	struct ll_inode_info *lli = ll_i2info(inode);
3136 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3137 	struct obd_client_handle *och = NULL;
3138 	__u64 open_flags = 0;
	/* a write lease requires the fd itself to be open for write;
	 * a read lease requires read */
3144 	switch (ioc->lil_mode) {
3145 	case LL_LEASE_WRLCK:
3146 		if (!(file->f_mode & FMODE_WRITE))
3148 		fmode = FMODE_WRITE;
3150 	case LL_LEASE_RDLCK:
3151 		if (!(file->f_mode & FMODE_READ))
3155 	case LL_LEASE_UNLCK:
3156 		RETURN(ll_file_unlock_lease(file, ioc, arg));
3161 	CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3163 	/* apply for lease */
3164 	if (ioc->lil_flags & LL_LEASE_RESYNC)
3165 		open_flags = MDS_OPEN_RESYNC;
3166 	och = ll_lease_open(inode, file, fmode, open_flags);
3168 		RETURN(PTR_ERR(och));
3170 	if (ioc->lil_flags & LL_LEASE_RESYNC) {
3171 		rc = ll_lease_file_resync(och, inode);
		/* on resync failure the freshly opened lease is closed again */
3173 			ll_lease_close(och, inode, NULL);
3176 		rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3178 			ll_lease_close(och, inode, NULL);
	/* attach the lease to this fd unless one already exists */
3184 	mutex_lock(&lli->lli_och_mutex);
3185 	if (fd->fd_lease_och == NULL) {
3186 		fd->fd_lease_och = och;
3189 	mutex_unlock(&lli->lli_och_mutex);
3191 	/* impossible now that only excl is supported for now */
3192 	ll_lease_close(och, inode, &lease_broken);
/* Main ioctl dispatcher for regular Lustre files. Handles client-local
 * flags, striping get/set, layout swap, group locks, HSM state/action/
 * import, leases, ladvise, FLR mirror selection, and project xattrs;
 * unrecognized commands fall through (in the elided default case) to
 * obd_iocontrol toward the data export.
 * NOTE(review): excerpt is elided — the function return type line,
 * declarations of flags/rc, many RETURN/break statements, allocation
 * checks and closing braces are not visible in this view; comments below
 * describe only what the visible lines establish. */
3199 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3201 	struct inode *inode = file_inode(file);
3202 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3206 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3207 	       PFID(ll_inode2fid(inode)), inode, cmd);
3208 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3210 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3211 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3215 	case LL_IOC_GETFLAGS:
3216 		/* Get the current value of the file flags */
3217 		return put_user(fd->fd_flags, (int __user *)arg);
3218 	case LL_IOC_SETFLAGS:
3219 	case LL_IOC_CLRFLAGS:
3220 		/* Set or clear specific file flags */
3221 		/* XXX This probably needs checks to ensure the flags are
3222 		 * not abused, and to handle any flag side effects.
3224 		if (get_user(flags, (int __user *) arg))
3227 		if (cmd == LL_IOC_SETFLAGS) {
			/* disabling locking only makes sense for O_DIRECT
			 * files — refuse otherwise */
3228 			if ((flags & LL_FILE_IGNORE_LOCK) &&
3229 			    !(file->f_flags & O_DIRECT)) {
3230 				CERROR("%s: unable to disable locking on "
3231 				       "non-O_DIRECT file\n", current->comm);
3235 			fd->fd_flags |= flags;
3237 			fd->fd_flags &= ~flags;
3240 	case LL_IOC_LOV_SETSTRIPE:
3241 	case LL_IOC_LOV_SETSTRIPE_NEW:
3242 		RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3243 	case LL_IOC_LOV_SETEA:
3244 		RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3245 	case LL_IOC_LOV_SWAP_LAYOUTS: {
3247 		struct lustre_swap_layouts lsl;
3249 		if (copy_from_user(&lsl, (char __user *)arg,
3250 				   sizeof(struct lustre_swap_layouts)))
		/* both files must be writable to swap layouts */
3253 		if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3256 		file2 = fget(lsl.sl_fd);
3260 		/* O_WRONLY or O_RDWR */
3261 		if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3262 			GOTO(out, rc = -EPERM);
3264 		if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3265 			struct inode *inode2;
3266 			struct ll_inode_info *lli;
3267 			struct obd_client_handle *och = NULL;
			/* swap-on-close requires a lease held on this fd;
			 * steal the handle under lli_och_mutex */
3269 			lli = ll_i2info(inode);
3270 			mutex_lock(&lli->lli_och_mutex);
3271 			if (fd->fd_lease_och != NULL) {
3272 				och = fd->fd_lease_och;
3273 				fd->fd_lease_och = NULL;
3275 			mutex_unlock(&lli->lli_och_mutex);
3277 				GOTO(out, rc = -ENOLCK);
3278 			inode2 = file_inode(file2);
3279 			rc = ll_swap_layouts_close(och, inode, inode2);
3281 			rc = ll_swap_layouts(file, file2, &lsl);
3287 	case LL_IOC_LOV_GETSTRIPE:
3288 	case LL_IOC_LOV_GETSTRIPE_NEW:
3289 		RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3290 	case FSFILT_IOC_GETFLAGS:
3291 	case FSFILT_IOC_SETFLAGS:
3292 		RETURN(ll_iocontrol(inode, file, cmd, arg));
3293 	case FSFILT_IOC_GETVERSION_OLD:
3294 	case FSFILT_IOC_GETVERSION:
3295 		RETURN(put_user(inode->i_generation, (int __user *)arg));
3296 	case LL_IOC_GROUP_LOCK:
3297 		RETURN(ll_get_grouplock(inode, file, arg));
3298 	case LL_IOC_GROUP_UNLOCK:
3299 		RETURN(ll_put_grouplock(inode, file, arg));
3300 	case IOC_OBD_STATFS:
3301 		RETURN(ll_obd_statfs(inode, (void __user *)arg));
3303 	/* We need to special case any other ioctls we want to handle,
3304 	 * to send them to the MDS/OST as appropriate and to properly
3305 	 * network encode the arg field.
3306 	case FSFILT_IOC_SETVERSION_OLD:
3307 	case FSFILT_IOC_SETVERSION:
3309 	case LL_IOC_FLUSHCTX:
3310 		RETURN(ll_flush_ctx(inode));
3311 	case LL_IOC_PATH2FID: {
3312 		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3313 				 sizeof(struct lu_fid)))
3318 	case LL_IOC_GETPARENT:
3319 		RETURN(ll_getparent(file, (struct getparent __user *)arg));
3321 	case OBD_IOC_FID2PATH:
3322 		RETURN(ll_fid2path(inode, (void __user *)arg));
3323 	case LL_IOC_DATA_VERSION: {
3324 		struct ioc_data_version idv;
3327 		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
		/* only the documented flush flags are honoured */
3330 		idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3331 		rc = ll_ioc_data_version(inode, &idv);
3334 		    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3340 	case LL_IOC_GET_MDTIDX: {
3343 		mdtidx = ll_get_mdt_idx(inode);
3347 		if (put_user((int)mdtidx, (int __user *)arg))
3352 	case OBD_IOC_GETDTNAME:
3353 	case OBD_IOC_GETMDNAME:
3354 		RETURN(ll_get_obd_name(inode, cmd, arg));
3355 	case LL_IOC_HSM_STATE_GET: {
3356 		struct md_op_data *op_data;
3357 		struct hsm_user_state *hus;
		/* hus is carried in op_data to/from the MDT, then copied
		 * back to userspace */
3364 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3365 					     LUSTRE_OPC_ANY, hus);
3366 		if (IS_ERR(op_data)) {
3368 			RETURN(PTR_ERR(op_data));
3371 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3374 		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3377 		ll_finish_md_op_data(op_data);
3381 	case LL_IOC_HSM_STATE_SET: {
3382 		struct hsm_state_set *hss;
3389 		if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3394 		rc = ll_hsm_state_set(inode, hss);
3399 	case LL_IOC_HSM_ACTION: {
3400 		struct md_op_data *op_data;
3401 		struct hsm_current_action *hca;
3408 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3409 					     LUSTRE_OPC_ANY, hca);
3410 		if (IS_ERR(op_data)) {
3412 			RETURN(PTR_ERR(op_data));
3415 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3418 		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3421 		ll_finish_md_op_data(op_data);
3425 	case LL_IOC_SET_LEASE_OLD: {
		/* legacy variant: arg is the mode itself, no flags payload */
3426 		struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3428 		RETURN(ll_file_set_lease(file, &ioc, 0));
3430 	case LL_IOC_SET_LEASE: {
3431 		struct ll_ioc_lease ioc;
3433 		if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3436 		RETURN(ll_file_set_lease(file, &ioc, arg));
3438 	case LL_IOC_GET_LEASE: {
3439 		struct ll_inode_info *lli = ll_i2info(inode);
3440 		struct ldlm_lock *lock = NULL;
		/* report the lease mode only if the underlying DLM lock is
		 * still valid (not cancelled) */
3443 		mutex_lock(&lli->lli_och_mutex);
3444 		if (fd->fd_lease_och != NULL) {
3445 			struct obd_client_handle *och = fd->fd_lease_och;
3447 			lock = ldlm_handle2lock(&och->och_lease_handle);
3449 				lock_res_and_lock(lock);
3450 				if (!ldlm_is_cancel(lock))
3451 					fmode = och->och_flags;
3453 				unlock_res_and_lock(lock);
3454 				LDLM_LOCK_PUT(lock);
3457 		mutex_unlock(&lli->lli_och_mutex);
3459 		RETURN(ll_lease_type_from_fmode(fmode));
3461 	case LL_IOC_HSM_IMPORT: {
3462 		struct hsm_user_import *hui;
3468 		if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3473 		rc = ll_hsm_import(inode, file, hui);
3478 	case LL_IOC_FUTIMES_3: {
3479 		struct ll_futimes_3 lfu;
3481 		if (copy_from_user(&lfu,
3482 				   (const struct ll_futimes_3 __user *)arg,
3486 		RETURN(ll_file_futimes_3(file, &lfu));
3488 	case LL_IOC_LADVISE: {
3489 		struct llapi_ladvise_hdr *k_ladvise_hdr;
3490 		struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3493 		int alloc_size = sizeof(*k_ladvise_hdr);
		/* two-phase copy: first read just the header to learn
		 * lah_count, then re-allocate and copy header + advices */
3496 		u_ladvise_hdr = (void __user *)arg;
3497 		OBD_ALLOC_PTR(k_ladvise_hdr);
3498 		if (k_ladvise_hdr == NULL)
3501 		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3502 			GOTO(out_ladvise, rc = -EFAULT);
3504 		if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3505 		    k_ladvise_hdr->lah_count < 1)
3506 			GOTO(out_ladvise, rc = -EINVAL);
3508 		num_advise = k_ladvise_hdr->lah_count;
3509 		if (num_advise >= LAH_COUNT_MAX)
3510 			GOTO(out_ladvise, rc = -EFBIG);
3512 		OBD_FREE_PTR(k_ladvise_hdr);
3513 		alloc_size = offsetof(typeof(*k_ladvise_hdr),
3514 				      lah_advise[num_advise]);
3515 		OBD_ALLOC(k_ladvise_hdr, alloc_size);
3516 		if (k_ladvise_hdr == NULL)
3520 		 * TODO: submit multiple advices to one server in a single RPC
3522 		if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3523 			GOTO(out_ladvise, rc = -EFAULT);
3525 		for (i = 0; i < num_advise; i++) {
3526 			struct llapi_lu_ladvise *k_ladvise =
3527 					&k_ladvise_hdr->lah_advise[i];
3528 			struct llapi_lu_ladvise __user *u_ladvise =
3529 					&u_ladvise_hdr->lah_advise[i];
3531 			rc = ll_ladvise_sanity(inode, k_ladvise);
3533 				GOTO(out_ladvise, rc);
3535 			switch (k_ladvise->lla_advice) {
3536 			case LU_LADVISE_LOCKNOEXPAND:
3537 				rc = ll_lock_noexpand(file,
3538 					       k_ladvise->lla_peradvice_flags);
3539 				GOTO(out_ladvise, rc);
3540 			case LU_LADVISE_LOCKAHEAD:
3542 				rc = ll_file_lock_ahead(file, k_ladvise);
3545 					GOTO(out_ladvise, rc);
				/* detailed lockahead result is reported back
				 * into the caller's per-advice slot */
3548 						 &u_ladvise->lla_lockahead_result))
3549 					GOTO(out_ladvise, rc = -EFAULT);
3552 				rc = ll_ladvise(inode, file,
3553 						k_ladvise_hdr->lah_flags,
3556 					GOTO(out_ladvise, rc);
3563 		OBD_FREE(k_ladvise_hdr, alloc_size);
3566 	case LL_IOC_FLR_SET_MIRROR: {
3567 		/* mirror I/O must be direct to avoid polluting page cache
3569 		if (!(file->f_flags & O_DIRECT))
3572 		fd->fd_designated_mirror = (__u32)arg;
3575 	case LL_IOC_FSGETXATTR:
3576 		RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3577 	case LL_IOC_FSSETXATTR:
3578 		RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3580 		RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3582 		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3583 				     (void __user *)arg));
3587 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat fallback (older kernels without generic_file_llseek_size):
 * validate the new offset against [0, maxsize] and commit it to f_pos,
 * resetting f_version so readdir-style users notice the seek.
 * NOTE(review): the -EINVAL returns for the two range checks and the
 * final return of offset are elided from this view. */
3588 static inline loff_t
3589 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3591 	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3593 	if (offset > maxsize)
3596 	if (offset != file->f_pos) {
3597 		file->f_pos = offset;
3598 		file->f_version = 0;
/* Compat copy of the kernel's generic_file_llseek_size(): handle
 * SEEK_CUR without rewriting an unchanged f_pos, and map SEEK_DATA /
 * SEEK_HOLE onto a file that is all-data with a virtual hole at eof.
 * NOTE(review): excerpt is elided — the switch statement over origin,
 * SEEK_END/SEEK_DATA/SEEK_HOLE arithmetic and several returns are not
 * visible in this view. */
3604 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3605 			 loff_t maxsize, loff_t eof)
3607 	struct inode *inode = file_inode(file);
3615 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3616 		 * position-querying operation. Avoid rewriting the "same"
3617 		 * f_pos value back to the file because a concurrent read(),
3618 		 * write() or lseek() might have altered it
3623 		 * f_lock protects against read/modify/write race with other
3624 		 * SEEK_CURs. Note that parallel writes and reads behave
3628 		offset = llseek_execute(file, file->f_pos + offset, maxsize);
3629 		inode_unlock(inode);
3633 		 * In the generic case the entire file is data, so as long as
3634 		 * offset isn't at the end of the file then the offset is data.
3641 		 * There is a virtual hole at the end of the file, so as long as
3642 		 * offset isn't i_size or larger, return i_size.
3650 	return llseek_execute(file, offset, maxsize);
/* llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse the
 * current file size from the OSTs (ll_glimpse_size) so eof is accurate,
 * then delegate to the generic llseek-size helper bounded by the Lustre
 * maximum file size.
 * NOTE(review): the error check after ll_glimpse_size and the final
 * RETURN are elided from this view. */
3654 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3656 	struct inode *inode = file_inode(file);
3657 	loff_t retval, eof = 0;
	/* retval here is only the would-be absolute target, used for the
	 * trace message below */
3660 	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3661 			   (origin == SEEK_CUR) ? file->f_pos : 0);
3662 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3663 	       PFID(ll_inode2fid(inode)), inode, retval, retval,
3665 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3667 	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3668 		retval = ll_glimpse_size(inode);
3671 		eof = i_size_read(inode);
3674 	retval = ll_generic_file_llseek_size(file, offset, origin,
3675 					  ll_file_maxbytes(inode), eof);
/* flush file operation (called on close(2) of each fd): surface any async
 * writeback errors recorded on the inode/object as -EIO, unless the
 * application was already told about the failure (fd_write_failed).
 * NOTE(review): declarations of rc/err and the braces around the
 * fd_write_failed handling are elided from this view. */
3679 static int ll_flush(struct file *file, fl_owner_t id)
3681 	struct inode *inode = file_inode(file);
3682 	struct ll_inode_info *lli = ll_i2info(inode);
3683 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3686 	LASSERT(!S_ISDIR(inode->i_mode));
3688 	/* catch async errors that were recorded back when async writeback
3689 	 * failed for pages in this mapping. */
3690 	rc = lli->lli_async_rc;
3691 	lli->lli_async_rc = 0;
3692 	if (lli->lli_clob != NULL) {
3693 		err = lov_read_and_clear_async_rc(lli->lli_clob);
3698 	/* The application has been told write failure already.
3699 	 * Do not report failure again. */
3700 	if (fd->fd_write_failed)
3702 	return rc ? -EIO : 0;
3706 * Called to make sure a portion of file has been written out.
3707 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3709 * Return how many pages have been written.
/* See the comment above: run a CIT_FSYNC io over [start, end]. On
 * success returns fio->fi_nr_written (pages written); invalid modes are
 * rejected up front.
 * NOTE(review): declarations of env/io/result/refcheck, the -EINVAL
 * RETURN, fio->fi_end assignment and final RETURN are elided from this
 * view. */
3711 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3712 		       enum cl_fsync_mode mode, int ignore_layout)
3716 	struct cl_fsync_io *fio;
3721 	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3722 	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3725 	env = cl_env_get(&refcheck);
3727 		RETURN(PTR_ERR(env));
3729 	io = vvp_env_thread_io(env);
3730 	io->ci_obj = ll_i2info(inode)->lli_clob;
3731 	io->ci_ignore_layout = ignore_layout;
3733 	/* initialize parameters for sync */
3734 	fio = &io->u.ci_fsync;
3735 	fio->fi_start = start;
3737 	fio->fi_fid = ll_inode2fid(inode);
3738 	fio->fi_mode = mode;
3739 	fio->fi_nr_written = 0;
3741 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3742 		result = cl_io_loop(env, io);
3744 		result = io->ci_result;
3746 		result = fio->fi_nr_written;
3747 	cl_io_fini(env, io);
3748 	cl_env_put(env, &refcheck);
3754 * When dentry is provided (the 'else' case), file_dentry() may be
3755 * null and dentry must be used directly rather than pulled from
3756 * file_dentry() as is done otherwise.
/* fsync file operation, with three kernel-API variants selected by
 * configure (4-arg range fsync, 2-arg, and the old dentry-taking form;
 * the two older variants sync the whole file: start=0, end=LLONG_MAX).
 * Flushes dirty pages, harvests recorded async write errors, fsyncs the
 * metadata via md_fsync, and for regular files forces an OST sync with
 * cl_sync_file_range(CL_FSYNC_ALL), updating fd_write_failed accordingly.
 * NOTE(review): excerpt is elided — declarations of rc/err/lock_inode,
 * the inode_lock() matching the unlock below, and the final RETURN are
 * not visible in this view. */
3759 #ifdef HAVE_FILE_FSYNC_4ARGS
3760 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3762 	struct dentry *dentry = file_dentry(file);
3764 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3765 int ll_fsync(struct file *file, int datasync)
3767 	struct dentry *dentry = file_dentry(file);
3769 	loff_t end = LLONG_MAX;
3771 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3774 	loff_t end = LLONG_MAX;
3776 	struct inode *inode = dentry->d_inode;
3777 	struct ll_inode_info *lli = ll_i2info(inode);
3778 	struct ptlrpc_request *req;
3782 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3783 	       PFID(ll_inode2fid(inode)), inode);
3784 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3786 #ifdef HAVE_FILE_FSYNC_4ARGS
3787 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	/* avoid deadlock when the caller (e.g. setattr path) already holds
	 * the inode lock */
3788 	lock_inode = !lli->lli_inode_locked;
3792 	/* fsync's caller has already called _fdata{sync,write}, we want
3793 	 * that IO to finish before calling the osc and mdc sync methods */
3794 	rc = filemap_fdatawait(inode->i_mapping);
3797 	/* catch async errors that were recorded back when async writeback
3798 	 * failed for pages in this mapping. */
3799 	if (!S_ISDIR(inode->i_mode)) {
3800 		err = lli->lli_async_rc;
3801 		lli->lli_async_rc = 0;
3804 		if (lli->lli_clob != NULL) {
3805 			err = lov_read_and_clear_async_rc(lli->lli_clob);
3811 	err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3815 		ptlrpc_req_finished(req);
3817 	if (S_ISREG(inode->i_mode)) {
3818 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3820 		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3821 		if (rc == 0 && err < 0)
		/* remember the outcome so ll_flush() does not report the
		 * same failure twice */
3824 			fd->fd_write_failed = true;
3826 			fd->fd_write_failed = false;
3829 #ifdef HAVE_FILE_FSYNC_4ARGS
3831 		inode_unlock(inode);
/* flock/fcntl-lock file operation: translate a kernel file_lock (FL_FLOCK
 * or FL_POSIX) into an LDLM_FLOCK enqueue toward the MDT, then mirror the
 * result into the local VFS lock state (locks_lock_file_wait or the older
 * flock/posix split). If the local bookkeeping fails after a successful
 * server lock, the server lock is rolled back with an LCK_NL enqueue.
 * NOTE(review): excerpt is elided — the function return type line,
 * declarations of flags/rc/rc2, the switch headers over fl_type/cmd,
 * several break/RETURN statements and the F_GETLK handling are not
 * visible in this view. */
3837 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3839 	struct inode *inode = file_inode(file);
3840 	struct ll_sb_info *sbi = ll_i2sbi(inode);
3841 	struct ldlm_enqueue_info einfo = {
3842 		.ei_type	= LDLM_FLOCK,
3843 		.ei_cb_cp	= ldlm_flock_completion_ast,
3844 		.ei_cbdata	= file_lock,
3846 	struct md_op_data *op_data;
3847 	struct lustre_handle lockh = { 0 };
3848 	union ldlm_policy_data flock = { { 0 } };
3849 	int fl_type = file_lock->fl_type;
3855 	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3856 	       PFID(ll_inode2fid(inode)), file_lock);
3858 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3860 	if (file_lock->fl_flags & FL_FLOCK) {
3861 		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3862 		/* flocks are whole-file locks */
3863 		flock.l_flock.end = OFFSET_MAX;
3864 		/* For flocks owner is determined by the local file desctiptor*/
3865 		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3866 	} else if (file_lock->fl_flags & FL_POSIX) {
3867 		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3868 		flock.l_flock.start = file_lock->fl_start;
3869 		flock.l_flock.end = file_lock->fl_end;
3873 	flock.l_flock.pid = file_lock->fl_pid;
3875 	/* Somewhat ugly workaround for svc lockd.
3876 	 * lockd installs custom fl_lmops->lm_compare_owner that checks
3877 	 * for the fl_owner to be the same (which it always is on local node
3878 	 * I guess between lockd processes) and then compares pid.
3879 	 * As such we assign pid to the owner field to make it all work,
3880 	 * conflict with normal locks is unlikely since pid space and
3881 	 * pointer space for current->files are not intersecting */
3882 	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3883 		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
	/* map fl_type to an LDLM mode: read -> PR, write -> PW, unlock ->
	 * NL (see explanatory comment below) */
3887 		einfo.ei_mode = LCK_PR;
3890 		/* An unlock request may or may not have any relation to
3891 		 * existing locks so we may not be able to pass a lock handle
3892 		 * via a normal ldlm_lock_cancel() request. The request may even
3893 		 * unlock a byte range in the middle of an existing lock. In
3894 		 * order to process an unlock request we need all of the same
3895 		 * information that is given with a normal read or write record
3896 		 * lock request. To avoid creating another ldlm unlock (cancel)
3897 		 * message we'll treat a LCK_NL flock request as an unlock. */
3898 		einfo.ei_mode = LCK_NL;
3901 		einfo.ei_mode = LCK_PW;
3904 		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
	/* map cmd to enqueue flags: non-blocking set -> BLOCK_NOWAIT,
	 * test -> TEST_LOCK (per the elided switch over cmd) */
3919 		flags = LDLM_FL_BLOCK_NOWAIT;
3925 		flags = LDLM_FL_TEST_LOCK;
3928 		CERROR("unknown fcntl lock command: %d\n", cmd);
3932 	/* Save the old mode so that if the mode in the lock changes we
3933 	 * can decrement the appropriate reader or writer refcount. */
3934 	file_lock->fl_type = einfo.ei_mode;
3936 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3937 				     LUSTRE_OPC_ANY, NULL);
3938 	if (IS_ERR(op_data))
3939 		RETURN(PTR_ERR(op_data));
3941 	CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3942 	       "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3943 	       flock.l_flock.pid, flags, einfo.ei_mode,
3944 	       flock.l_flock.start, flock.l_flock.end);
3946 	rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3949 	/* Restore the file lock type if not TEST lock. */
3950 	if (!(flags & LDLM_FL_TEST_LOCK))
3951 		file_lock->fl_type = fl_type;
3953 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3954 	if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3955 	    !(flags & LDLM_FL_TEST_LOCK))
3956 		rc2 = locks_lock_file_wait(file, file_lock);
3958 	if ((file_lock->fl_flags & FL_FLOCK) &&
3959 	    (rc == 0 || file_lock->fl_type == F_UNLCK))
3960 		rc2 = flock_lock_file_wait(file, file_lock);
3961 	if ((file_lock->fl_flags & FL_POSIX) &&
3962 	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3963 	    !(flags & LDLM_FL_TEST_LOCK))
3964 		rc2 = posix_lock_file_wait(file, file_lock);
3965 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
	/* local bookkeeping failed: undo the server-side lock by enqueueing
	 * an unlock (LCK_NL) */
3967 	if (rc2 && file_lock->fl_type != F_UNLCK) {
3968 		einfo.ei_mode = LCK_NL;
3969 		md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3974 	ll_finish_md_op_data(op_data);
/* Look up the FID of @name under @parent via md_getattr_name. On success
 * stores the FID in *fid and, when @inode is non-NULL, instantiates the
 * inode from the reply with ll_prep_inode.
 * NOTE(review): rc declaration, the error check after md_getattr_name,
 * the NULL-inode guard and the final RETURN are elided from this view. */
3979 int ll_get_fid_by_name(struct inode *parent, const char *name,
3980 		       int namelen, struct lu_fid *fid,
3981 		       struct inode **inode)
3983 	struct md_op_data *op_data = NULL;
3984 	struct mdt_body *body;
3985 	struct ptlrpc_request *req;
3989 	op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3990 				     LUSTRE_OPC_ANY, NULL);
3991 	if (IS_ERR(op_data))
3992 		RETURN(PTR_ERR(op_data));
	/* only FID and type are needed from the getattr-by-name reply */
3994 	op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3995 	rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3996 	ll_finish_md_op_data(op_data);
4000 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4002 		GOTO(out_req, rc = -EFAULT);
4004 	*fid = body->mbo_fid1;
4007 		rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4009 	ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under directory @parent to MDT @mdtidx using a
 * MDS_RENAME_MIGRATE rename RPC.  For regular files a write lease is taken
 * first so the data version can be pinned across the migration.
 * NOTE(review): several lines (declarations of rc/qstr, some if-conditions,
 * labels out_free/out_iput/out_close/out_unlock) are elided in this listing.
 */
4013 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
4014 const char *name, int namelen)
4016 struct dentry *dchild = NULL;
4017 struct inode *child_inode = NULL;
4018 struct md_op_data *op_data;
4019 struct ptlrpc_request *request = NULL;
4020 struct obd_client_handle *och = NULL;
4022 struct mdt_body *body;
4024 __u64 data_version = 0;
4027 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
4028 name, PFID(ll_inode2fid(parent)), mdtidx);
4030 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4031 0, LUSTRE_OPC_ANY, NULL);
4032 if (IS_ERR(op_data))
4033 RETURN(PTR_ERR(op_data));
4035 /* Get child FID first */
4036 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer the cached dentry; grab an inode reference if it is hashed. */
4039 dchild = d_lookup(file_dentry(file), &qstr);
4040 if (dchild != NULL) {
4041 if (dchild->d_inode != NULL)
4042 child_inode = igrab(dchild->d_inode);
/* Dcache miss: ask the MDS for the child's FID and inode. */
4046 if (child_inode == NULL) {
4047 rc = ll_get_fid_by_name(parent, name, namelen,
4048 &op_data->op_fid3, &child_inode);
4053 if (child_inode == NULL)
4054 GOTO(out_free, rc = -EINVAL);
4057 * lfs migrate command needs to be blocked on the client
4058 * by checking the migrate FID against the FID of the
/* Refuse to migrate the filesystem root. */
4061 if (child_inode == parent->i_sb->s_root->d_inode)
4062 GOTO(out_iput, rc = -EINVAL);
4064 inode_lock(child_inode);
4065 op_data->op_fid3 = *ll_inode2fid(child_inode);
4066 if (!fid_is_sane(&op_data->op_fid3)) {
4067 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4068 ll_get_fsname(parent->i_sb, NULL, 0), name,
4069 PFID(&op_data->op_fid3));
4070 GOTO(out_unlock, rc = -EINVAL);
/* Short-circuit when the file already lives on the target MDT. */
4073 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4075 GOTO(out_unlock, rc);
4078 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4079 PFID(&op_data->op_fid3), mdtidx);
4080 GOTO(out_unlock, rc = 0);
/* Regular file: take a write lease and record its data version so the
 * MDS can verify the data was not modified during migration. */
4083 if (S_ISREG(child_inode->i_mode)) {
4084 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4088 GOTO(out_unlock, rc);
4091 rc = ll_data_version(child_inode, &data_version,
4094 GOTO(out_close, rc);
4096 op_data->op_handle = och->och_fh;
4097 op_data->op_data = och->och_mod;
4098 op_data->op_data_version = data_version;
4099 op_data->op_lease_handle = och->och_lease_handle;
4100 op_data->op_bias |= MDS_RENAME_MIGRATE;
/* Migration is expressed as a same-name rename with CLI_MIGRATE set. */
4103 op_data->op_mds = mdtidx;
4104 op_data->op_cli_flags = CLI_MIGRATE;
4105 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4106 namelen, name, namelen, &request);
4108 LASSERT(request != NULL);
4109 ll_update_times(request, parent);
4111 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4112 LASSERT(body != NULL);
4114 /* If the server does release layout lock, then we cleanup
4115 * the client och here, otherwise release it in out_close: */
4117 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4118 obd_mod_put(och->och_mod);
4119 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
/* Mark the handle dead so the lease-close below becomes a no-op. */
4121 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4127 if (request != NULL) {
4128 ptlrpc_req_finished(request);
4132 /* Try again if the file layout has changed. */
4133 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4137 if (och != NULL) /* close the file */
4138 ll_lease_close(och, child_inode, NULL);
/* On success the old entry is gone; drop its link count locally
 * (presumably guarded by rc == 0 in an elided condition — confirm). */
4140 clear_nlink(child_inode);
4142 inode_unlock(child_inode);
4146 ll_finish_md_op_data(op_data);
/* Lock method stub for "-o noflock" mounts; per the comment at the
 * ll_file_operations_noflock table below it reports flock()/fcntl()
 * locking as unsupported (ENOSYS).  Body elided in this listing. */
4151 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4159 * test if some locks matching bits and l_req_mode are acquired
4160 * - bits can be in different locks
4161 * - if found clear the common lock bits in *bits
4162 * - the bits not found, are kept in *bits
4164 * \param bits [IN] searched lock bits
4165 * \param l_req_mode [IN] searched lock mode
4166 * \retval boolean, true iff all bits are found
4168 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4170 struct lustre_handle lockh;
4171 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four standard modes. */
4172 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4173 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4182 fid = &ll_i2info(inode)->lli_fid;
4183 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4184 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching lock, never take a reference. */
4186 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4187 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* NOTE(review): (1 << i) is int-width; if MDS_INODELOCK_MAXSHIFT can be
 * >= 31 this should be (1ULL << i) to match the __u64 *bits — confirm. */
4188 policy.l_inodebits.bits = *bits & (1 << i);
4189 if (policy.l_inodebits.bits == 0)
4192 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4193 &policy, mode, &lockh)) {
4194 struct ldlm_lock *lock;
4196 lock = ldlm_handle2lock(&lockh);
/* Clear every bit covered by the matched lock, not just bit i. */
4199 ~(lock->l_policy_data.l_inodebits.bits);
4200 LDLM_LOCK_PUT(lock);
4202 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match a granted MD lock on @inode covering inodebits @bits in one
 * of the modes in @mode; on success the matched mode is returned and (unless
 * the caller passed LDLM_FL_TEST_LOCK in @flags) a reference is held via
 * @lockh.  Returns 0 when no lock matches.
 */
4209 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4210 struct lustre_handle *lockh, __u64 flags,
4211 enum ldlm_mode mode)
4213 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4218 fid = &ll_i2info(inode)->lli_fid;
4219 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4221 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4222 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate -ENOENT
 * into success for already-unlinked objects (with special handling for
 * striped directories) and log every other failure.
 */
4227 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4229 /* Already unlinked. Just update nlink and return success */
4230 if (rc == -ENOENT) {
4232 /* If it is striped directory, and there is bad stripe
4233 * Let's revalidate the dentry again, instead of returning
4235 if (S_ISDIR(inode->i_mode) &&
4236 ll_i2info(inode)->lli_lsm_md != NULL)
4239 /* This path cannot be hit for regular files unless in
4240 * case of obscure races, so no need to validate
4242 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4244 } else if (rc != 0) {
/* Permission/identity errors are expected noise; log them at D_INFO. */
4245 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4246 "%s: revalidate FID "DFID" error: rc = %d\n",
4247 ll_get_fsname(inode->i_sb, NULL, 0),
4248 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes with an intent-lock RPC to the MDS
 * (getattr by FID, no name).  On success, finish the intent, invalidate the
 * dentry if the object was unlinked, and convert lookup locks.
 */
4254 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4256 struct inode *inode = dentry->d_inode;
4257 struct obd_export *exp = ll_i2mdexp(inode);
4258 struct lookup_intent oit = {
4261 struct ptlrpc_request *req = NULL;
4262 struct md_op_data *op_data;
4266 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4267 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4269 /* Call getattr by fid, so do not provide name at all. */
4270 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4271 LUSTRE_OPC_ANY, NULL);
4272 if (IS_ERR(op_data))
4273 RETURN(PTR_ERR(op_data));
4275 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4276 ll_finish_md_op_data(op_data);
/* Map -ENOENT and friends for unlinked/raced objects. */
4278 rc = ll_inode_revalidate_fini(inode, rc);
4282 rc = ll_revalidate_it_finish(req, &oit, dentry);
4284 ll_intent_release(&oit);
4288 /* Unlinked? Unhash dentry, so it is not picked up later by
4289 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4290 * here to preserve get_cwd functionality on 2.6.
4292 if (!dentry->d_inode->i_nlink) {
4293 ll_lock_dcache(inode);
4294 d_lustre_invalidate(dentry, 0);
4295 ll_unlock_dcache(inode);
4298 ll_lookup_finish_locks(&oit, dentry);
4300 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr()) into the master inode: nlink, blocks, size and the
 * cached a/m/ctime in the ll_inode_info.
 */
4305 static int ll_merge_md_attr(struct inode *inode)
4307 struct cl_attr attr = { 0 };
/* Callers must only invoke this for striped directories. */
4310 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4311 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4312 &attr, ll_md_blocking_ast);
4316 set_nlink(inode, attr.cat_nlink);
4317 inode->i_blocks = attr.cat_blocks;
4318 i_size_write(inode, attr.cat_size);
4320 ll_i2info(inode)->lli_atime = attr.cat_atime;
4321 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4322 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Squash a device number so 32-bit compat stat() syscalls accept it:
 * keep only the low 8 bits of major and minor.
 */
4327 static inline dev_t ll_compat_encode_dev(dev_t dev)
4329 /* The compat_sys_*stat*() syscalls will fail unless the
4330 * device majors and minors are both less than 256. Note that
4331 * the value returned here will be passed through
4332 * old_encode_dev() in cp_compat_stat(). And so we are not
4333 * trying to return a valid compat (u16) device number, just
4334 * one that will pass the old_valid_dev() check. */
4336 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4339 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
/* Kernels >= 4.11 pass a struct path plus request mask/flags. */
4340 int ll_getattr(const struct path *path, struct kstat *stat,
4341 u32 request_mask, unsigned int flags)
4343 struct dentry *de = path->dentry;
/* Older kernels use the (vfsmount, dentry, kstat) signature. */
4345 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4348 struct inode *inode = de->d_inode;
4349 struct ll_sb_info *sbi = ll_i2sbi(inode);
4350 struct ll_inode_info *lli = ll_i2info(inode);
4353 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* Refresh MD attributes from the MDS before filling *stat. */
4355 rc = ll_inode_revalidate(de, IT_GETATTR);
4359 if (S_ISREG(inode->i_mode)) {
4360 /* In case of restore, the MDT has the right size and has
4361 * already sent it back without granting the layout lock,
4362 * inode is up-to-date so glimpse is useless.
4363 * Also to glimpse we need the layout, in case of a running
4364 * restore the MDT holds the layout lock so the glimpse will
4365 * block up to the end of restore (getattr will block)
4367 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4368 rc = ll_glimpse_size(inode);
4373 /* If object isn't regular a file then don't validate size. */
/* Striped directory: merge per-stripe attributes into the inode. */
4374 if (S_ISDIR(inode->i_mode) &&
4375 lli->lli_lsm_md != NULL) {
4376 rc = ll_merge_md_attr(inode);
4381 LTIME_S(inode->i_atime) = lli->lli_atime;
4382 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4383 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4386 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace: squash ino/dev so compat stat() does not fail. */
4388 if (ll_need_32bit_api(sbi)) {
4389 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4390 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4391 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4393 stat->ino = inode->i_ino;
4394 stat->dev = inode->i_sb->s_dev;
4395 stat->rdev = inode->i_rdev;
4398 stat->mode = inode->i_mode;
4399 stat->uid = inode->i_uid;
4400 stat->gid = inode->i_gid;
4401 stat->atime = inode->i_atime;
4402 stat->mtime = inode->i_mtime;
4403 stat->ctime = inode->i_ctime;
/* Prefer the per-fs tuned stat block size when it is configured. */
4404 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4406 stat->nlink = inode->i_nlink;
4407 stat->size = i_size_read(inode);
4408 stat->blocks = inode->i_blocks;
/*
 * FIEMAP inode operation: marshal the kernel's fiemap_extent_info into a
 * contiguous struct fiemap buffer (copying the caller's first extent in,
 * for the FIEMAP "continuation" convention), run ll_do_fiemap(), then copy
 * the mapped extents back out to userspace.
 */
4413 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4414 __u64 start, __u64 len)
4418 struct fiemap *fiemap;
4419 unsigned int extent_count = fieinfo->fi_extents_max;
4421 num_bytes = sizeof(*fiemap) + (extent_count *
4422 sizeof(struct fiemap_extent));
4423 OBD_ALLOC_LARGE(fiemap, num_bytes);
4428 fiemap->fm_flags = fieinfo->fi_flags;
4429 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4430 fiemap->fm_start = start;
4431 fiemap->fm_length = len;
/* Seed the request with the caller-provided first extent, if any. */
4432 if (extent_count > 0 &&
4433 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4434 sizeof(struct fiemap_extent)) != 0)
4435 GOTO(out, rc = -EFAULT);
4437 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4439 fieinfo->fi_flags = fiemap->fm_flags;
4440 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* Copy back only the extents actually mapped. */
4441 if (extent_count > 0 &&
4442 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4443 fiemap->fm_mapped_extents *
4444 sizeof(struct fiemap_extent)) != 0)
4445 GOTO(out, rc = -EFAULT);
4447 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * .get_acl inode operation: return a referenced copy of the cached POSIX
 * ACL under lli_lock; the VFS releases the reference after the check.
 */
4451 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4453 struct ll_inode_info *lli = ll_i2info(inode);
4454 struct posix_acl *acl = NULL;
4457 spin_lock(&lli->lli_lock);
4458 /* VFS' acl_permission_check->check_acl will release the refcount */
4459 acl = posix_acl_dup(lli->lli_posix_acl);
4460 spin_unlock(&lli->lli_lock);
4465 #ifdef HAVE_IOP_SET_ACL
4466 #ifdef CONFIG_FS_POSIX_ACL
/*
 * .set_acl inode operation: serialize @acl to its xattr representation and
 * store it via __vfs_setxattr(), updating i_mode for ACL_TYPE_ACCESS and
 * the local ACL cache on success.
 */
4467 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4469 const char *name = NULL;
4476 case ACL_TYPE_ACCESS:
/* An access ACL may imply a mode change; recompute i_mode first. */
4478 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4482 name = XATTR_NAME_POSIX_ACL_ACCESS;
4484 case ACL_TYPE_DEFAULT:
/* Default ACLs only make sense on directories. */
4485 if (!S_ISDIR(inode->i_mode))
4486 GOTO(out, rc = acl ? -EACCES : 0);
4487 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4490 GOTO(out, rc = -EINVAL);
4494 size = posix_acl_xattr_size(acl->a_count);
4495 value = kmalloc(size, GFP_NOFS);
4497 GOTO(out, rc = -ENOMEM);
4499 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4504 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4505 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* Keep the in-memory ACL cache coherent with the stored xattr. */
4510 set_cached_acl(inode, type, acl);
4512 forget_cached_acl(inode, type);
4515 #endif /* CONFIG_FS_POSIX_ACL */
4516 #endif /* HAVE_IOP_SET_ACL */
4518 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL callback for generic_permission() on older kernels: fetch the access
 * ACL and evaluate @mask against it.  Compiled out when the kernel's
 * generic_permission() handles ACLs itself.
 */
4520 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4521 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4523 ll_check_acl(struct inode *inode, int mask)
4526 # ifdef CONFIG_FS_POSIX_ACL
4527 struct posix_acl *acl;
4531 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* ll_get_acl() takes a spinlock, so bail out under RCU walk. */
4532 if (flags & IPERM_FLAG_RCU)
4535 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4540 rc = posix_acl_permission(inode, acl, mask);
4541 posix_acl_release(acl);
4544 # else /* !CONFIG_FS_POSIX_ACL */
4546 # endif /* CONFIG_FS_POSIX_ACL */
4548 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4550 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * .permission inode operation.  Revalidates the root inode on first touch,
 * applies root-squash by temporarily overriding the task credentials
 * (fsuid/fsgid and FS capabilities), then defers to generic permission
 * checking with ll_check_acl as the ACL callback.
 */
4551 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4553 # ifdef HAVE_INODE_PERMISION_2ARGS
4554 int ll_inode_permission(struct inode *inode, int mask)
4556 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4561 struct ll_sb_info *sbi;
4562 struct root_squash_info *squash;
4563 struct cred *cred = NULL;
4564 const struct cred *old_cred = NULL;
4566 bool squash_id = false;
/* Cannot block (revalidate RPC, prepare_creds) during RCU path walk. */
4569 #ifdef MAY_NOT_BLOCK
4570 if (mask & MAY_NOT_BLOCK)
4572 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4573 if (flags & IPERM_FLAG_RCU)
4577 /* as root inode are NOT getting validated in lookup operation,
4578 * need to do it before permission check. */
4580 if (inode == inode->i_sb->s_root->d_inode) {
4581 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4586 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4587 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4589 /* squash fsuid/fsgid if needed */
4590 sbi = ll_i2sbi(inode);
4591 squash = &sbi->ll_squash;
4592 if (unlikely(squash->rsi_uid != 0 &&
4593 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4594 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4598 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4599 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4600 squash->rsi_uid, squash->rsi_gid);
4602 /* update current process's credentials
4603 * and FS capability */
4604 cred = prepare_creds();
4608 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4609 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4610 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4611 if ((1 << cap) & CFS_CAP_FS_MASK)
4612 cap_lower(cred->cap_effective, cap);
4614 old_cred = override_creds(cred);
4617 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4618 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4619 /* restore current process's credentials and FS capability */
4621 revert_creds(old_cred);
4628 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock methods, so the kernel falls
 * back to local (single-node) POSIX/flock semantics. */
4629 struct file_operations ll_file_operations = {
4630 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4631 # ifdef HAVE_SYNC_READ_WRITE
4632 .read = new_sync_read,
4633 .write = new_sync_write,
4635 .read_iter = ll_file_read_iter,
4636 .write_iter = ll_file_write_iter,
4637 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4638 .read = ll_file_read,
4639 .aio_read = ll_file_aio_read,
4640 .write = ll_file_write,
4641 .aio_write = ll_file_aio_write,
4642 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4643 .unlocked_ioctl = ll_file_ioctl,
4644 .open = ll_file_open,
4645 .release = ll_file_release,
4646 .mmap = ll_file_mmap,
4647 .llseek = ll_file_seek,
4648 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: identical to the default table
 * but routes flock()/fcntl() locking through ll_file_flock for
 * cluster-coherent locks. */
4653 struct file_operations ll_file_operations_flock = {
4654 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4655 # ifdef HAVE_SYNC_READ_WRITE
4656 .read = new_sync_read,
4657 .write = new_sync_write,
4658 # endif /* HAVE_SYNC_READ_WRITE */
4659 .read_iter = ll_file_read_iter,
4660 .write_iter = ll_file_write_iter,
4661 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4662 .read = ll_file_read,
4663 .aio_read = ll_file_aio_read,
4664 .write = ll_file_write,
4665 .aio_write = ll_file_aio_write,
4666 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4667 .unlocked_ioctl = ll_file_ioctl,
4668 .open = ll_file_open,
4669 .release = ll_file_release,
4670 .mmap = ll_file_mmap,
4671 .llseek = ll_file_seek,
4672 .splice_read = ll_file_splice_read,
4675 .flock = ll_file_flock,
4676 .lock = ll_file_flock
4679 /* These are for -o noflock - to return ENOSYS on flock calls */
4680 struct file_operations ll_file_operations_noflock = {
4681 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4682 # ifdef HAVE_SYNC_READ_WRITE
4683 .read = new_sync_read,
4684 .write = new_sync_write,
4685 # endif /* HAVE_SYNC_READ_WRITE */
4686 .read_iter = ll_file_read_iter,
4687 .write_iter = ll_file_write_iter,
4688 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4689 .read = ll_file_read,
4690 .aio_read = ll_file_aio_read,
4691 .write = ll_file_write,
4692 .aio_write = ll_file_aio_write,
4693 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4694 .unlocked_ioctl = ll_file_ioctl,
4695 .open = ll_file_open,
4696 .release = ll_file_release,
4697 .mmap = ll_file_mmap,
4698 .llseek = ll_file_seek,
4699 .splice_read = ll_file_splice_read,
/* Both lock entry points report "unsupported" via ll_file_noflock. */
4702 .flock = ll_file_noflock,
4703 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL entries are compiled in
 * only when the kernel exposes the corresponding inode-op hooks. */
4706 struct inode_operations ll_file_inode_operations = {
4707 .setattr = ll_setattr,
4708 .getattr = ll_getattr,
4709 .permission = ll_inode_permission,
4710 #ifdef HAVE_IOP_XATTR
4711 .setxattr = ll_setxattr,
4712 .getxattr = ll_getxattr,
4713 .removexattr = ll_removexattr,
4715 .listxattr = ll_listxattr,
4716 .fiemap = ll_fiemap,
4717 #ifdef HAVE_IOP_GET_ACL
4718 .get_acl = ll_get_acl,
4720 #ifdef HAVE_IOP_SET_ACL
4721 .set_acl = ll_set_acl,
/*
 * Push a layout configuration change down to the cl_object stack via
 * cl_conf_set().  For OBJECT_CONF_SET, additionally allow the layout lock
 * to be matched and record the new layout generation on the inode.
 */
4725 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4727 struct ll_inode_info *lli = ll_i2info(inode);
4728 struct cl_object *obj = lli->lli_clob;
4737 env = cl_env_get(&refcheck);
4739 RETURN(PTR_ERR(env));
4741 rc = cl_conf_set(env, lli->lli_clob, conf);
4745 if (conf->coc_opc == OBJECT_CONF_SET) {
4746 struct ldlm_lock *lock = conf->coc_lock;
4747 struct cl_layout cl = {
4751 LASSERT(lock != NULL);
4752 LASSERT(ldlm_has_layout(lock));
4754 /* it can only be allowed to match after layout is
4755 * applied to inode otherwise false layout would be
4756 * seen. Applying layout should happen before dropping
4757 * the intent lock. */
4758 ldlm_lock_allow_match(lock);
4760 rc = cl_object_layout_get(env, obj, &cl);
4765 DFID": layout version change: %u -> %u\n",
4766 PFID(&lli->lli_fid), ll_layout_version_get(lli),
/* Remember the generation so later IO can detect layout changes. */
4768 ll_layout_version_set(lli, cl.cl_layout_gen);
4772 cl_env_put(env, &refcheck);
4777 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4778 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4781 struct ll_sb_info *sbi = ll_i2sbi(inode);
4782 struct ptlrpc_request *req;
4783 struct mdt_body *body;
4790 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4791 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4792 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
4794 if (lock->l_lvb_data != NULL)
4797 /* if layout lock was granted right away, the layout is returned
4798 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4799 * blocked and then granted via completion ast, we have to fetch
4800 * layout here. Please note that we can't use the LVB buffer in
4801 * completion AST because it doesn't have a large enough buffer */
4802 rc = ll_get_default_mdsize(sbi, &lmmsize);
4804 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4805 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4810 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4812 GOTO(out, rc = -EPROTO);
4814 lmmsize = body->mbo_eadatasize;
4815 if (lmmsize == 0) /* empty layout */
4818 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4820 GOTO(out, rc = -EFAULT);
/* Copy the LOV EA into a buffer that can outlive the RPC reply. */
4822 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4823 if (lvbdata == NULL)
4824 GOTO(out, rc = -ENOMEM);
4826 memcpy(lvbdata, lmm, lmmsize);
4827 lock_res_and_lock(lock);
/* Attach the buffer only if no one raced us; otherwise free ours. */
4828 if (unlikely(lock->l_lvb_data == NULL)) {
4829 lock->l_lvb_type = LVB_T_LAYOUT;
4830 lock->l_lvb_data = lvbdata;
4831 lock->l_lvb_len = lmmsize;
4834 unlock_res_and_lock(lock);
4837 OBD_FREE_LARGE(lvbdata, lmmsize);
4842 ptlrpc_req_finished(req);
4847 * Apply the layout to the inode. Layout lock is held and will be released
4850 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4851 struct inode *inode)
4853 struct ll_inode_info *lli = ll_i2info(inode);
4854 struct ll_sb_info *sbi = ll_i2sbi(inode);
4855 struct ldlm_lock *lock;
4856 struct cl_object_conf conf;
4859 bool wait_layout = false;
4862 LASSERT(lustre_handle_is_used(lockh));
4864 lock = ldlm_handle2lock(lockh);
4865 LASSERT(lock != NULL);
4866 LASSERT(ldlm_has_layout(lock));
4868 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4869 PFID(&lli->lli_fid), inode);
4871 /* in case this is a caching lock and reinstate with new inode */
4872 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4874 lock_res_and_lock(lock);
4875 lvb_ready = ldlm_is_lvb_ready(lock);
4876 unlock_res_and_lock(lock);
4878 /* checking lvb_ready is racy but this is okay. The worst case is
4879 * that multi processes may configure the file on the same time. */
/* Make sure the layout blob is attached to the lock's LVB. */
4883 rc = ll_layout_fetch(inode, lock);
4887 /* for layout lock, lmm is stored in lock's lvb.
4888 * lvb_data is immutable if the lock is held so it's safe to access it
4891 * set layout to file. Unlikely this will fail as old layout was
4892 * surely eliminated */
4893 memset(&conf, 0, sizeof conf);
4894 conf.coc_opc = OBJECT_CONF_SET;
4895 conf.coc_inode = inode;
4896 conf.coc_lock = lock;
4897 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4898 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4899 rc = ll_layout_conf(inode, &conf);
4901 /* refresh layout failed, need to wait */
4902 wait_layout = rc == -EBUSY;
/* Done with the lock: drop our reference and the caller's mode ref. */
4905 LDLM_LOCK_PUT(lock);
4906 ldlm_lock_decref(lockh, mode);
4908 /* wait for IO to complete if it's still being used. */
4910 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4911 ll_get_fsname(inode->i_sb, NULL, 0),
4912 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout drains. */
4914 memset(&conf, 0, sizeof conf);
4915 conf.coc_opc = OBJECT_CONF_WAIT;
4916 conf.coc_inode = inode;
4917 rc = ll_layout_conf(inode, &conf);
4921 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4922 ll_get_fsname(inode->i_sb, NULL, 0),
4923 PFID(&lli->lli_fid), rc);
4929 * Issue layout intent RPC to MDS.
4930 * \param inode [in] file inode
4931 * \param intent [in] layout intent
4933 * \retval 0 on success
4934 * \retval < 0 error code
4936 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4938 struct ll_inode_info *lli = ll_i2info(inode);
4939 struct ll_sb_info *sbi = ll_i2sbi(inode);
4940 struct md_op_data *op_data;
4941 struct lookup_intent it;
4942 struct ptlrpc_request *req;
4946 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4947 0, 0, LUSTRE_OPC_ANY, NULL);
4948 if (IS_ERR(op_data))
4949 RETURN(PTR_ERR(op_data));
/* The layout_intent structure rides in op_data as opaque intent data. */
4951 op_data->op_data = intent;
4952 op_data->op_data_size = sizeof(*intent);
4954 memset(&it, 0, sizeof(it));
4955 it.it_op = IT_LAYOUT;
/* Write/truncate intents need a write-mode layout lock. */
4956 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4957 intent->li_opc == LAYOUT_INTENT_TRUNC)
4958 it.it_flags = FMODE_WRITE;
4960 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4961 ll_get_fsname(inode->i_sb, NULL, 0),
4962 PFID(&lli->lli_fid), inode);
4964 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4965 &ll_md_blocking_ast, 0);
4966 if (it.it_request != NULL)
4967 ptlrpc_req_finished(it.it_request);
4968 it.it_request = NULL;
4970 ll_finish_md_op_data(op_data);
4972 /* set lock data in case this is a new lock */
4974 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4976 ll_intent_drop_lock(&it);
4982 * This function checks if there exists a LAYOUT lock on the client side,
4983 * or enqueues it if it doesn't have one in cache.
4985 * This function will not hold layout lock so it may be revoked any time after
4986 * this function returns. Any operations depend on layout should be redone
4989 * This function should be called before lov_io_init() to get an uptodate
4990 * layout version, the caller should save the version number and after IO
4991 * is finished, this function should be called again to verify that layout
4992 * is not changed during IO time.
4994 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4996 struct ll_inode_info *lli = ll_i2info(inode);
4997 struct ll_sb_info *sbi = ll_i2sbi(inode);
4998 struct lustre_handle lockh;
4999 struct layout_intent intent = {
5000 .li_opc = LAYOUT_INTENT_ACCESS,
5002 enum ldlm_mode mode;
/* Fast path: layout locking disabled, or a valid generation is cached. */
5006 *gen = ll_layout_version_get(lli);
5007 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5011 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5012 LASSERT(S_ISREG(inode->i_mode));
5014 /* take layout lock mutex to enqueue layout lock exclusively. */
5015 mutex_lock(&lli->lli_layout_mutex);
5018 /* mostly layout lock is caching on the local side, so try to
5019 * match it before grabbing layout lock mutex. */
5020 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5021 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5022 if (mode != 0) { /* hit cached lock */
5023 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue a layout lock via an intent RPC. */
5029 rc = ll_layout_intent(inode, &intent);
5035 *gen = ll_layout_version_get(lli);
5036 mutex_unlock(&lli->lli_layout_mutex);
5042 * Issue layout intent RPC indicating where in a file an IO is about to write.
5044 * \param[in] inode file inode.
5045 * \param[in] ext write range with start offset of file in bytes where
5046 * an IO is about to write, and exclusive end offset in
5049 * \retval 0 on success
5050 * \retval < 0 error code
5052 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5053 struct lu_extent *ext)
5055 struct layout_intent intent = {
5057 .li_extent.e_start = ext->e_start,
5058 .li_extent.e_end = ext->e_end,
5063 rc = ll_layout_intent(inode, &intent);
5069 * This function send a restore request to the MDT
5071 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5073 struct hsm_user_request *hur;
5077 len = sizeof(struct hsm_user_request) +
5078 sizeof(struct hsm_user_item);
5079 OBD_ALLOC(hur, len);
5083 hur->hur_request.hr_action = HUA_RESTORE;
5084 hur->hur_request.hr_archive_id = 0;
5085 hur->hur_request.hr_flags = 0;
5086 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5087 sizeof(hur->hur_user_item[0].hui_fid));
5088 hur->hur_user_item[0].hui_extent.offset = offset;
5089 hur->hur_user_item[0].hui_extent.length = length;
5090 hur->hur_request.hr_itemcount = 1;
5091 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,