4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): the source is elided here -- this member appears to belong
 * to a struct whose opening brace is not visible (presumably the parameter
 * struct used by the layout-split close path below, where "sp->sp_inode"
 * and "sp->sp_mirror_id" are read); confirm against the full file. */
57 struct inode *sp_inode;
/* Forward declarations for helpers defined later in this file. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data object from its dedicated slab.
 * GFP_NOFS avoids re-entering the filesystem from memory reclaim.
 * NOTE(review): source is elided -- the allocation-failure check and the
 * return statement are not visible in this view. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-failure state. */
75 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * to the ll_file_data_slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's current metadata (mode, a/m/ctime, size, blocks,
 * flags) plus the open handle into @op_data so the MDT records the final
 * state of this open at close time. */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every packed attribute valid so the server applies them all. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* LLIF_DATA_MODIFIED is test-and-cleared: only the first close after a
 * modification reports MDS_DATA_MODIFIED to the MDT. */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the MDS_CLOSE RPC for @och, optionally carrying a close intent
 * (@bias) whose payload is @data.  Frees @op_data and the close request;
 * marks the handle dead on the way out. */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Defensive: the MDC export can be gone during umount/eviction. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* NOTE(review): the switch statement header on @bias is elided from this
 * view; the cases below dispatch on the close intent. */
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): no break above -- appears to be an intentional
 * fall-through into the SPLIT/SWAP handling; confirm upstream. */
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
/* SPLIT passes a split_param; MERGE/SWAP pass the victim inode itself. */
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
/* Scale the block estimate by the number of resynced mirrors. */
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
191 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the data version sampled before release. */
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default case (elided header): a plain close carries no payload. */
198 LASSERT(data == NULL);
202 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is a normal application-driven abort; don't log it as an error. */
203 if (rc != 0 && rc != -EINTR)
204 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
205 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If a close intent was requested, check the server actually executed it. */
207 if (rc == 0 && op_data->op_bias & bias) {
208 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
215 ll_finish_md_op_data(op_data);
219 md_clear_open_replay_data(md_exp, och);
/* Poison the file handle so reuse of a dead handle is detectable. */
220 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
223 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given mode (read/write/exec) for
 * @inode, unless other users of the same cached handle remain.
 * NOTE(review): the lines that swap *och_p out under the mutex are elided
 * from this view. */
227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
229 struct ll_inode_info *lli = ll_i2info(inode);
230 struct obd_client_handle **och_p;
231 struct obd_client_handle *och;
/* Pick the cached handle slot and its use count by open mode. */
236 if (fmode & FMODE_WRITE) {
237 och_p = &lli->lli_mds_write_och;
238 och_usecount = &lli->lli_open_fd_write_count;
239 } else if (fmode & FMODE_EXEC) {
240 och_p = &lli->lli_mds_exec_och;
241 och_usecount = &lli->lli_open_fd_exec_count;
243 LASSERT(fmode & FMODE_READ);
244 och_p = &lli->lli_mds_read_och;
245 och_usecount = &lli->lli_open_fd_read_count;
248 mutex_lock(&lli->lli_och_mutex);
249 if (*och_usecount > 0) {
250 /* There are still users of this handle, so skip
252 mutex_unlock(&lli->lli_och_mutex);
258 mutex_unlock(&lli->lli_och_mutex);
261 /* There might be a race and this handle may already
263 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-struct-file close: drop group lock and lease state, decrement the
 * open-mode use count, and only talk to the MDS when no cached OPEN lock
 * covers this file anymore.  Frees the ll_file_data on the way out. */
269 static int ll_md_close(struct inode *inode, struct file *file)
271 union ldlm_policy_data policy = {
272 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: match only, do not take a reference on the lock. */
274 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
276 struct ll_inode_info *lli = ll_i2info(inode);
277 struct lustre_handle lockh;
278 enum ldlm_mode lockmode;
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
/* fd_och holds an open handle taken over for a lease; close it now. */
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 mutex_lock(&lli->lli_och_mutex);
/* NOTE(review): the assignments of "lockmode" per branch are elided from
 * this view; only the use-count bookkeeping is visible. */
307 if (fd->fd_omode & FMODE_WRITE) {
309 LASSERT(lli->lli_open_fd_write_count);
310 lli->lli_open_fd_write_count--;
311 } else if (fd->fd_omode & FMODE_EXEC) {
313 LASSERT(lli->lli_open_fd_exec_count);
314 lli->lli_open_fd_exec_count--;
317 LASSERT(lli->lli_open_fd_read_count);
318 lli->lli_open_fd_read_count--;
320 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do the real close RPC. */
322 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
323 LDLM_IBITS, &policy, lockmode, &lockh))
324 rc = ll_md_real_close(inode, fd->fd_omode);
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
333 /* While this returns an error code, fput() the caller does not, so we need
334 * to make every effort to clean up all of our state here. Also, applications
335 * rarely check close errors and even if an error is returned they will not
336 * re-try the close call.
/* VFS ->release() entry point: statahead deauthorization, async-rc
 * harvesting, then the metadata close via ll_md_close(). */
338 int ll_file_release(struct inode *inode, struct file *file)
340 struct ll_file_data *fd;
341 struct ll_sb_info *sbi = ll_i2sbi(inode);
342 struct ll_inode_info *lli = ll_i2info(inode);
346 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
347 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the fs root in the stats. */
349 if (inode->i_sb->s_root != file_dentry(file))
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354 /* The last ref on @file, maybe not the the owner pid of statahead,
355 * because parent and child process can share the same file handle. */
356 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
357 ll_deauthorize_statahead(inode, fd);
/* The root dentry has no MDS open handle: just drop the file data. */
359 if (inode->i_sb->s_root == file_dentry(file)) {
360 LUSTRE_FPRIVATE(file) = NULL;
361 ll_file_data_put(fd);
/* Collect async write errors recorded against the cl_object so they can
 * be reported from this close. */
365 if (!S_ISDIR(inode->i_mode)) {
366 if (lli->lli_clob != NULL)
367 lov_read_and_clear_async_rc(lli->lli_clob);
368 lli->lli_async_rc = 0;
371 rc = ll_md_close(inode, file);
373 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
374 libcfs_debug_dumplog();
/* Issue an intent-open (IT_OPEN) enqueue to the MDS for @de, packing the
 * layout @lmm/@lmmsize if given.  On success installs the returned
 * attributes into the inode and records the granted lock. */
379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
380 struct lookup_intent *itp)
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keep own exit path - don`t flood log
414 * with messages with -ESTALE errors.
/* On -ESTALE (elided check above, presumably): drop any open handle the
 * server may have created despite the error -- TODO confirm. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and stash the granted lock. */
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
440 /* We did open by fid, but by the time we got to the server,
441 * the object disappeared. If this is a create, we cannot really
442 * tell the userspace that the file it was trying to create
443 * does not exist. Instead let's return -ESTALE, and the VFS will
444 * retry the create with LOOKUP_REVAL that we are going to catch
445 * in ll_revalidate_dentry() and use lookup then.
447 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate @och from the MDT reply carried in the intent: file handle,
 * fid, lease-lock cookie and open flags; then register the open for
 * replay in case of MDS recovery. */
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 struct obd_client_handle *och)
456 struct mdt_body *body;
458 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
459 och->och_fh = body->mbo_handle;
460 och->och_fid = body->mbo_fid1;
461 och->och_lease_handle.cookie = it->it_lock_handle;
462 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
463 och->och_flags = it->it_flags;
465 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-side part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the struct file and initialise its
 * readahead and cl_io bookkeeping. */
468 static int ll_local_open(struct file *file, struct lookup_intent *it,
469 struct ll_file_data *fd, struct obd_client_handle *och)
471 struct inode *inode = file_inode(file);
474 LASSERT(!LUSTRE_FPRIVATE(file));
481 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
486 LUSTRE_FPRIVATE(file) = fd;
487 ll_readahead_init(inode, &fd->fd_ras);
/* Remember only the access-mode bits of how this fd was opened. */
488 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
490 /* ll_cl_context initialize */
491 rwlock_init(&fd->fd_lock);
492 INIT_LIST_HEAD(&fd->fd_lccs);
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses a cached MDS open handle of the same
 * mode when one exists; otherwise performs an intent open by FID. */
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
522 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed here by the lookup path. */
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_nofiledata, rc = -ENOMEM);
532 if (S_ISDIR(inode->i_mode))
533 ll_authorize_statahead(inode, fd);
/* The fs root needs no MDS open; just attach the file data. */
535 if (inode->i_sb->s_root == file_dentry(file)) {
536 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own from f_flags. */
540 if (!it || !it->it_disposition) {
541 /* Convert f_flags into access mode. We cannot use file->f_mode,
542 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits -- the conversion body is elided here. */
544 if ((oit.it_flags + 1) & O_ACCMODE)
546 if (file->f_flags & O_TRUNC)
547 oit.it_flags |= FMODE_WRITE;
549 /* kernel only call f_op->open in dentry_open. filp_open calls
550 * dentry_open after call to open_namei that checks permissions.
551 * Only nfsd_open call dentry_open directly without checking
552 * permissions and because of that this code below is safe. */
553 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
554 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556 /* We do not want O_EXCL here, presumably we opened the file
557 * already? XXX - NFS implications? */
558 oit.it_flags &= ~O_EXCL;
560 /* bug20584, if "it_flags" contains O_CREAT, the file will be
561 * created if necessary, then "IT_CREAT" should be set to keep
562 * consistent with it */
563 if (oit.it_flags & O_CREAT)
564 oit.it_op |= IT_CREAT;
570 /* Let's see if we have file open on MDS already. */
571 if (it->it_flags & FMODE_WRITE) {
572 och_p = &lli->lli_mds_write_och;
573 och_usecount = &lli->lli_open_fd_write_count;
574 } else if (it->it_flags & FMODE_EXEC) {
575 och_p = &lli->lli_mds_exec_och;
576 och_usecount = &lli->lli_open_fd_exec_count;
578 och_p = &lli->lli_mds_read_och;
579 och_usecount = &lli->lli_open_fd_read_count;
582 mutex_lock(&lli->lli_och_mutex);
583 if (*och_p) { /* Open handle is present */
584 if (it_disposition(it, DISP_OPEN_OPEN)) {
585 /* Well, there's extra open request that we do not need,
586 let's close it somehow. This will decref request. */
587 rc = it_open_error(DISP_OPEN_OPEN, it);
589 mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle; pass NULL so ll_local_open skips och fill. */
597 rc = ll_local_open(file, it, fd, NULL);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
604 LASSERT(*och_usecount == 0);
605 if (!it->it_disposition) {
606 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
607 /* We cannot just request lock handle now, new ELC code
608 means that one of other OPEN locks for this file
609 could be cancelled, and since blocking ast handler
610 would attempt to grab och_mutex as well, that would
611 result in a deadlock */
612 mutex_unlock(&lli->lli_och_mutex);
614 * Normally called under two situations:
616 * 2. A race/condition on MDS resulting in no open
617 * handle to be returned from LOOKUP|OPEN request,
618 * for example if the target entry was a symlink.
620 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
621 * marked by a bit set in ll_iget_for_nfs. Clear the
622 * bit so that it's not confusing later callers.
624 * NB; when ldd is NULL, it must have come via normal
625 * lookup path only, since ll_iget_for_nfs always calls
628 if (ldd && ldd->lld_nfs_dentry) {
629 ldd->lld_nfs_dentry = 0;
630 it->it_flags |= MDS_OPEN_LOCK;
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
641 GOTO(out_openerr, rc);
/* Allocate the cached handle slot for this open mode.  NOTE(review):
 * the retry/goto that re-takes lli_och_mutex is elided from this view. */
645 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
647 GOTO(out_och_free, rc = -ENOMEM);
651 /* md_intent_lock() didn't get a request ref if there was an
652 * open error, so don't do cleanup on the request here
654 /* XXX (green): Should not we bail out on any error here, not
655 * just open error? */
656 rc = it_open_error(DISP_OPEN_OPEN, it);
658 GOTO(out_och_free, rc);
660 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
661 "inode %p: disposition %x, status %d\n", inode,
662 it_disposition(it, ~0), it->it_status);
664 rc = ll_local_open(file, it, fd, *och_p);
666 GOTO(out_och_free, rc);
668 mutex_unlock(&lli->lli_och_mutex);
671 /* Must do this outside lli_och_mutex lock to prevent deadlock where
672 different kind of OPEN lock for this same inode gets cancelled
673 by ldlm_cancel_lru */
674 if (!S_ISREG(inode->i_mode))
675 GOTO(out_och_free, rc);
677 cl_lov_delay_create_clear(&file->f_flags);
678 GOTO(out_och_free, rc);
/* Error/cleanup labels (elided): free the och slot, undo statahead
 * authorization, drop fd, tally stats, release the intent's request. */
682 if (och_p && *och_p) {
683 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
684 *och_p = NULL; /* OBD_FREE writes some magic there */
687 mutex_unlock(&lli->lli_och_mutex);
690 if (lli->lli_opendir_key == fd)
691 ll_deauthorize_statahead(inode, fd);
693 ll_file_data_put(fd);
695 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
699 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
700 ptlrpc_req_finished(it->it_request);
701 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict, cancel the lease lock
 * asynchronously.  Unlike ll_md_blocking_ast this deliberately does not
 * touch the open handle (see comments at the ll_lease_open() call site). */
707 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
708 struct ldlm_lock_desc *desc, void *data, int flag)
711 struct lustre_handle lockh;
715 case LDLM_CB_BLOCKING:
716 ldlm_lock2handle(lock, &lockh);
717 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
719 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
723 case LDLM_CB_CANCELING:
731 * When setting a lease on a file, we take ownership of the lli_mds_*_och
732 * and save it as fd->fd_och so as to force client to reopen the file even
733 * if it has an open lock in cache already.
/* Returns -EBUSY if a lease already exists on this fd or the cached open
 * handle is shared by other fds.  NOTE(review): the lines that move the
 * handle into fd->fd_och are elided from this view. */
735 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
736 struct lustre_handle *old_handle)
738 struct ll_inode_info *lli = ll_i2info(inode);
739 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
740 struct obd_client_handle **och_p;
745 /* Get the openhandle of the file */
746 mutex_lock(&lli->lli_och_mutex);
747 if (fd->fd_lease_och != NULL)
748 GOTO(out_unlock, rc = -EBUSY);
750 if (fd->fd_och == NULL) {
751 if (file->f_mode & FMODE_WRITE) {
752 LASSERT(lli->lli_mds_write_och != NULL);
753 och_p = &lli->lli_mds_write_och;
754 och_usecount = &lli->lli_open_fd_write_count;
756 LASSERT(lli->lli_mds_read_och != NULL);
757 och_p = &lli->lli_mds_read_och;
758 och_usecount = &lli->lli_open_fd_read_count;
/* Can only take sole ownership when no other fd shares the handle. */
761 if (*och_usecount > 1)
762 GOTO(out_unlock, rc = -EBUSY);
769 *old_handle = fd->fd_och->och_fh;
773 mutex_unlock(&lli->lli_och_mutex);
778 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Hand fd->fd_och back to the per-inode cache slot, or close it if the
 * slot was re-populated by another open in the meantime. */
780 static int ll_lease_och_release(struct inode *inode, struct file *file)
782 struct ll_inode_info *lli = ll_i2info(inode);
783 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
784 struct obd_client_handle **och_p;
785 struct obd_client_handle *old_och = NULL;
790 mutex_lock(&lli->lli_och_mutex);
791 if (file->f_mode & FMODE_WRITE) {
792 och_p = &lli->lli_mds_write_och;
793 och_usecount = &lli->lli_open_fd_write_count;
795 och_p = &lli->lli_mds_read_och;
796 och_usecount = &lli->lli_open_fd_read_count;
799 /* The file may have been open by another process (broken lease) so
800 * *och_p is not NULL. In this case we should simply increase usecount
803 if (*och_p != NULL) {
804 old_och = fd->fd_och;
811 mutex_unlock(&lli->lli_och_mutex);
/* Close the superfluous handle outside the mutex. */
814 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
820 * Acquire a lease and open the file.
/* @fmode must be exactly FMODE_READ or FMODE_WRITE and must be permitted
 * by how @file was opened.  Returns the new open handle holding the
 * lease, or an ERR_PTR. */
822 static struct obd_client_handle *
823 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
826 struct lookup_intent it = { .it_op = IT_OPEN };
827 struct ll_sb_info *sbi = ll_i2sbi(inode);
828 struct md_op_data *op_data;
829 struct ptlrpc_request *req = NULL;
830 struct lustre_handle old_handle = { 0 };
831 struct obd_client_handle *och = NULL;
836 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
837 RETURN(ERR_PTR(-EINVAL));
840 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
841 RETURN(ERR_PTR(-EPERM));
843 rc = ll_lease_och_acquire(inode, file, &old_handle);
/* NOTE(review): och allocation is elided here; -ENOMEM on failure. */
850 RETURN(ERR_PTR(-ENOMEM));
852 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
853 LUSTRE_OPC_ANY, NULL);
855 GOTO(out, rc = PTR_ERR(op_data));
857 /* To tell the MDT this openhandle is from the same owner */
858 op_data->op_handle = old_handle;
860 it.it_flags = fmode | open_flags;
861 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
862 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
863 &ll_md_blocking_lease_ast,
864 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
865 * it can be cancelled which may mislead applications that the lease is
867 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
868 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
869 * doesn't deal with openhandle, so normal openhandle will be leaked. */
870 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
871 ll_finish_md_op_data(op_data);
872 ptlrpc_req_finished(req);
874 GOTO(out_release_it, rc);
876 if (it_disposition(&it, DISP_LOOKUP_NEG))
877 GOTO(out_release_it, rc = -ENOENT);
879 rc = it_open_error(DISP_OPEN_OPEN, &it);
881 GOTO(out_release_it, rc);
883 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
884 ll_och_fill(sbi->ll_md_exp, &it, och);
886 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
887 GOTO(out_close, rc = -EOPNOTSUPP);
889 /* already get lease, handle lease lock */
890 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
891 if (it.it_lock_mode == 0 ||
892 it.it_lock_bits != MDS_INODELOCK_OPEN) {
893 /* open lock must return for lease */
894 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
895 PFID(ll_inode2fid(inode)), it.it_lock_mode,
897 GOTO(out_close, rc = -EPROTO);
900 ll_intent_release(&it);
/* Error unwind: cancel the open lock, close the handle, drop intent. */
904 /* Cancel open lock */
905 if (it.it_lock_mode != 0) {
906 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
909 och->och_lease_handle.cookie = 0ULL;
911 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
913 CERROR("%s: error closing file "DFID": %d\n",
914 ll_get_fsname(inode->i_sb, NULL, 0),
915 PFID(&ll_i2info(inode)->lli_fid), rc2);
916 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
918 ll_intent_release(&it);
926 * Check whether a layout swap can be done between two inodes.
928 * \param[in] inode1 First inode to check
929 * \param[in] inode2 Second inode to check
931 * \retval 0 on success, layout swap can be performed between both inodes
932 * \retval negative error code if requirements are not met
/* Both must be regular files on the same superblock, writable by the
 * caller.  NOTE(review): the specific error codes returned by each failed
 * check are elided from this view. */
934 static int ll_check_swap_layouts_validity(struct inode *inode1,
935 struct inode *inode2)
937 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
940 if (inode_permission(inode1, MAY_WRITE) ||
941 inode_permission(inode2, MAY_WRITE))
944 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP intent, swapping layouts
 * between @inode and @inode2 atomically with the close.  Identical fids
 * are rejected (cannot swap a file with itself). */
950 static int ll_swap_layouts_close(struct obd_client_handle *och,
951 struct inode *inode, struct inode *inode2)
953 const struct lu_fid *fid1 = ll_inode2fid(inode);
954 const struct lu_fid *fid2;
958 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
959 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
961 rc = ll_check_swap_layouts_validity(inode, inode2);
963 GOTO(out_free_och, rc);
965 /* We now know that inode2 is a lustre inode */
966 fid2 = ll_inode2fid(inode2);
968 rc = lu_fid_cmp(fid1, fid2);
970 GOTO(out_free_och, rc = -EINVAL);
972 /* Close the file and {swap,merge} layouts between inode & inode2.
973 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
974 * because we still need it to pack l_remote_handle to MDT. */
975 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
978 och = NULL; /* freed in ll_close_inode_openhandle() */
988 * Release lease and close the file.
989 * It will check if the lease has ever broken.
/* If the lease lock was already cancelled (broken), the close intent is
 * skipped; otherwise cancel the lease (when no bias) and close with the
 * requested @bias/@data. */
991 static int ll_lease_close_intent(struct obd_client_handle *och,
993 bool *lease_broken, enum mds_op_bias bias,
996 struct ldlm_lock *lock;
997 bool cancelled = true;
1001 lock = ldlm_handle2lock(&och->och_lease_handle);
1003 lock_res_and_lock(lock);
1004 cancelled = ldlm_is_cancel(lock);
1005 unlock_res_and_lock(lock);
1006 LDLM_LOCK_PUT(lock);
1009 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1010 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1012 if (lease_broken != NULL)
1013 *lease_broken = cancelled;
1015 if (!cancelled && !bias)
1016 ldlm_cli_cancel(&och->och_lease_handle, 0);
1018 if (cancelled) { /* no need to excute intent */
1023 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: no close intent, no payload. */
1027 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1030 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1034 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/* Flush dirty pages first (LL_DV_WR_FLUSH) so that a layout-version bump
 * on the server cannot invalidate still-cached writes, then ask the MDT
 * to start mirror resync under the lease in @och. */
1036 static int ll_lease_file_resync(struct obd_client_handle *och,
1037 struct inode *inode)
1039 struct ll_sb_info *sbi = ll_i2sbi(inode);
1040 struct md_op_data *op_data;
1041 __u64 data_version_unused;
1045 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1046 LUSTRE_OPC_ANY, NULL);
1047 if (IS_ERR(op_data))
1048 RETURN(PTR_ERR(op_data));
1050 /* before starting file resync, it's necessary to clean up page cache
1051 * in client memory, otherwise once the layout version is increased,
1052 * writing back cached data will be denied the OSTs. */
1053 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1057 op_data->op_handle = och->och_lease_handle;
1058 rc = md_file_resync(sbi->ll_md_exp, op_data);
1064 ll_finish_md_op_data(op_data);
/* Merge MDS-provided timestamps cached in lli with OST attributes
 * (size/blocks/times) obtained via cl_object_attr_get(), keeping the
 * newest of each timestamp, and publish the result into the inode. */
1068 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1070 struct ll_inode_info *lli = ll_i2info(inode);
1071 struct cl_object *obj = lli->lli_clob;
1072 struct cl_attr *attr = vvp_env_thread_attr(env);
1080 ll_inode_size_lock(inode);
1082 /* Merge timestamps the most recently obtained from MDS with
1083 * timestamps obtained from OSTs.
1085 * Do not overwrite atime of inode because it may be refreshed
1086 * by file_accessed() function. If the read was served by cache
1087 * data, there is no RPC to be sent so that atime may not be
1088 * transferred to OSTs at all. MDT only updates atime at close time
1089 * if it's at least 'mdd.*.atime_diff' older.
1090 * All in all, the atime in Lustre does not strictly comply with
1091 * POSIX. Solving this problem needs to send an RPC to MDT for each
1092 * read, this will hurt performance. */
1093 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1094 LTIME_S(inode->i_atime) = lli->lli_atime;
1095 lli->lli_update_atime = 0;
1097 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1098 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Work on local copies, then write the merged values back below. */
1100 atime = LTIME_S(inode->i_atime);
1101 mtime = LTIME_S(inode->i_mtime);
1102 ctime = LTIME_S(inode->i_ctime);
1104 cl_object_attr_lock(obj);
1105 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1108 rc = cl_object_attr_get(env, obj, attr);
1109 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects) is not an error for this merge. */
1112 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1114 if (atime < attr->cat_atime)
1115 atime = attr->cat_atime;
1117 if (ctime < attr->cat_ctime)
1118 ctime = attr->cat_ctime;
1120 if (mtime < attr->cat_mtime)
1121 mtime = attr->cat_mtime;
1123 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1124 PFID(&lli->lli_fid), attr->cat_size);
1126 i_size_write(inode, attr->cat_size);
1127 inode->i_blocks = attr->cat_blocks;
1129 LTIME_S(inode->i_atime) = atime;
1130 LTIME_S(inode->i_mtime) = mtime;
1131 LTIME_S(inode->i_ctime) = ctime;
1134 ll_inode_size_unlock(inode);
1140 * Set designated mirror for I/O.
1142 * So far only read, write, and truncated can support to issue I/O to
1143 * designated mirror.
/* Copy the fd's designated-mirror selection and pinned layout version
 * into @io; generic I/O gets layout version 0 so restarts don't carry a
 * stale version. */
1145 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1147 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1149 /* clear layout version for generic(non-resync) I/O in case it carries
1150 * stale layout version due to I/O restart */
1151 io->ci_layout_version = 0;
1153 /* FLR: disable non-delay for designated mirror I/O because obviously
1154 * only one mirror is available */
1155 if (fd->fd_designated_mirror > 0) {
1157 io->ci_designated_mirror = fd->fd_designated_mirror;
1158 io->ci_layout_version = fd->fd_layout_version;
1159 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1163 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1164 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates should be skipped for this open, checking
 * the O_NOATIME flag, inode and mount noatime flags, and the dir-specific
 * variants.  NOTE(review): the "return true/false" lines after each check
 * are elided from this view. */
1167 static bool file_is_noatime(const struct file *file)
1169 const struct vfsmount *mnt = file->f_path.mnt;
1170 const struct inode *inode = file_inode((struct file *)file);
1172 /* Adapted from file_accessed() and touch_atime().*/
1173 if (file->f_flags & O_NOATIME)
1176 if (inode->i_flags & S_NOATIME)
1179 if (IS_NOATIME(inode))
1182 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1185 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1188 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1194 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialise a cl_io for a read or write on @file: iocb/iter setup, lock
 * policy (never / mandatory-for-append / maybe), noatime, parallel-IO
 * eligibility, FLR non-delay and mirror selection. */
1196 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1198 struct inode *inode = file_inode(file);
1199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1201 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1202 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1203 io->u.ci_rw.rw_file = file;
1204 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1205 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1206 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1208 if (iot == CIT_WRITE) {
1209 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* NOTE(review): the third sync condition is elided from this view. */
1210 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1211 file->f_flags & O_DIRECT ||
1214 io->ci_obj = ll_i2info(inode)->lli_clob;
1215 io->ci_lockreq = CILR_MAYBE;
1216 if (ll_file_nolock(file)) {
1217 io->ci_lockreq = CILR_NEVER;
1218 io->ci_no_srvlock = 1;
1219 } else if (file->f_flags & O_APPEND) {
/* Appends need a mandatory lock to keep EOF consistent. */
1220 io->ci_lockreq = CILR_MANDATORY;
1222 io->ci_noatime = file_is_noatime(file);
1223 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1224 io->ci_pio = !io->u.ci_rw.rw_append;
1228 /* FLR: only use non-delay I/O for read as there is only one
1229 * avaliable mirror for write. */
1230 io->ci_ndelay = !(iot == CIT_WRITE);
1232 ll_io_set_mirror(io, file);
/*
 * Run one parallel-I/O task: perform the sub-range described by
 * @ptask->pt_cbdata (a struct cl_io_pt) through its own cl_io.
 * Partial progress is accumulated in pt->cip_result so a restarted
 * task resumes where the previous attempt stopped.
 */
1235 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1237 struct cl_io_pt *pt = ptask->pt_cbdata;
1238 struct file *file = pt->cip_file;
1241 loff_t pos = pt->cip_pos;
1246 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1247 file_dentry(file)->d_name.name,
1248 pt->cip_iot == CIT_READ ? "read" : "write",
1249 pos, pos + pt->cip_count);
1251 env = cl_env_get(&refcheck);
1253 RETURN(PTR_ERR(env));
1255 io = vvp_env_thread_io(env);
1256 ll_io_init(io, file, pt->cip_iot);
/* reuse the iterator/kiocb snapshots taken when the task was split */
1257 io->u.ci_rw.rw_iter = pt->cip_iter;
1258 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1259 io->ci_pio = 0; /* It's already in parallel task */
1261 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1262 pt->cip_count - pt->cip_result);
1264 struct vvp_io *vio = vvp_env_io(env);
1266 vio->vui_io_subtype = IO_NORMAL;
1267 vio->vui_fd = LUSTRE_FPRIVATE(file);
1269 ll_cl_add(file, env, io, LCC_RW);
1270 rc = cl_io_loop(env, io);
1271 ll_cl_remove(file, env);
1273 /* cl_io_rw_init() handled IO */
/* fault-injection hook for testing partial-task failure handling */
1277 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* record bytes done and advance the saved iterator/kiocb state */
1283 if (io->ci_nob > 0) {
1284 pt->cip_result += io->ci_nob;
1285 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1287 pt->cip_iocb.ki_pos = pos;
1288 #ifdef HAVE_KIOCB_KI_LEFT
1289 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1290 #elif defined(HAVE_KI_NBYTES)
1291 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1295 cl_io_fini(env, io);
1296 cl_env_put(env, &refcheck);
1298 pt->cip_need_restart = io->ci_need_restart;
1300 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1301 file_dentry(file)->d_name.name,
1302 pt->cip_iot == CIT_READ ? "read" : "write",
1303 pt->cip_result, rc);
/* report success if any bytes were transferred, else the error */
1305 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for reads and writes (normal and splice subtypes):
 * builds a cl_io, takes the range lock where required, runs cl_io_loop()
 * and retries the whole I/O while the layout changes underneath
 * (io->ci_need_restart), accumulating the partial result across restarts.
 * Returns bytes transferred, or a negative errno if nothing was moved.
 */
1309 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1310 struct file *file, enum cl_io_type iot,
1311 loff_t *ppos, size_t count)
1313 struct range_lock range;
1314 struct vvp_io *vio = vvp_env_io(env);
1315 struct inode *inode = file_inode(file);
1316 struct ll_inode_info *lli = ll_i2info(inode);
1317 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1322 unsigned retried = 0;
1323 bool restarted = false;
1327 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1328 file_dentry(file)->d_name.name,
1329 iot == CIT_READ ? "read" : "write", pos, pos + count);
1332 io = vvp_env_thread_io(env);
1333 ll_io_init(io, file, iot);
1334 if (args->via_io_subtype == IO_NORMAL) {
1335 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1336 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1338 if (args->via_io_subtype != IO_NORMAL || restarted)
/* carry the FLR mirror-try count over to the restarted io */
1340 io->ci_ndelay_tried = retried;
1342 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1343 bool range_locked = false;
/* appends must lock to EOF since the final size is unknown */
1345 if (file->f_flags & O_APPEND)
1346 range_lock_init(&range, 0, LUSTRE_EOF);
1348 range_lock_init(&range, pos, pos + count - 1);
1350 vio->vui_fd = LUSTRE_FPRIVATE(file);
1351 vio->vui_io_subtype = args->via_io_subtype;
1353 switch (vio->vui_io_subtype) {
1355 /* Direct IO reads must also take range lock,
1356 * or multiple reads will try to work on the same pages
1357 * See LU-6227 for details. */
1358 if (((iot == CIT_WRITE) ||
1359 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1360 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1361 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1363 rc = range_lock(&lli->lli_write_tree, &range);
1367 range_locked = true;
1371 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1372 vio->u.splice.vui_flags = args->u.splice.via_flags;
1375 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1379 ll_cl_add(file, env, io, LCC_RW);
/* pio writes take the inode lock here instead of inside cl_io */
1380 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1381 !lli->lli_inode_locked) {
1383 lli->lli_inode_locked = 1;
1385 rc = cl_io_loop(env, io);
1386 if (lli->lli_inode_locked) {
1387 lli->lli_inode_locked = 0;
1388 inode_unlock(inode);
1390 ll_cl_remove(file, env);
1393 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1395 range_unlock(&lli->lli_write_tree, &range);
1398 /* cl_io_rw_init() handled IO */
/* fold this pass's progress into the running totals */
1402 if (io->ci_nob > 0) {
1403 result += io->ci_nob;
1404 count -= io->ci_nob;
1406 if (args->via_io_subtype == IO_NORMAL) {
1407 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1409 args->u.normal.via_iocb->ki_pos = pos;
1410 #ifdef HAVE_KIOCB_KI_LEFT
1411 args->u.normal.via_iocb->ki_left = count;
1412 #elif defined(HAVE_KI_NBYTES)
1413 args->u.normal.via_iocb->ki_nbytes = count;
1417 pos = io->u.ci_rw.rw_range.cir_pos;
1421 cl_io_fini(env, io);
1424 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1425 file->f_path.dentry->d_name.name,
1426 iot, rc, result, io->ci_need_restart);
/* restart the remainder if the layout changed mid-IO */
1428 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1430 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1431 file_dentry(file)->d_name.name,
1432 iot == CIT_READ ? "read" : "write",
1433 pos, pos + count, result, rc);
1434 /* preserve the tried count for FLR */
1435 retried = io->ci_ndelay_tried;
/* account stats and track write-failure state for fsync semantics */
1440 if (iot == CIT_READ) {
1442 ll_stats_ops_tally(ll_i2sbi(inode),
1443 LPROC_LL_READ_BYTES, result);
1444 } else if (iot == CIT_WRITE) {
1446 ll_stats_ops_tally(ll_i2sbi(inode),
1447 LPROC_LL_WRITE_BYTES, result);
1448 fd->fd_write_failed = false;
1449 } else if (result == 0 && rc == 0) {
1452 fd->fd_write_failed = true;
1454 fd->fd_write_failed = false;
1455 } else if (rc != -ERESTARTSYS) {
1456 fd->fd_write_failed = true;
1460 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1461 file_dentry(file)->d_name.name,
1462 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1466 RETURN(result > 0 ? result : rc);
1470 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1471 * especially for small I/O.
1473 * To serve a read request, CLIO has to create and initialize a cl_io and
1474 * then request DLM lock. This has turned out to have significant overhead
1475 * and affects the performance of small I/O dramatically.
1477 * It's not necessary to create a cl_io for each I/O. Under the help of read
1478 * ahead, most of the pages being read are already in memory cache and we can
1479 * read those pages directly because if the pages exist, the corresponding DLM
1480 * lock must exist so that page content must be valid.
1482 * In fast read implementation, the llite speculatively finds and reads pages
1483 * in memory cache. There are three scenarios for fast read:
1484 * - If the page exists and is uptodate, kernel VM will provide the data and
1485 * CLIO won't be intervened;
1486 * - If the page was brought into memory by read ahead, it will be exported
1487 * and read ahead parameters will be updated;
1488 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1489 * it will go back and invoke normal read, i.e., a cl_io will be created
1490 * and DLM lock will be requested.
1492 * POSIX compliance: posix standard states that read is intended to be atomic.
1493 * Lustre read implementation is in line with Linux kernel read implementation
1494 * and neither of them complies with POSIX standard in this matter. Fast read
1495 * doesn't make the situation worse on single node but it may interleave write
1496 * results from multiple nodes due to short read handling in ll_file_aio_read().
1498 * \param env - lu_env
1499 * \param iocb - kiocb from kernel
1500 * \param iter - user space buffers where the data will be copied
1502 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Fast-read path: serve the read straight from the page cache via
 * generic_file_read_iter() without creating a cl_io, falling back to
 * the normal path (caller handles -ENODATA) when pages are not cached.
 */
1505 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1509 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1512 /* NB: we can't do direct IO for fast read because it will need a lock
1513 * to make IO engine happy. */
1514 if (iocb->ki_filp->f_flags & O_DIRECT)
1517 result = generic_file_read_iter(iocb, iter);
1519 /* If the first page is not in cache, generic_file_aio_read() will
1520 * return -ENODATA.
1521 * See corresponding code in ll_readpage(). */
1522 if (result == -ENODATA)
1526 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1527 LPROC_LL_READ_BYTES, result);
1533 * Read from a file (through the page cache).  Tries the fast-read
 * path first and falls back to the generic cl_io path for whatever
 * bytes remain in @to.
1535 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1538 struct vvp_io_args *args;
1543 result = ll_do_fast_read(iocb, to);
/* fast read failed hard, or consumed the whole iterator: done */
1544 if (result < 0 || iov_iter_count(to) == 0)
1547 env = cl_env_get(&refcheck);
1549 return PTR_ERR(env);
1551 args = ll_env_args(env, IO_NORMAL);
1552 args->u.normal.via_iter = to;
1553 args->u.normal.via_iocb = iocb;
1555 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1556 &iocb->ki_pos, iov_iter_count(to));
1559 else if (result == 0)
1562 cl_env_put(env, &refcheck);
1568 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1569 * If a page is already in the page cache and dirty (and some other things -
1570 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1571 * write to it without doing a full I/O, because Lustre already knows about it
1572 * and will write it out. This saves a lot of processing time.
1574 * All writes here are within one page, so exclusion is handled by the page
1575 * lock on the vm page. Exception is appending, which requires locking the
1576 * full file to handle size issues. We do not do tiny writes for writes which
1577 * touch multiple pages because it's very unlikely multiple sequential pages
1578 * are already dirty.
1580 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1581 * and are unlikely to be to already dirty pages.
1583 * Attribute updates are important here, we do it in ll_tiny_write_end.
/*
 * Tiny-write fast path: write sub-page data directly into an already
 * dirty cached page (see the design comment above).  Returns -ENODATA
 * to request fallback to the normal write path.
 */
1585 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1587 ssize_t count = iov_iter_count(iter);
1588 struct file *file = iocb->ki_filp;
1589 struct inode *inode = file_inode(file);
1590 struct ll_inode_info *lli = ll_i2info(inode);
1591 struct range_lock range;
1593 bool append = false;
1597 /* NB: we can't do direct IO for tiny writes because they use the page
1598 * cache, and we can't do sync writes because tiny writes can't flush
1601 if (file->f_flags & (O_DIRECT | O_SYNC))
1604 /* It is relatively unlikely we will overwrite a full dirty page, so
1605 * limit tiny writes to < PAGE_SIZE
1607 if (count >= PAGE_SIZE)
1610 /* For append writes, we must take the range lock to protect size
1611 * and also move pos to current size before writing.
1613 if (file->f_flags & O_APPEND) {
1618 range_lock_init(&range, 0, LUSTRE_EOF);
1619 result = range_lock(&lli->lli_write_tree, &range);
/* refresh the size from the servers before positioning at EOF */
1622 env = cl_env_get(&refcheck);
1624 GOTO(out, result = PTR_ERR(env));
1625 ll_merge_attr(env, inode);
1626 cl_env_put(env, &refcheck);
1627 iocb->ki_pos = i_size_read(inode);
1630 /* Does this write touch multiple pages?
1632 * This partly duplicates the PAGE_SIZE check above, but must come
1633 * after range locking for append writes because it depends on the
1634 * write position (ki_pos).
1636 if ((iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1639 result = __generic_file_write_iter(iocb, iter)
1641 /* If the page is not already dirty, ll_tiny_write_begin returns
1642 * -ENODATA. We continue on to normal write.
1644 if (result == -ENODATA)
/* success: account bytes and mark data modified for HSM */
1648 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1650 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1655 range_unlock(&lli->lli_write_tree, &range);
1657 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1663 * Write to a file (through the page cache).  Tries the tiny-write
 * fast path first, then runs the generic cl_io write for the remaining
 * bytes, combining the two results.
1665 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1667 struct vvp_io_args *args;
1669 ssize_t rc_tiny, rc_normal;
1674 rc_tiny = ll_do_tiny_write(iocb, from);
1676 /* In case of error, go on and try normal write - Only stop if tiny
1677 * write completed I/O.
1679 if (iov_iter_count(from) == 0)
1680 GOTO(out, rc_normal = rc_tiny);
1682 env = cl_env_get(&refcheck);
1684 return PTR_ERR(env);
1686 args = ll_env_args(env, IO_NORMAL);
1687 args->u.normal.via_iter = from;
1688 args->u.normal.via_iocb = iocb;
1690 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1691 &iocb->ki_pos, iov_iter_count(from));
1693 /* On success, combine bytes written. */
1694 if (rc_tiny >= 0 && rc_normal > 0)
1695 rc_normal += rc_tiny;
1696 /* On error, only return error from normal write if tiny write did not
1697 * write any bytes. Otherwise return bytes written by tiny write.
1699 else if (rc_tiny > 0)
1700 rc_normal = rc_tiny;
1702 cl_env_put(env, &refcheck);
1707 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1709 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, shrinking
 * *nr_segs at the first inaccessible segment (copied from the kernel's
 * __generic_file_aio_write_nolock, as noted above).
 */
1711 static int ll_file_get_iov_count(const struct iovec *iov,
1712 unsigned long *nr_segs, size_t *count)
1717 for (seg = 0; seg < *nr_segs; seg++) {
1718 const struct iovec *iv = &iov[seg];
1721 * If any segment has a negative length, or the cumulative
1722 * length ever wraps negative then return -EINVAL.
1725 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* accessible segment: keep accumulating (the skip to the truncation
 * path below happens only when access_ok() fails) */
1727 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1732 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry for kernels without read_iter: wrap the iovec
 * array in an iov_iter and delegate to ll_file_read_iter().
 */
1739 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1740 unsigned long nr_segs, loff_t pos)
1747 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1751 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1752 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1753 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1754 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1755 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1757 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read entry for kernels without read_iter: build a
 * sync kiocb + single iovec around the user buffer and delegate to
 * ll_file_aio_read(), then propagate the updated file position.
 */
1762 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1765 struct iovec iov = { .iov_base = buf, .iov_len = count };
1770 init_sync_kiocb(&kiocb, file);
1771 kiocb.ki_pos = *ppos;
1772 #ifdef HAVE_KIOCB_KI_LEFT
1773 kiocb.ki_left = count;
1774 #elif defined(HAVE_KI_NBYTES)
/* fixed: the struct kiocb field is ki_nbytes (as in ll_file_write
 * below); "i_nbytes" does not exist and would fail to compile when
 * HAVE_KI_NBYTES is defined */
1775 kiocb.ki_nbytes = count;
1778 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1779 *ppos = kiocb.ki_pos;
1785 * Write to a file (through the page cache).
 * Legacy aio_write entry for kernels without write_iter: wrap the iovec
 * array in an iov_iter and delegate to ll_file_write_iter().
1788 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1789 unsigned long nr_segs, loff_t pos)
1791 struct iov_iter from;
1796 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1800 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1801 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1802 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1803 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1804 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1806 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry for kernels without write_iter: build
 * a sync kiocb + single iovec and delegate to ll_file_aio_write(), then
 * propagate the updated file position.
 */
1811 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1812 size_t count, loff_t *ppos)
1814 struct iovec iov = { .iov_base = (void __user *)buf,
1821 init_sync_kiocb(&kiocb, file);
1822 kiocb.ki_pos = *ppos;
1823 #ifdef HAVE_KIOCB_KI_LEFT
1824 kiocb.ki_left = count;
1825 #elif defined(HAVE_KI_NBYTES)
1826 kiocb.ki_nbytes = count;
1829 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1830 *ppos = kiocb.ki_pos;
1834 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1837 * Send file content (through pagecache) somewhere with helper
 * (splice_read file operation): run a CIT_READ through the generic
 * engine using the IO_SPLICE subtype so pages go to @pipe.
1839 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1840 struct pipe_inode_info *pipe, size_t count,
1844 struct vvp_io_args *args;
1849 env = cl_env_get(&refcheck);
1851 RETURN(PTR_ERR(env));
1853 args = ll_env_args(env, IO_SPLICE);
1854 args->u.splice.via_pipe = pipe;
1855 args->u.splice.via_flags = flags;
1857 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1858 cl_env_put(env, &refcheck);
/*
 * Set the striping EA on @inode by re-opening it by FID with the given
 * lov_user_md, under the inode size lock; the transient open handle is
 * released before returning.
 */
1862 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1863 __u64 flags, struct lov_user_md *lum, int lum_size)
1865 struct lookup_intent oit = {
1867 .it_flags = flags | MDS_OPEN_BY_FID,
1872 ll_inode_size_lock(inode);
1873 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1875 GOTO(out_unlock, rc);
/* the open was only needed to carry the setstripe EA; close it */
1877 ll_release_openhandle(dentry, &oit);
1880 ll_inode_size_unlock(inode);
1881 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping layout) of @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer of
 * *request, which the caller must release; the layout is byte-swapped
 * to host endianness when needed so it can be handed to userspace.
 */
1886 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1887 struct lov_mds_md **lmmp, int *lmm_size,
1888 struct ptlrpc_request **request)
1890 struct ll_sb_info *sbi = ll_i2sbi(inode);
1891 struct mdt_body *body;
1892 struct lov_mds_md *lmm = NULL;
1893 struct ptlrpc_request *req = NULL;
1894 struct md_op_data *op_data;
1897 rc = ll_get_default_mdsize(sbi, &lmmsize);
1901 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1902 strlen(filename), lmmsize,
1903 LUSTRE_OPC_ANY, NULL);
1904 if (IS_ERR(op_data))
1905 RETURN(PTR_ERR(op_data));
1907 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1908 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1909 ll_finish_md_op_data(op_data);
1911 CDEBUG(D_INFO, "md_getattr_name failed "
1912 "on %s: rc %d\n", filename, rc);
1916 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1917 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1919 lmmsize = body->mbo_eadatasize;
/* no EA present: nothing to return */
1921 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1923 GOTO(out, rc = -ENODATA);
1926 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1927 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite layouts are understood here */
1929 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1930 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1931 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1932 GOTO(out, rc = -EPROTO);
1935 * This is coming from the MDS, so is probably in
1936 * little endian. We convert it to host endian before
1937 * passing it to userspace.
/* swab only on big-endian hosts, where LOV_MAGIC differs from its
 * little-endian wire form */
1939 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1942 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1943 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1944 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1945 if (le32_to_cpu(lmm->lmm_pattern) &
1946 LOV_PATTERN_F_RELEASED)
1950 /* if function called for directory - we should
1951 * avoid swab not existent lsm objects */
1952 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1953 lustre_swab_lov_user_md_v1(
1954 (struct lov_user_md_v1 *)lmm);
1955 if (S_ISREG(body->mbo_mode))
1956 lustre_swab_lov_user_md_objects(
1957 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1959 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1960 lustre_swab_lov_user_md_v3(
1961 (struct lov_user_md_v3 *)lmm);
1962 if (S_ISREG(body->mbo_mode))
1963 lustre_swab_lov_user_md_objects(
1964 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1966 } else if (lmm->lmm_magic ==
1967 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1968 lustre_swab_lov_comp_md_v1(
1969 (struct lov_comp_md_v1 *)lmm);
1975 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Restricted to CAP_SYS_ADMIN.
 */
1980 static int ll_lov_setea(struct inode *inode, struct file *file,
1983 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1984 struct lov_user_md *lump;
1985 int lum_size = sizeof(struct lov_user_md) +
1986 sizeof(struct lov_user_ost_data);
1990 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1993 OBD_ALLOC_LARGE(lump, lum_size);
1997 if (copy_from_user(lump, arg, lum_size))
1998 GOTO(out_lump, rc = -EFAULT);
2000 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the delay-create flag now that the layout is instantiated */
2002 cl_lov_delay_create_clear(&file->f_flags);
2005 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping information into the userspace buffer @lum
 * (up to @size bytes) via the cl_object layer.
 */
2009 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2016 env = cl_env_get(&refcheck);
2018 RETURN(PTR_ERR(env));
2020 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2021 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, refresh
 * the layout generation, and echo the resulting stripe info back to the
 * user buffer.
 */
2025 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2028 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2029 struct lov_user_md *klum;
2031 __u64 flags = FMODE_WRITE;
2034 rc = ll_copy_user_md(lum, &klum);
2039 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero the user's stripe_count so the getstripe below refills it */
2044 rc = put_user(0, &lum->lmm_stripe_count);
2048 rc = ll_layout_refresh(inode, &gen);
2052 rc = ll_file_getstripe(inode, arg, lum_size);
2054 cl_lov_delay_create_clear(&file->f_flags);
2057 OBD_FREE(klum, lum_size);
/*
 * Take a Lustre group lock with group id @arg on behalf of @file.
 * Only one group lock per file descriptor is allowed; the lli spinlock
 * guards the LL_FILE_GROUP_LOCKED flag against racing takers.
 */
2062 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2064 struct ll_inode_info *lli = ll_i2info(inode);
2065 struct cl_object *obj = lli->lli_clob;
2066 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2067 struct ll_grouplock grouplock;
2072 CWARN("group id for group lock must not be 0\n");
2076 if (ll_file_nolock(file))
2077 RETURN(-EOPNOTSUPP);
2079 spin_lock(&lli->lli_lock);
2080 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2081 CWARN("group lock already existed with gid %lu\n",
2082 fd->fd_grouplock.lg_gid);
2083 spin_unlock(&lli->lli_lock);
2086 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2087 spin_unlock(&lli->lli_lock);
2090 * XXX: group lock needs to protect all OST objects while PFL
2091 * can add new OST objects during the IO, so we'd instantiate
2092 * all OST objects before getting its group lock.
2097 struct cl_layout cl = {
2098 .cl_is_composite = false,
2100 struct lu_extent ext = {
2102 .e_end = OBD_OBJECT_EOF,
2105 env = cl_env_get(&refcheck);
2107 RETURN(PTR_ERR(env));
2109 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate the whole file first */
2110 if (!rc && cl.cl_is_composite)
2111 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2114 cl_env_put(env, &refcheck);
2119 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2120 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* recheck under the spinlock: another thread may have won */
2124 spin_lock(&lli->lli_lock);
2125 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2126 spin_unlock(&lli->lli_lock);
2127 CERROR("another thread just won the race\n");
2128 cl_put_grouplock(&grouplock);
2132 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2133 fd->fd_grouplock = grouplock;
2134 spin_unlock(&lli->lli_lock);
2136 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with id @arg held by @file, verifying under
 * the lli spinlock that a matching lock is actually held.
 */
2140 static int ll_put_grouplock(struct inode *inode, struct file *file,
2143 struct ll_inode_info *lli = ll_i2info(inode);
2144 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2145 struct ll_grouplock grouplock;
2148 spin_lock(&lli->lli_lock);
2149 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2150 spin_unlock(&lli->lli_lock);
2151 CWARN("no group lock held\n");
2155 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2157 if (fd->fd_grouplock.lg_gid != arg) {
2158 CWARN("group lock %lu doesn't match current id %lu\n",
2159 arg, fd->fd_grouplock.lg_gid);
2160 spin_unlock(&lli->lli_lock);
/* detach from the fd under the spinlock, drop the lock outside it */
2164 grouplock = fd->fd_grouplock;
2165 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2166 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2167 spin_unlock(&lli->lli_lock);
2169 cl_put_grouplock(&grouplock);
2170 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2175 * Close inode open handle
2177 * \param dentry [in] dentry which contains the inode
2178 * \param it [in,out] intent which contains open info and result
2181 * \retval <0 failure
2183 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2185 struct inode *inode = dentry->d_inode;
2186 struct obd_client_handle *och;
2192 /* Root ? Do nothing. */
2193 if (dentry->d_inode->i_sb->s_root == dentry)
2196 /* No open handle to close? Move away */
2197 if (!it_disposition(it, DISP_OPEN_OPEN))
2200 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2202 OBD_ALLOC(och, sizeof(*och));
2204 GOTO(out, rc = -ENOMEM);
/* fill the handle from the intent, then close it on the MDT */
2206 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2208 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2210 /* this one is in place of ll_file_open */
2211 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2212 ptlrpc_req_finished(it->it_request);
2213 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2219 * Get size for inode for which FIEMAP mapping is requested.
2220 * Make the FIEMAP get_info call and returns the result.
2221 * \param fiemap kernel buffer to hold extents
2222 * \param num_bytes kernel buffer size
2224 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2230 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2233 /* Checks for fiemap flags */
/* reject flags we don't understand, reporting back the unsupported set */
2234 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2235 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2239 /* Check for FIEMAP_FLAG_SYNC */
2240 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2241 rc = filemap_fdatawrite(inode->i_mapping);
2246 env = cl_env_get(&refcheck);
2248 RETURN(PTR_ERR(env));
/* glimpse to learn the real size when we have no cached one */
2250 if (i_size_read(inode) == 0) {
2251 rc = ll_glimpse_size(inode);
2256 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2257 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2258 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2260 /* If filesize is 0, then there would be no objects for mapping */
2261 if (fmkey.lfik_oa.o_size == 0) {
2262 fiemap->fm_mapped_extents = 0;
2266 fmkey.lfik_fiemap = *fiemap;
2268 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2269 &fmkey, fiemap, &num_bytes);
2271 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path on the MDT.  The
 * user-supplied getinfo_fid2path is copied in, the mount's root FID is
 * appended for fileset-aware servers, and the result is copied back.
 */
2275 int ll_fid2path(struct inode *inode, void __user *arg)
2277 struct obd_export *exp = ll_i2mdexp(inode);
2278 const struct getinfo_fid2path __user *gfin = arg;
2280 struct getinfo_fid2path *gfout;
/* permission: root-ish capability, or admin enabled user fid2path */
2286 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2287 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2290 /* Only need to get the buflen */
2291 if (get_user(pathlen, &gfin->gf_pathlen))
2294 if (pathlen > PATH_MAX)
2297 outsize = sizeof(*gfout) + pathlen;
2298 OBD_ALLOC(gfout, outsize);
2302 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2303 GOTO(gf_free, rc = -EFAULT);
2304 /* append root FID after gfout to let MDT know the root FID so that it
2305 * can lookup the correct path, this is mainly for fileset.
2306 * old server without fileset mount support will ignore this. */
2307 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2309 /* Call mdc_iocontrol */
2310 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2314 if (copy_to_user(arg, gfout, outsize))
2318 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to obtain the inode's data version and
 * layout version, storing them in @ioc.  Retries while the io reports
 * ci_need_restart (layout change during the query).
 */
2323 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2325 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2333 ioc->idv_version = 0;
2334 ioc->idv_layout_version = UINT_MAX;
2336 /* If no file object initialized, we consider its version is 0. */
2340 env = cl_env_get(&refcheck);
2342 RETURN(PTR_ERR(env));
2344 io = vvp_env_thread_io(env);
2346 io->u.ci_data_version.dv_data_version = 0;
2347 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2348 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2351 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2352 result = cl_io_loop(env, io);
2354 result = io->ci_result;
2356 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2357 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2359 cl_io_fini(env, io);
/* layout changed while querying: redo the whole io */
2361 if (unlikely(io->ci_need_restart))
2364 cl_env_put(env, &refcheck);
2370 * Read the data_version for inode.
2372 * This value is computed using stripe object version on OST.
2373 * Version is computed using server side locking.
2375 * @param flags if do sync on the OST side;
2377 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2378 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2380 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
/* thin wrapper over ll_ioc_data_version() returning only the version */
2382 struct ioc_data_version ioc = { .idv_flags = flags };
2385 rc = ll_ioc_data_version(inode, &ioc);
2387 *data_version = ioc.idv_version;
2393 * Trigger a HSM release request for the provided inode.
 * Takes a write lease, flushes dirty data to pin the data_version,
 * merges attributes, then closes the handle with MDS_HSM_RELEASE so
 * the MDT can punch the objects.
2395 int ll_hsm_release(struct inode *inode)
2398 struct obd_client_handle *och = NULL;
2399 __u64 data_version = 0;
2404 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2405 ll_get_fsname(inode->i_sb, NULL, 0),
2406 PFID(&ll_i2info(inode)->lli_fid));
2408 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2410 GOTO(out, rc = PTR_ERR(och));
2412 /* Grab latest data_version and [am]time values */
2413 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2417 env = cl_env_get(&refcheck);
2419 GOTO(out, rc = PTR_ERR(env));
2421 rc = ll_merge_attr(env, inode);
2422 cl_env_put(env, &refcheck);
2424 /* If error happen, we have the wrong size for a file.
2430 /* Release the file.
2431 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2432 * we still need it to pack l_remote_handle to MDT. */
2433 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2439 if (och != NULL && !IS_ERR(och)) /* close the file */
2440 ll_lease_close(och, inode, NULL);
/* State used by ll_swap_layouts(); inodes may be swapped so they are
 * processed in a canonical (FID) order. */
2445 struct ll_swap_stack {
2448 struct inode *inode1;
2449 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * two files on the MDT, optionally flushing caches via a temporary
 * group lock and verifying data versions did not change beforehand.
 */
2454 static int ll_swap_layouts(struct file *file1, struct file *file2,
2455 struct lustre_swap_layouts *lsl)
2457 struct mdc_swap_layouts msl;
2458 struct md_op_data *op_data;
2461 struct ll_swap_stack *llss = NULL;
2464 OBD_ALLOC_PTR(llss);
2468 llss->inode1 = file_inode(file1);
2469 llss->inode2 = file_inode(file2);
2471 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2475 /* we use 2 bool because it is easier to swap than 2 bits */
2476 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2477 llss->check_dv1 = true;
2479 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2480 llss->check_dv2 = true;
2482 /* we cannot use lsl->sl_dvX directly because we may swap them */
2483 llss->dv1 = lsl->sl_dv1;
2484 llss->dv2 = lsl->sl_dv2;
2486 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2487 if (rc == 0) /* same file, done! */
/* order by FID to avoid lock-ordering deadlocks between two callers */
2490 if (rc < 0) { /* sequentialize it */
2491 swap(llss->inode1, llss->inode2);
2493 swap(llss->dv1, llss->dv2);
2494 swap(llss->check_dv1, llss->check_dv2);
2498 if (gid != 0) { /* application asks to flush dirty cache */
2499 rc = ll_get_grouplock(llss->inode1, file1, gid);
2503 rc = ll_get_grouplock(llss->inode2, file2, gid);
2505 ll_put_grouplock(llss->inode1, file1, gid);
2510 /* ultimate check, before swapping the layouts we check if
2511 * dataversion has changed (if requested) */
2512 if (llss->check_dv1) {
2513 rc = ll_data_version(llss->inode1, &dv, 0);
2516 if (dv != llss->dv1)
2517 GOTO(putgl, rc = -EAGAIN);
2520 if (llss->check_dv2) {
2521 rc = ll_data_version(llss->inode2, &dv, 0);
2524 if (dv != llss->dv2)
2525 GOTO(putgl, rc = -EAGAIN);
2528 /* struct md_op_data is used to send the swap args to the mdt
2529 * only flags is missing, so we use struct mdc_swap_layouts
2530 * through the md_op_data->op_data */
2531 /* flags from user space have to be converted before they are send to
2532 * server, no flag is sent today, they are only used on the client */
2535 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2536 0, LUSTRE_OPC_ANY, &msl);
2537 if (IS_ERR(op_data))
2538 GOTO(free, rc = PTR_ERR(op_data));
2540 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2541 sizeof(*op_data), op_data, NULL);
2542 ll_finish_md_op_data(op_data);
/* drop the temporary group locks in reverse acquisition order */
2549 ll_put_grouplock(llss->inode2, file2, gid);
2550 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode via the MDT, after validating
 * the masks and archive id and enforcing that only root may touch
 * flags outside HSM_USER_MASK.
 */
2560 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2562 struct md_op_data *op_data;
2566 /* Detect out-of range masks */
2567 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2570 /* Non-root users are forbidden to set or clear flags which are
2571 * NOT defined in HSM_USER_MASK. */
2572 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2573 !cfs_capable(CFS_CAP_SYS_ADMIN))
2576 /* Detect out-of range archive id */
2577 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2578 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2581 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2582 LUSTRE_OPC_ANY, hss);
2583 if (IS_ERR(op_data))
2584 RETURN(PTR_ERR(op_data));
2586 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2587 sizeof(*op_data), op_data, NULL);
2589 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a freshly created stub file as archived+released on
 * the MDT, then apply the archived copy's attributes (mode, ownership,
 * size, times) so the stub reflects the original file.
 */
2594 static int ll_hsm_import(struct inode *inode, struct file *file,
2595 struct hsm_user_import *hui)
2597 struct hsm_state_set *hss = NULL;
2598 struct iattr *attr = NULL;
2602 if (!S_ISREG(inode->i_mode))
2608 GOTO(out, rc = -ENOMEM);
/* set HSM state: archived copy exists, file data is released */
2610 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2611 hss->hss_archive_id = hui->hui_archive_id;
2612 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2613 rc = ll_hsm_state_set(inode, hss);
2617 OBD_ALLOC_PTR(attr);
2619 GOTO(out, rc = -ENOMEM);
2621 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2622 attr->ia_mode |= S_IFREG;
2623 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2624 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2625 attr->ia_size = hui->hui_size;
2626 attr->ia_mtime.tv_sec = hui->hui_mtime;
2627 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2628 attr->ia_atime.tv_sec = hui->hui_atime;
2629 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2631 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2632 ATTR_UID | ATTR_GID |
2633 ATTR_MTIME | ATTR_MTIME_SET |
2634 ATTR_ATIME | ATTR_ATIME_SET;
2638 rc = ll_setattr_raw(file_dentry(file), attr, true);
2642 inode_unlock(inode);
2654 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2656 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2657 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2660 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2662 struct inode *inode = file_inode(file);
2664 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2665 ATTR_MTIME | ATTR_MTIME_SET |
2666 ATTR_CTIME | ATTR_CTIME_SET,
2668 .tv_sec = lfu->lfu_atime_sec,
2669 .tv_nsec = lfu->lfu_atime_nsec,
2672 .tv_sec = lfu->lfu_mtime_sec,
2673 .tv_nsec = lfu->lfu_mtime_nsec,
2676 .tv_sec = lfu->lfu_ctime_sec,
2677 .tv_nsec = lfu->lfu_ctime_nsec,
2683 if (!capable(CAP_SYS_ADMIN))
2686 if (!S_ISREG(inode->i_mode))
2690 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2691 inode_unlock(inode);
2696 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2699 case MODE_READ_USER:
2701 case MODE_WRITE_USER:
2708 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2710 /* Used to allow the upper layers of the client to request an LDLM lock
2711 * without doing an actual read or write.
2713 * Used for ladvise lockahead to manually request specific locks.
2715 * \param[in] file file this ladvise lock request is on
2716 * \param[in] ladvise ladvise struct describing this lock request
2718 * \retval 0 success, no detailed result available (sync requests
2719 * and requests sent to the server [not handled locally]
2720 * cannot return detailed results)
2721 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2722 * see definitions for details.
2723 * \retval negative negative errno on error
2725 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2727 struct lu_env *env = NULL;
2728 struct cl_io *io = NULL;
2729 struct cl_lock *lock = NULL;
2730 struct cl_lock_descr *descr = NULL;
2731 struct dentry *dentry = file->f_path.dentry;
2732 struct inode *inode = dentry->d_inode;
2733 enum cl_lock_mode cl_mode;
2734 off_t start = ladvise->lla_start;
2735 off_t end = ladvise->lla_end;
2741 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2742 "start=%llu, end=%llu\n", dentry->d_name.len,
2743 dentry->d_name.name, dentry->d_inode,
2744 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2747 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2749 GOTO(out, result = cl_mode);
2751 /* Get IO environment */
2752 result = cl_io_get(inode, &env, &io, &refcheck);
2756 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2759 * nothing to do for this io. This currently happens when
2760 * stripe sub-object's are not yet created.
2762 result = io->ci_result;
2763 } else if (result == 0) {
2764 lock = vvp_env_lock(env);
2765 descr = &lock->cll_descr;
2767 descr->cld_obj = io->ci_obj;
2768 /* Convert byte offsets to pages */
2769 descr->cld_start = cl_index(io->ci_obj, start);
2770 descr->cld_end = cl_index(io->ci_obj, end);
2771 descr->cld_mode = cl_mode;
2772 /* CEF_MUST is used because we do not want to convert a
2773 * lockahead request to a lockless lock */
2774 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2777 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2778 descr->cld_enq_flags |= CEF_SPECULATIVE;
2780 result = cl_lock_request(env, io, lock);
2782 /* On success, we need to release the lock */
2784 cl_lock_release(env, lock);
2786 cl_io_fini(env, io);
2787 cl_env_put(env, &refcheck);
2789 /* -ECANCELED indicates a matching lock with a different extent
2790 * was already present, and -EEXIST indicates a matching lock
2791 * on exactly the same extent was already present.
2792 * We convert them to positive values for userspace to make
2793 * recognizing true errors easier.
2794 * Note we can only return these detailed results on async requests,
2795 * as sync requests look the same as i/o requests for locking. */
2796 if (result == -ECANCELED)
2797 result = LLA_RESULT_DIFFERENT;
2798 else if (result == -EEXIST)
2799 result = LLA_RESULT_SAME;
2804 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2806 static int ll_ladvise_sanity(struct inode *inode,
2807 struct llapi_lu_ladvise *ladvise)
2809 enum lu_ladvise_type advice = ladvise->lla_advice;
2810 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2811 * be in the first 32 bits of enum ladvise_flags */
2812 __u32 flags = ladvise->lla_peradvice_flags;
2813 /* 3 lines at 80 characters per line, should be plenty */
2816 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2818 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2819 "last supported advice is %s (value '%d'): rc = %d\n",
2820 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2821 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2825 /* Per-advice checks */
2827 case LU_LADVISE_LOCKNOEXPAND:
2828 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2830 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2832 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2833 ladvise_names[advice], rc);
2837 case LU_LADVISE_LOCKAHEAD:
2838 /* Currently only READ and WRITE modes can be requested */
2839 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2840 ladvise->lla_lockahead_mode == 0) {
2842 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2844 ll_get_fsname(inode->i_sb, NULL, 0),
2845 ladvise->lla_lockahead_mode,
2846 ladvise_names[advice], rc);
2849 case LU_LADVISE_WILLREAD:
2850 case LU_LADVISE_DONTNEED:
2852 /* Note fall through above - These checks apply to all advices
2853 * except LOCKNOEXPAND */
2854 if (flags & ~LF_DEFAULT_MASK) {
2856 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2858 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2859 ladvise_names[advice], rc);
2862 if (ladvise->lla_start >= ladvise->lla_end) {
2864 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2865 "for %s: rc = %d\n",
2866 ll_get_fsname(inode->i_sb, NULL, 0),
2867 ladvise->lla_start, ladvise->lla_end,
2868 ladvise_names[advice], rc);
2880 * Give file access advices
2882 * The ladvise interface is similar to Linux fadvise() system call, except it
2883 * forwards the advices directly from Lustre client to server. The server side
2884 * codes will apply appropriate read-ahead and caching techniques for the
2885 * corresponding files.
2887 * A typical workload for ladvise is e.g. a bunch of different clients are
2888 * doing small random reads of a file, so prefetching pages into OSS cache
2889 * with big linear reads before the random IO is a net benefit. Fetching
2890 * all that data into each client cache with fadvise() may not be, due to
2891 * much more data being sent to the client.
2893 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2894 struct llapi_lu_ladvise *ladvise)
2898 struct cl_ladvise_io *lio;
2903 env = cl_env_get(&refcheck);
2905 RETURN(PTR_ERR(env));
2907 io = vvp_env_thread_io(env);
2908 io->ci_obj = ll_i2info(inode)->lli_clob;
2910 /* initialize parameters for ladvise */
2911 lio = &io->u.ci_ladvise;
2912 lio->li_start = ladvise->lla_start;
2913 lio->li_end = ladvise->lla_end;
2914 lio->li_fid = ll_inode2fid(inode);
2915 lio->li_advice = ladvise->lla_advice;
2916 lio->li_flags = flags;
2918 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2919 rc = cl_io_loop(env, io);
2923 cl_io_fini(env, io);
2924 cl_env_put(env, &refcheck);
2928 static int ll_lock_noexpand(struct file *file, int flags)
2930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2932 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2937 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2940 struct fsxattr fsxattr;
2942 if (copy_from_user(&fsxattr,
2943 (const struct fsxattr __user *)arg,
2947 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2948 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2949 if (copy_to_user((struct fsxattr __user *)arg,
2950 &fsxattr, sizeof(fsxattr)))
2956 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2960 struct md_op_data *op_data;
2961 struct ptlrpc_request *req = NULL;
2963 struct fsxattr fsxattr;
2964 struct cl_object *obj;
2966 /* only root could change project ID */
2967 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2970 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2971 LUSTRE_OPC_ANY, NULL);
2972 if (IS_ERR(op_data))
2973 RETURN(PTR_ERR(op_data));
2975 if (copy_from_user(&fsxattr,
2976 (const struct fsxattr __user *)arg,
2978 GOTO(out_fsxattr1, rc = -EFAULT);
2980 op_data->op_attr_flags = fsxattr.fsx_xflags;
2981 op_data->op_projid = fsxattr.fsx_projid;
2982 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2983 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2985 ptlrpc_req_finished(req);
2987 obj = ll_i2info(inode)->lli_clob;
2991 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2992 OBD_ALLOC_PTR(attr);
2994 GOTO(out_fsxattr1, rc = -ENOMEM);
2995 attr->ia_valid = ATTR_ATTR_FLAG;
2996 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3001 ll_finish_md_op_data(op_data);
3005 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3008 struct inode *inode = file_inode(file);
3009 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3010 struct ll_inode_info *lli = ll_i2info(inode);
3011 struct obd_client_handle *och = NULL;
3012 struct split_param sp;
3015 enum mds_op_bias bias = 0;
3016 struct file *layout_file = NULL;
3018 size_t data_size = 0;
3022 mutex_lock(&lli->lli_och_mutex);
3023 if (fd->fd_lease_och != NULL) {
3024 och = fd->fd_lease_och;
3025 fd->fd_lease_och = NULL;
3027 mutex_unlock(&lli->lli_och_mutex);
3030 GOTO(out, rc = -ENOLCK);
3032 fmode = och->och_flags;
3034 switch (ioc->lil_flags) {
3035 case LL_LEASE_RESYNC_DONE:
3036 if (ioc->lil_count > IOC_IDS_MAX)
3037 GOTO(out, rc = -EINVAL);
3039 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3040 OBD_ALLOC(data, data_size);
3042 GOTO(out, rc = -ENOMEM);
3044 if (copy_from_user(data, (void __user *)arg, data_size))
3045 GOTO(out, rc = -EFAULT);
3047 bias = MDS_CLOSE_RESYNC_DONE;
3049 case LL_LEASE_LAYOUT_MERGE: {
3052 if (ioc->lil_count != 1)
3053 GOTO(out, rc = -EINVAL);
3055 arg += sizeof(*ioc);
3056 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3057 GOTO(out, rc = -EFAULT);
3059 layout_file = fget(fd);
3061 GOTO(out, rc = -EBADF);
3063 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3064 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3065 GOTO(out, rc = -EPERM);
3067 data = file_inode(layout_file);
3068 bias = MDS_CLOSE_LAYOUT_MERGE;
3071 case LL_LEASE_LAYOUT_SPLIT: {
3075 if (ioc->lil_count != 2)
3076 GOTO(out, rc = -EINVAL);
3078 arg += sizeof(*ioc);
3079 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3080 GOTO(out, rc = -EFAULT);
3082 arg += sizeof(__u32);
3083 if (copy_from_user(&mirror_id, (void __user *)arg,
3085 GOTO(out, rc = -EFAULT);
3087 layout_file = fget(fdv);
3089 GOTO(out, rc = -EBADF);
3091 sp.sp_inode = file_inode(layout_file);
3092 sp.sp_mirror_id = (__u16)mirror_id;
3094 bias = MDS_CLOSE_LAYOUT_SPLIT;
3098 /* without close intent */
3102 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3106 rc = ll_lease_och_release(inode, file);
3115 switch (ioc->lil_flags) {
3116 case LL_LEASE_RESYNC_DONE:
3118 OBD_FREE(data, data_size);
3120 case LL_LEASE_LAYOUT_MERGE:
3121 case LL_LEASE_LAYOUT_SPLIT:
3128 rc = ll_lease_type_from_fmode(fmode);
3132 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3135 struct inode *inode = file_inode(file);
3136 struct ll_inode_info *lli = ll_i2info(inode);
3137 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3138 struct obd_client_handle *och = NULL;
3139 __u64 open_flags = 0;
3145 switch (ioc->lil_mode) {
3146 case LL_LEASE_WRLCK:
3147 if (!(file->f_mode & FMODE_WRITE))
3149 fmode = FMODE_WRITE;
3151 case LL_LEASE_RDLCK:
3152 if (!(file->f_mode & FMODE_READ))
3156 case LL_LEASE_UNLCK:
3157 RETURN(ll_file_unlock_lease(file, ioc, arg));
3162 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3164 /* apply for lease */
3165 if (ioc->lil_flags & LL_LEASE_RESYNC)
3166 open_flags = MDS_OPEN_RESYNC;
3167 och = ll_lease_open(inode, file, fmode, open_flags);
3169 RETURN(PTR_ERR(och));
3171 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3172 rc = ll_lease_file_resync(och, inode);
3174 ll_lease_close(och, inode, NULL);
3177 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3179 ll_lease_close(och, inode, NULL);
3185 mutex_lock(&lli->lli_och_mutex);
3186 if (fd->fd_lease_och == NULL) {
3187 fd->fd_lease_och = och;
3190 mutex_unlock(&lli->lli_och_mutex);
3192 /* impossible now that only excl is supported for now */
3193 ll_lease_close(och, inode, &lease_broken);
3200 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3202 struct inode *inode = file_inode(file);
3203 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3207 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3208 PFID(ll_inode2fid(inode)), inode, cmd);
3209 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3211 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3212 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3216 case LL_IOC_GETFLAGS:
3217 /* Get the current value of the file flags */
3218 return put_user(fd->fd_flags, (int __user *)arg);
3219 case LL_IOC_SETFLAGS:
3220 case LL_IOC_CLRFLAGS:
3221 /* Set or clear specific file flags */
3222 /* XXX This probably needs checks to ensure the flags are
3223 * not abused, and to handle any flag side effects.
3225 if (get_user(flags, (int __user *) arg))
3228 if (cmd == LL_IOC_SETFLAGS) {
3229 if ((flags & LL_FILE_IGNORE_LOCK) &&
3230 !(file->f_flags & O_DIRECT)) {
3231 CERROR("%s: unable to disable locking on "
3232 "non-O_DIRECT file\n", current->comm);
3236 fd->fd_flags |= flags;
3238 fd->fd_flags &= ~flags;
3241 case LL_IOC_LOV_SETSTRIPE:
3242 case LL_IOC_LOV_SETSTRIPE_NEW:
3243 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3244 case LL_IOC_LOV_SETEA:
3245 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3246 case LL_IOC_LOV_SWAP_LAYOUTS: {
3248 struct lustre_swap_layouts lsl;
3250 if (copy_from_user(&lsl, (char __user *)arg,
3251 sizeof(struct lustre_swap_layouts)))
3254 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3257 file2 = fget(lsl.sl_fd);
3261 /* O_WRONLY or O_RDWR */
3262 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3263 GOTO(out, rc = -EPERM);
3265 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3266 struct inode *inode2;
3267 struct ll_inode_info *lli;
3268 struct obd_client_handle *och = NULL;
3270 lli = ll_i2info(inode);
3271 mutex_lock(&lli->lli_och_mutex);
3272 if (fd->fd_lease_och != NULL) {
3273 och = fd->fd_lease_och;
3274 fd->fd_lease_och = NULL;
3276 mutex_unlock(&lli->lli_och_mutex);
3278 GOTO(out, rc = -ENOLCK);
3279 inode2 = file_inode(file2);
3280 rc = ll_swap_layouts_close(och, inode, inode2);
3282 rc = ll_swap_layouts(file, file2, &lsl);
3288 case LL_IOC_LOV_GETSTRIPE:
3289 case LL_IOC_LOV_GETSTRIPE_NEW:
3290 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3291 case FSFILT_IOC_GETFLAGS:
3292 case FSFILT_IOC_SETFLAGS:
3293 RETURN(ll_iocontrol(inode, file, cmd, arg));
3294 case FSFILT_IOC_GETVERSION_OLD:
3295 case FSFILT_IOC_GETVERSION:
3296 RETURN(put_user(inode->i_generation, (int __user *)arg));
3297 case LL_IOC_GROUP_LOCK:
3298 RETURN(ll_get_grouplock(inode, file, arg));
3299 case LL_IOC_GROUP_UNLOCK:
3300 RETURN(ll_put_grouplock(inode, file, arg));
3301 case IOC_OBD_STATFS:
3302 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3304 /* We need to special case any other ioctls we want to handle,
3305 * to send them to the MDS/OST as appropriate and to properly
3306 * network encode the arg field.
3307 case FSFILT_IOC_SETVERSION_OLD:
3308 case FSFILT_IOC_SETVERSION:
3310 case LL_IOC_FLUSHCTX:
3311 RETURN(ll_flush_ctx(inode));
3312 case LL_IOC_PATH2FID: {
3313 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3314 sizeof(struct lu_fid)))
3319 case LL_IOC_GETPARENT:
3320 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3322 case OBD_IOC_FID2PATH:
3323 RETURN(ll_fid2path(inode, (void __user *)arg));
3324 case LL_IOC_DATA_VERSION: {
3325 struct ioc_data_version idv;
3328 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3331 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3332 rc = ll_ioc_data_version(inode, &idv);
3335 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3341 case LL_IOC_GET_MDTIDX: {
3344 mdtidx = ll_get_mdt_idx(inode);
3348 if (put_user((int)mdtidx, (int __user *)arg))
3353 case OBD_IOC_GETDTNAME:
3354 case OBD_IOC_GETMDNAME:
3355 RETURN(ll_get_obd_name(inode, cmd, arg));
3356 case LL_IOC_HSM_STATE_GET: {
3357 struct md_op_data *op_data;
3358 struct hsm_user_state *hus;
3365 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3366 LUSTRE_OPC_ANY, hus);
3367 if (IS_ERR(op_data)) {
3369 RETURN(PTR_ERR(op_data));
3372 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3375 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3378 ll_finish_md_op_data(op_data);
3382 case LL_IOC_HSM_STATE_SET: {
3383 struct hsm_state_set *hss;
3390 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3395 rc = ll_hsm_state_set(inode, hss);
3400 case LL_IOC_HSM_ACTION: {
3401 struct md_op_data *op_data;
3402 struct hsm_current_action *hca;
3409 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3410 LUSTRE_OPC_ANY, hca);
3411 if (IS_ERR(op_data)) {
3413 RETURN(PTR_ERR(op_data));
3416 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3419 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3422 ll_finish_md_op_data(op_data);
3426 case LL_IOC_SET_LEASE_OLD: {
3427 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3429 RETURN(ll_file_set_lease(file, &ioc, 0));
3431 case LL_IOC_SET_LEASE: {
3432 struct ll_ioc_lease ioc;
3434 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3437 RETURN(ll_file_set_lease(file, &ioc, arg));
3439 case LL_IOC_GET_LEASE: {
3440 struct ll_inode_info *lli = ll_i2info(inode);
3441 struct ldlm_lock *lock = NULL;
3444 mutex_lock(&lli->lli_och_mutex);
3445 if (fd->fd_lease_och != NULL) {
3446 struct obd_client_handle *och = fd->fd_lease_och;
3448 lock = ldlm_handle2lock(&och->och_lease_handle);
3450 lock_res_and_lock(lock);
3451 if (!ldlm_is_cancel(lock))
3452 fmode = och->och_flags;
3454 unlock_res_and_lock(lock);
3455 LDLM_LOCK_PUT(lock);
3458 mutex_unlock(&lli->lli_och_mutex);
3460 RETURN(ll_lease_type_from_fmode(fmode));
3462 case LL_IOC_HSM_IMPORT: {
3463 struct hsm_user_import *hui;
3469 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3474 rc = ll_hsm_import(inode, file, hui);
3479 case LL_IOC_FUTIMES_3: {
3480 struct ll_futimes_3 lfu;
3482 if (copy_from_user(&lfu,
3483 (const struct ll_futimes_3 __user *)arg,
3487 RETURN(ll_file_futimes_3(file, &lfu));
3489 case LL_IOC_LADVISE: {
3490 struct llapi_ladvise_hdr *k_ladvise_hdr;
3491 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3494 int alloc_size = sizeof(*k_ladvise_hdr);
3497 u_ladvise_hdr = (void __user *)arg;
3498 OBD_ALLOC_PTR(k_ladvise_hdr);
3499 if (k_ladvise_hdr == NULL)
3502 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3503 GOTO(out_ladvise, rc = -EFAULT);
3505 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3506 k_ladvise_hdr->lah_count < 1)
3507 GOTO(out_ladvise, rc = -EINVAL);
3509 num_advise = k_ladvise_hdr->lah_count;
3510 if (num_advise >= LAH_COUNT_MAX)
3511 GOTO(out_ladvise, rc = -EFBIG);
3513 OBD_FREE_PTR(k_ladvise_hdr);
3514 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3515 lah_advise[num_advise]);
3516 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3517 if (k_ladvise_hdr == NULL)
3521 * TODO: submit multiple advices to one server in a single RPC
3523 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3524 GOTO(out_ladvise, rc = -EFAULT);
3526 for (i = 0; i < num_advise; i++) {
3527 struct llapi_lu_ladvise *k_ladvise =
3528 &k_ladvise_hdr->lah_advise[i];
3529 struct llapi_lu_ladvise __user *u_ladvise =
3530 &u_ladvise_hdr->lah_advise[i];
3532 rc = ll_ladvise_sanity(inode, k_ladvise);
3534 GOTO(out_ladvise, rc);
3536 switch (k_ladvise->lla_advice) {
3537 case LU_LADVISE_LOCKNOEXPAND:
3538 rc = ll_lock_noexpand(file,
3539 k_ladvise->lla_peradvice_flags);
3540 GOTO(out_ladvise, rc);
3541 case LU_LADVISE_LOCKAHEAD:
3543 rc = ll_file_lock_ahead(file, k_ladvise);
3546 GOTO(out_ladvise, rc);
3549 &u_ladvise->lla_lockahead_result))
3550 GOTO(out_ladvise, rc = -EFAULT);
3553 rc = ll_ladvise(inode, file,
3554 k_ladvise_hdr->lah_flags,
3557 GOTO(out_ladvise, rc);
3564 OBD_FREE(k_ladvise_hdr, alloc_size);
3567 case LL_IOC_FLR_SET_MIRROR: {
3568 /* mirror I/O must be direct to avoid polluting page cache
3570 if (!(file->f_flags & O_DIRECT))
3573 fd->fd_designated_mirror = (__u32)arg;
3576 case LL_IOC_FSGETXATTR:
3577 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3578 case LL_IOC_FSSETXATTR:
3579 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3581 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3583 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3584 (void __user *)arg));
3588 #ifndef HAVE_FILE_LLSEEK_SIZE
3589 static inline loff_t
3590 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3592 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3594 if (offset > maxsize)
3597 if (offset != file->f_pos) {
3598 file->f_pos = offset;
3599 file->f_version = 0;
3605 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3606 loff_t maxsize, loff_t eof)
3608 struct inode *inode = file_inode(file);
3616 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3617 * position-querying operation. Avoid rewriting the "same"
3618 * f_pos value back to the file because a concurrent read(),
3619 * write() or lseek() might have altered it
3624 * f_lock protects against read/modify/write race with other
3625 * SEEK_CURs. Note that parallel writes and reads behave
3629 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3630 inode_unlock(inode);
3634 * In the generic case the entire file is data, so as long as
3635 * offset isn't at the end of the file then the offset is data.
3642 * There is a virtual hole at the end of the file, so as long as
3643 * offset isn't i_size or larger, return i_size.
3651 return llseek_execute(file, offset, maxsize);
3655 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3657 struct inode *inode = file_inode(file);
3658 loff_t retval, eof = 0;
3661 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3662 (origin == SEEK_CUR) ? file->f_pos : 0);
3663 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3664 PFID(ll_inode2fid(inode)), inode, retval, retval,
3666 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3668 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3669 retval = ll_glimpse_size(inode);
3672 eof = i_size_read(inode);
3675 retval = ll_generic_file_llseek_size(file, offset, origin,
3676 ll_file_maxbytes(inode), eof);
3680 static int ll_flush(struct file *file, fl_owner_t id)
3682 struct inode *inode = file_inode(file);
3683 struct ll_inode_info *lli = ll_i2info(inode);
3684 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3687 LASSERT(!S_ISDIR(inode->i_mode));
3689 /* catch async errors that were recorded back when async writeback
3690 * failed for pages in this mapping. */
3691 rc = lli->lli_async_rc;
3692 lli->lli_async_rc = 0;
3693 if (lli->lli_clob != NULL) {
3694 err = lov_read_and_clear_async_rc(lli->lli_clob);
3699 /* The application has been told write failure already.
3700 * Do not report failure again. */
3701 if (fd->fd_write_failed)
3703 return rc ? -EIO : 0;
3707 * Called to make sure a portion of file has been written out.
3708 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3710 * Return how many pages have been written.
3712 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3713 enum cl_fsync_mode mode, int ignore_layout)
3717 struct cl_fsync_io *fio;
3722 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3723 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3726 env = cl_env_get(&refcheck);
3728 RETURN(PTR_ERR(env));
3730 io = vvp_env_thread_io(env);
3731 io->ci_obj = ll_i2info(inode)->lli_clob;
3732 io->ci_ignore_layout = ignore_layout;
3734 /* initialize parameters for sync */
3735 fio = &io->u.ci_fsync;
3736 fio->fi_start = start;
3738 fio->fi_fid = ll_inode2fid(inode);
3739 fio->fi_mode = mode;
3740 fio->fi_nr_written = 0;
3742 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3743 result = cl_io_loop(env, io);
3745 result = io->ci_result;
3747 result = fio->fi_nr_written;
3748 cl_io_fini(env, io);
3749 cl_env_put(env, &refcheck);
3755 * When dentry is provided (the 'else' case), file_dentry() may be
3756 * null and dentry must be used directly rather than pulled from
3757 * file_dentry() as is done otherwise.
3760 #ifdef HAVE_FILE_FSYNC_4ARGS
3761 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3763 struct dentry *dentry = file_dentry(file);
3765 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3766 int ll_fsync(struct file *file, int datasync)
3768 struct dentry *dentry = file_dentry(file);
3770 loff_t end = LLONG_MAX;
3772 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3775 loff_t end = LLONG_MAX;
3777 struct inode *inode = dentry->d_inode;
3778 struct ll_inode_info *lli = ll_i2info(inode);
3779 struct ptlrpc_request *req;
3783 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3784 PFID(ll_inode2fid(inode)), inode);
3785 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3787 #ifdef HAVE_FILE_FSYNC_4ARGS
3788 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3789 lock_inode = !lli->lli_inode_locked;
3793 /* fsync's caller has already called _fdata{sync,write}, we want
3794 * that IO to finish before calling the osc and mdc sync methods */
3795 rc = filemap_fdatawait(inode->i_mapping);
3798 /* catch async errors that were recorded back when async writeback
3799 * failed for pages in this mapping. */
3800 if (!S_ISDIR(inode->i_mode)) {
3801 err = lli->lli_async_rc;
3802 lli->lli_async_rc = 0;
3805 if (lli->lli_clob != NULL) {
3806 err = lov_read_and_clear_async_rc(lli->lli_clob);
3812 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3816 ptlrpc_req_finished(req);
3818 if (S_ISREG(inode->i_mode)) {
3819 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3821 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3822 if (rc == 0 && err < 0)
3825 fd->fd_write_failed = true;
3827 fd->fd_write_failed = false;
3830 #ifdef HAVE_FILE_FSYNC_4ARGS
3832 inode_unlock(inode);
3838 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3840 struct inode *inode = file_inode(file);
3841 struct ll_sb_info *sbi = ll_i2sbi(inode);
3842 struct ldlm_enqueue_info einfo = {
3843 .ei_type = LDLM_FLOCK,
3844 .ei_cb_cp = ldlm_flock_completion_ast,
3845 .ei_cbdata = file_lock,
3847 struct md_op_data *op_data;
3848 struct lustre_handle lockh = { 0 };
3849 union ldlm_policy_data flock = { { 0 } };
3850 int fl_type = file_lock->fl_type;
3856 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3857 PFID(ll_inode2fid(inode)), file_lock);
3859 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3861 if (file_lock->fl_flags & FL_FLOCK) {
3862 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3863 /* flocks are whole-file locks */
3864 flock.l_flock.end = OFFSET_MAX;
3865 /* For flocks owner is determined by the local file desctiptor*/
3866 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3867 } else if (file_lock->fl_flags & FL_POSIX) {
3868 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3869 flock.l_flock.start = file_lock->fl_start;
3870 flock.l_flock.end = file_lock->fl_end;
3874 flock.l_flock.pid = file_lock->fl_pid;
3876 /* Somewhat ugly workaround for svc lockd.
3877 * lockd installs custom fl_lmops->lm_compare_owner that checks
3878 * for the fl_owner to be the same (which it always is on local node
3879 * I guess between lockd processes) and then compares pid.
3880 * As such we assign pid to the owner field to make it all work,
3881 * conflict with normal locks is unlikely since pid space and
3882 * pointer space for current->files are not intersecting */
3883 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3884 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3888 einfo.ei_mode = LCK_PR;
3891 /* An unlock request may or may not have any relation to
3892 * existing locks so we may not be able to pass a lock handle
3893 * via a normal ldlm_lock_cancel() request. The request may even
3894 * unlock a byte range in the middle of an existing lock. In
3895 * order to process an unlock request we need all of the same
3896 * information that is given with a normal read or write record
3897 * lock request. To avoid creating another ldlm unlock (cancel)
3898 * message we'll treat a LCK_NL flock request as an unlock. */
3899 einfo.ei_mode = LCK_NL;
3902 einfo.ei_mode = LCK_PW;
3905 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3920 flags = LDLM_FL_BLOCK_NOWAIT;
3926 flags = LDLM_FL_TEST_LOCK;
3929 CERROR("unknown fcntl lock command: %d\n", cmd);
3933 /* Save the old mode so that if the mode in the lock changes we
3934 * can decrement the appropriate reader or writer refcount. */
3935 file_lock->fl_type = einfo.ei_mode;
3937 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3938 LUSTRE_OPC_ANY, NULL);
3939 if (IS_ERR(op_data))
3940 RETURN(PTR_ERR(op_data));
3942 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3943 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3944 flock.l_flock.pid, flags, einfo.ei_mode,
3945 flock.l_flock.start, flock.l_flock.end);
3947 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3950 /* Restore the file lock type if not TEST lock. */
3951 if (!(flags & LDLM_FL_TEST_LOCK))
3952 file_lock->fl_type = fl_type;
3954 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3955 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3956 !(flags & LDLM_FL_TEST_LOCK))
3957 rc2 = locks_lock_file_wait(file, file_lock);
3959 if ((file_lock->fl_flags & FL_FLOCK) &&
3960 (rc == 0 || file_lock->fl_type == F_UNLCK))
3961 rc2 = flock_lock_file_wait(file, file_lock);
3962 if ((file_lock->fl_flags & FL_POSIX) &&
3963 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3964 !(flags & LDLM_FL_TEST_LOCK))
3965 rc2 = posix_lock_file_wait(file, file_lock);
3966 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3968 if (rc2 && file_lock->fl_type != F_UNLCK) {
3969 einfo.ei_mode = LCK_NL;
3970 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3975 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name(): resolve the FID of entry @name (length @namelen)
 * under directory @parent with an MDS getattr-by-name RPC; if @inode is
 * non-NULL the child inode is also instantiated from the reply.
 * NOTE(review): this extract is missing interior lines (locals, error
 * checks, the out_req label and final RETURN) - not a complete listing.
 */
3980 int ll_get_fid_by_name(struct inode *parent, const char *name,
3981 int namelen, struct lu_fid *fid,
3982 struct inode **inode)
3984 struct md_op_data *op_data = NULL;
3985 struct mdt_body *body;
3986 struct ptlrpc_request *req;
3990 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3991 LUSTRE_OPC_ANY, NULL);
3992 if (IS_ERR(op_data))
3993 RETURN(PTR_ERR(op_data));
/* only the FID and file type are needed from the server */
3995 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3996 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3997 ll_finish_md_op_data(op_data);
4001 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4003 GOTO(out_req, rc = -EFAULT);
/* hand the child's FID back to the caller */
4005 *fid = body->mbo_fid1;
4008 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4010 ptlrpc_req_finished(req);
/*
 * ll_migrate(): migrate directory entry @name under @parent to MDT
 * @mdtidx via a MDS_RENAME_MIGRATE rename RPC. For regular files a
 * write lease is taken and the data version recorded so the server can
 * detect concurrent modification.
 * NOTE(review): interior lines (several gotos, labels, the retry path)
 * are missing from this extract - not a complete listing.
 */
4014 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
4015 const char *name, int namelen)
4017 struct dentry *dchild = NULL;
4018 struct inode *child_inode = NULL;
4019 struct md_op_data *op_data;
4020 struct ptlrpc_request *request = NULL;
4021 struct obd_client_handle *och = NULL;
4023 struct mdt_body *body;
4025 __u64 data_version = 0;
4028 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
4029 name, PFID(ll_inode2fid(parent)), mdtidx);
4031 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4032 0, LUSTRE_OPC_ANY, NULL);
4033 if (IS_ERR(op_data))
4034 RETURN(PTR_ERR(op_data));
4036 /* Get child FID first */
4037 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* prefer a cached dentry; fall back to asking the MDS by name */
4040 dchild = d_lookup(file_dentry(file), &qstr);
4041 if (dchild != NULL) {
4042 if (dchild->d_inode != NULL)
4043 child_inode = igrab(dchild->d_inode);
4047 if (child_inode == NULL) {
4048 rc = ll_get_fid_by_name(parent, name, namelen,
4049 &op_data->op_fid3, &child_inode);
4054 if (child_inode == NULL)
4055 GOTO(out_free, rc = -EINVAL);
/* refuse to migrate the filesystem root itself */
4058 * lfs migrate command needs to be blocked on the client
4059 * by checking the migrate FID against the FID of the
4062 if (child_inode == parent->i_sb->s_root->d_inode)
4063 GOTO(out_iput, rc = -EINVAL);
4065 inode_lock(child_inode);
4066 op_data->op_fid3 = *ll_inode2fid(child_inode);
4067 if (!fid_is_sane(&op_data->op_fid3)) {
4068 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4069 ll_get_fsname(parent->i_sb, NULL, 0), name,
4070 PFID(&op_data->op_fid3));
4071 GOTO(out_unlock, rc = -EINVAL);
/* nothing to do if the file already lives on the target MDT */
4074 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4076 GOTO(out_unlock, rc);
4079 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4080 PFID(&op_data->op_fid3), mdtidx);
4081 GOTO(out_unlock, rc = 0);
/* regular file: take a write lease and snapshot the data version */
4084 if (S_ISREG(child_inode->i_mode)) {
4085 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4089 GOTO(out_unlock, rc);
4092 rc = ll_data_version(child_inode, &data_version,
4095 GOTO(out_close, rc);
4097 op_data->op_handle = och->och_fh;
4098 op_data->op_data = och->och_mod;
4099 op_data->op_data_version = data_version;
4100 op_data->op_lease_handle = och->och_lease_handle;
4101 op_data->op_bias |= MDS_RENAME_MIGRATE;
/* migration is implemented as a rename onto the same name */
4104 op_data->op_mds = mdtidx;
4105 op_data->op_cli_flags = CLI_MIGRATE;
4106 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4107 namelen, name, namelen, &request);
4109 LASSERT(request != NULL);
4110 ll_update_times(request, parent);
4112 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4113 LASSERT(body != NULL);
4115 /* If the server does release layout lock, then we cleanup
4116 * the client och here, otherwise release it in out_close: */
4118 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4119 obd_mod_put(och->och_mod);
4120 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4122 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4128 if (request != NULL) {
4129 ptlrpc_req_finished(request);
4133 /* Try again if the file layout has changed. */
4134 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4138 if (och != NULL) /* close the file */
4139 ll_lease_close(och, child_inode, NULL);
4141 clear_nlink(child_inode);
4143 inode_unlock(child_inode);
4147 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock(): .lock/.flock handler installed for "-o noflock"
 * mounts (see ll_file_operations_noflock below); rejects all file
 * locking requests. NOTE(review): the function body is not visible in
 * this extract - presumably it returns -ENOSYS, per the table comment.
 */
4152 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4160 * test if some locks matching bits and l_req_mode are acquired
4161 * - bits can be in different locks
4162 * - if found clear the common lock bits in *bits
4163 * - the bits not found, are kept in *bits
4165 * \param bits [IN] searched lock bits
4166 * \param l_req_mode [IN] searched lock mode
4167 * \retval boolean, true iff all bits are found
4169 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4171 struct lustre_handle lockh;
4172 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match CR|CW|PR|PW */
4173 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4174 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4183 fid = &ll_i2info(inode)->lli_fid;
4184 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4185 ldlm_lockname[mode]);
/* LDLM_FL_TEST_LOCK: only probe for a match, do not take a reference */
4187 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each inodebit individually; stop early once *bits is empty */
4188 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4189 policy.l_inodebits.bits = *bits & (1 << i);
4190 if (policy.l_inodebits.bits == 0)
4193 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4194 &policy, mode, &lockh)) {
4195 struct ldlm_lock *lock;
4197 lock = ldlm_handle2lock(&lockh);
/* clear every bit covered by the matched lock, not just bit i */
4200 ~(lock->l_policy_data.l_inodebits.bits);
4201 LDLM_LOCK_PUT(lock);
4203 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): try to match (and reference) a cached MDS inodebits
 * lock on @inode covering @bits with one of the modes in @mode.
 * \retval the matched lock mode (0 if none); on success @lockh holds a
 * referenced lock handle the caller must eventually decref.
 */
4210 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4211 struct lustre_handle *lockh, __u64 flags,
4212 enum ldlm_mode mode)
4214 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4219 fid = &ll_i2info(inode)->lli_fid;
4220 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4222 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4223 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini(): post-process the result of an inode
 * revalidation RPC, downgrading expected errors (-ENOENT on an
 * already-unlinked inode) and logging unexpected ones.
 */
4228 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4230 /* Already unlinked. Just update nlink and return success */
4231 if (rc == -ENOENT) {
4233 /* If it is striped directory, and there is bad stripe
4234 * Let's revalidate the dentry again, instead of returning
4236 if (S_ISDIR(inode->i_mode) &&
4237 ll_i2info(inode)->lli_lsm_md != NULL)
4240 /* This path cannot be hit for regular files unless in
4241 * case of obscure races, so no need to validate
4243 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4245 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected permission/identity errors: log quietly */
4246 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4247 "%s: revalidate FID "DFID" error: rc = %d\n",
4248 ll_get_fsname(inode->i_sb, NULL, 0),
4249 PFID(ll_inode2fid(inode)), rc);
/*
 * ll_inode_revalidate(): refresh @dentry's inode attributes from the MDS
 * with an intent lock RPC (@op is e.g. IT_GETATTR or IT_LOOKUP), then
 * finish the revalidation and release the intent.
 */
4255 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4257 struct inode *inode = dentry->d_inode;
4258 struct obd_export *exp = ll_i2mdexp(inode);
4259 struct lookup_intent oit = {
4262 struct ptlrpc_request *req = NULL;
4263 struct md_op_data *op_data;
4267 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4268 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4270 /* Call getattr by fid, so do not provide name at all. */
4271 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4272 LUSTRE_OPC_ANY, NULL);
4273 if (IS_ERR(op_data))
4274 RETURN(PTR_ERR(op_data));
4276 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4277 ll_finish_md_op_data(op_data);
/* on RPC failure, let the fini helper classify the error */
4279 rc = ll_inode_revalidate_fini(inode, rc);
4283 rc = ll_revalidate_it_finish(req, &oit, dentry);
4285 ll_intent_release(&oit);
4289 /* Unlinked? Unhash dentry, so it is not picked up later by
4290 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4291 * here to preserve get_cwd functionality on 2.6.
4293 if (!dentry->d_inode->i_nlink) {
4294 ll_lock_dcache(inode);
4295 d_lustre_invalidate(dentry, 0);
4296 ll_unlock_dcache(inode);
4299 ll_lookup_finish_locks(&oit, dentry);
4301 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr(): for a striped directory, merge the per-stripe
 * metadata attributes (nlink, blocks, size, a/m/ctime) from all stripes
 * into the master inode.
 */
4306 static int ll_merge_md_attr(struct inode *inode)
4308 struct cl_attr attr = { 0 };
/* only meaningful for striped directories, which carry an lsm_md */
4311 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4312 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4313 &attr, ll_md_blocking_ast);
4317 set_nlink(inode, attr.cat_nlink);
4318 inode->i_blocks = attr.cat_blocks;
4319 i_size_write(inode, attr.cat_size);
/* cache the merged timestamps in the Lustre inode info */
4321 ll_i2info(inode)->lli_atime = attr.cat_atime;
4322 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4323 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_compat_encode_dev(): squeeze a device number into the 8-bit
 * major/minor range required by 32-bit compat stat syscalls.
 */
4328 static inline dev_t ll_compat_encode_dev(dev_t dev)
4330 /* The compat_sys_*stat*() syscalls will fail unless the
4331 * device majors and minors are both less than 256. Note that
4332 * the value returned here will be passed through
4333 * old_encode_dev() in cp_compat_stat(). And so we are not
4334 * trying to return a valid compat (u16) device number, just
4335 * one that will pass the old_valid_dev() check. */
4337 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ll_getattr(): VFS ->getattr. Revalidates the inode from the MDS,
 * glimpses the size from the OSTs for regular files, merges striped-dir
 * attributes, then fills in *stat. Two prototypes are provided to match
 * the kernel's inode_operations signature (path-based vs mnt/dentry).
 */
4340 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4341 int ll_getattr(const struct path *path, struct kstat *stat,
4342 u32 request_mask, unsigned int flags)
4344 struct dentry *de = path->dentry;
4346 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4349 struct inode *inode = de->d_inode;
4350 struct ll_sb_info *sbi = ll_i2sbi(inode);
4351 struct ll_inode_info *lli = ll_i2info(inode);
4354 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4356 rc = ll_inode_revalidate(de, IT_GETATTR);
4360 if (S_ISREG(inode->i_mode)) {
4361 /* In case of restore, the MDT has the right size and has
4362 * already send it back without granting the layout lock,
4363 * inode is up-to-date so glimpse is useless.
4364 * Also to glimpse we need the layout, in case of a running
4365 * restore the MDT holds the layout lock so the glimpse will
4366 * block up to the end of restore (getattr will block)
4368 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4369 rc = ll_glimpse_size(inode);
4374 /* If object isn't regular a file then don't validate size. */
4375 if (S_ISDIR(inode->i_mode) &&
4376 lli->lli_lsm_md != NULL) {
4377 rc = ll_merge_md_attr(inode);
/* propagate the cached Lustre timestamps into the VFS inode */
4382 LTIME_S(inode->i_atime) = lli->lli_atime;
4383 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4384 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4387 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace needs compat-encoded inode/device numbers */
4389 if (ll_need_32bit_api(sbi)) {
4390 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4391 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4392 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4394 stat->ino = inode->i_ino;
4395 stat->dev = inode->i_sb->s_dev;
4396 stat->rdev = inode->i_rdev;
4399 stat->mode = inode->i_mode;
4400 stat->uid = inode->i_uid;
4401 stat->gid = inode->i_gid;
4402 stat->atime = inode->i_atime;
4403 stat->mtime = inode->i_mtime;
4404 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize if the admin configured one */
4405 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4407 stat->nlink = inode->i_nlink;
4408 stat->size = i_size_read(inode);
4409 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap(): VFS ->fiemap handler. Marshals the kernel's
 * fiemap_extent_info into a contiguous struct fiemap buffer, runs the
 * Lustre fiemap, and copies mapped extents back to userspace.
 */
4414 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4415 __u64 start, __u64 len)
4419 struct fiemap *fiemap;
4420 unsigned int extent_count = fieinfo->fi_extents_max;
4422 num_bytes = sizeof(*fiemap) + (extent_count *
4423 sizeof(struct fiemap_extent));
4424 OBD_ALLOC_LARGE(fiemap, num_bytes);
4429 fiemap->fm_flags = fieinfo->fi_flags;
4430 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4431 fiemap->fm_start = start;
4432 fiemap->fm_length = len;
/* only the first extent is read in: it can carry continuation state */
4433 if (extent_count > 0 &&
4434 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4435 sizeof(struct fiemap_extent)) != 0)
4436 GOTO(out, rc = -EFAULT);
4438 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4440 fieinfo->fi_flags = fiemap->fm_flags;
4441 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4442 if (extent_count > 0 &&
4443 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4444 fiemap->fm_mapped_extents *
4445 sizeof(struct fiemap_extent)) != 0)
4446 GOTO(out, rc = -EFAULT);
4448 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): VFS ->get_acl. Returns a referenced copy of the cached
 * POSIX ACL under lli_lock; the VFS releases the reference.
 */
4452 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4454 struct ll_inode_info *lli = ll_i2info(inode);
4455 struct posix_acl *acl = NULL;
4458 spin_lock(&lli->lli_lock);
4459 /* VFS' acl_permission_check->check_acl will release the refcount */
4460 acl = posix_acl_dup(lli->lli_posix_acl);
4461 spin_unlock(&lli->lli_lock);
/*
 * ll_set_acl(): VFS ->set_acl. Translates the ACL into its xattr
 * representation, stores it via setxattr on the MDS, and updates the
 * local ACL cache. NOTE(review): several lines (rc checks, out label)
 * are missing from this extract.
 */
4466 #ifdef HAVE_IOP_SET_ACL
4467 #ifdef CONFIG_FS_POSIX_ACL
4468 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4470 const char *name = NULL;
4477 case ACL_TYPE_ACCESS:
/* an access ACL may also rewrite the file mode bits */
4479 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4483 name = XATTR_NAME_POSIX_ACL_ACCESS;
4485 case ACL_TYPE_DEFAULT:
/* default ACLs are only valid on directories */
4486 if (!S_ISDIR(inode->i_mode))
4487 GOTO(out, rc = acl ? -EACCES : 0);
4488 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4491 GOTO(out, rc = -EINVAL);
4495 size = posix_acl_xattr_size(acl->a_count);
4496 value = kmalloc(size, GFP_NOFS);
4498 GOTO(out, rc = -ENOMEM);
4500 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4505 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4506 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
/* keep the VFS ACL cache coherent with what was stored */
4511 set_cached_acl(inode, type, acl);
4513 forget_cached_acl(inode, type);
4516 #endif /* CONFIG_FS_POSIX_ACL */
4517 #endif /* HAVE_IOP_SET_ACL */
/*
 * ll_check_acl(): ACL callback for older kernels' generic_permission()
 * (only built when HAVE_GENERIC_PERMISSION_2ARGS is not set). Checks
 * @mask against the inode's access ACL; returns the posix_acl_permission
 * result, or passes through when ACLs are compiled out.
 */
4519 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4521 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4522 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4524 ll_check_acl(struct inode *inode, int mask)
4527 # ifdef CONFIG_FS_POSIX_ACL
4528 struct posix_acl *acl;
4532 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot take sleeping locks in RCU-walk mode */
4533 if (flags & IPERM_FLAG_RCU)
4536 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4541 rc = posix_acl_permission(inode, acl, mask);
4542 posix_acl_release(acl);
4545 # else /* !CONFIG_FS_POSIX_ACL */
4547 # endif /* CONFIG_FS_POSIX_ACL */
4549 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission(): VFS ->permission. Revalidates the root inode on
 * first access, applies root squash to fsuid/fsgid/capabilities when
 * configured, then defers to generic permission checking. Multiple
 * prototypes match the kernel's changing ->permission signature.
 */
4551 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4552 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4554 # ifdef HAVE_INODE_PERMISION_2ARGS
4555 int ll_inode_permission(struct inode *inode, int mask)
4557 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4562 struct ll_sb_info *sbi;
4563 struct root_squash_info *squash;
4564 struct cred *cred = NULL;
4565 const struct cred *old_cred = NULL;
4567 bool squash_id = false;
/* cannot block in RCU-walk: tell the VFS to retry in ref-walk mode */
4570 #ifdef MAY_NOT_BLOCK
4571 if (mask & MAY_NOT_BLOCK)
4573 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4574 if (flags & IPERM_FLAG_RCU)
4578 /* as root inode are NOT getting validated in lookup operation,
4579 * need to do it before permission check. */
4581 if (inode == inode->i_sb->s_root->d_inode) {
4582 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4587 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4588 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4590 /* squash fsuid/fsgid if needed */
4591 sbi = ll_i2sbi(inode);
4592 squash = &sbi->ll_squash;
4593 if (unlikely(squash->rsi_uid != 0 &&
4594 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4595 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4599 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4600 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4601 squash->rsi_uid, squash->rsi_gid);
4603 /* update current process's credentials
4604 * and FS capability */
4605 cred = prepare_creds();
4609 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4610 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4611 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4612 if ((1 << cap) & CFS_CAP_FS_MASK)
4613 cap_lower(cred->cap_effective, cap);
4615 old_cred = override_creds(cred);
4618 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4619 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4620 /* restore current process's credentials and FS capability */
4622 revert_creds(old_cred);
4629 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations: no .flock/.lock, so the kernel falls back to
 * local-only POSIX/flock semantics */
4630 struct file_operations ll_file_operations = {
4631 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4632 # ifdef HAVE_SYNC_READ_WRITE
4633 .read = new_sync_read,
4634 .write = new_sync_write,
4636 .read_iter = ll_file_read_iter,
4637 .write_iter = ll_file_write_iter,
4638 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4639 .read = ll_file_read,
4640 .aio_read = ll_file_aio_read,
4641 .write = ll_file_write,
4642 .aio_write = ll_file_aio_write,
4643 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4644 .unlocked_ioctl = ll_file_ioctl,
4645 .open = ll_file_open,
4646 .release = ll_file_release,
4647 .mmap = ll_file_mmap,
4648 .llseek = ll_file_seek,
4649 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: cluster-coherent locking via
 * ll_file_flock for both .flock and .lock */
4654 struct file_operations ll_file_operations_flock = {
4655 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4656 # ifdef HAVE_SYNC_READ_WRITE
4657 .read = new_sync_read,
4658 .write = new_sync_write,
4659 # endif /* HAVE_SYNC_READ_WRITE */
4660 .read_iter = ll_file_read_iter,
4661 .write_iter = ll_file_write_iter,
4662 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4663 .read = ll_file_read,
4664 .aio_read = ll_file_aio_read,
4665 .write = ll_file_write,
4666 .aio_write = ll_file_aio_write,
4667 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4668 .unlocked_ioctl = ll_file_ioctl,
4669 .open = ll_file_open,
4670 .release = ll_file_release,
4671 .mmap = ll_file_mmap,
4672 .llseek = ll_file_seek,
4673 .splice_read = ll_file_splice_read,
4676 .flock = ll_file_flock,
4677 .lock = ll_file_flock
4680 /* These are for -o noflock - to return ENOSYS on flock calls */
4681 struct file_operations ll_file_operations_noflock = {
4682 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4683 # ifdef HAVE_SYNC_READ_WRITE
4684 .read = new_sync_read,
4685 .write = new_sync_write,
4686 # endif /* HAVE_SYNC_READ_WRITE */
4687 .read_iter = ll_file_read_iter,
4688 .write_iter = ll_file_write_iter,
4689 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4690 .read = ll_file_read,
4691 .aio_read = ll_file_aio_read,
4692 .write = ll_file_write,
4693 .aio_write = ll_file_aio_write,
4694 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4695 .unlocked_ioctl = ll_file_ioctl,
4696 .open = ll_file_open,
4697 .release = ll_file_release,
4698 .mmap = ll_file_mmap,
4699 .llseek = ll_file_seek,
4700 .splice_read = ll_file_splice_read,
/* both lock entry points route to the ENOSYS stub */
4703 .flock = ll_file_noflock,
4704 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/ACL entries are gated on
 * the kernel features detected at configure time */
4707 struct inode_operations ll_file_inode_operations = {
4708 .setattr = ll_setattr,
4709 .getattr = ll_getattr,
4710 .permission = ll_inode_permission,
4711 #ifdef HAVE_IOP_XATTR
4712 .setxattr = ll_setxattr,
4713 .getxattr = ll_getxattr,
4714 .removexattr = ll_removexattr,
4716 .listxattr = ll_listxattr,
4717 .fiemap = ll_fiemap,
4718 #ifdef HAVE_IOP_GET_ACL
4719 .get_acl = ll_get_acl,
4721 #ifdef HAVE_IOP_SET_ACL
4722 .set_acl = ll_set_acl,
/*
 * ll_layout_conf(): push a layout configuration @conf into the cl_object
 * stack of @inode. For OBJECT_CONF_SET it also allows the layout lock to
 * be matched and records the new layout generation.
 */
4726 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4728 struct ll_inode_info *lli = ll_i2info(inode);
4729 struct cl_object *obj = lli->lli_clob;
4738 env = cl_env_get(&refcheck);
4740 RETURN(PTR_ERR(env));
4742 rc = cl_conf_set(env, lli->lli_clob, conf);
4746 if (conf->coc_opc == OBJECT_CONF_SET) {
4747 struct ldlm_lock *lock = conf->coc_lock;
4748 struct cl_layout cl = {
4752 LASSERT(lock != NULL);
4753 LASSERT(ldlm_has_layout(lock));
4755 /* it can only be allowed to match after layout is
4756 * applied to inode otherwise false layout would be
4757 * seen. Applying layout should happen before dropping
4758 * the intent lock. */
4759 ldlm_lock_allow_match(lock);
4761 rc = cl_object_layout_get(env, obj, &cl);
4766 DFID": layout version change: %u -> %u\n",
4767 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4769 ll_layout_version_set(lli, cl.cl_layout_gen);
4773 cl_env_put(env, &refcheck);
4778 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4779 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4782 struct ll_sb_info *sbi = ll_i2sbi(inode);
4783 struct ptlrpc_request *req;
4784 struct mdt_body *body;
4791 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4792 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4793 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock: nothing to fetch */
4795 if (lock->l_lvb_data != NULL)
4798 /* if layout lock was granted right away, the layout is returned
4799 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4800 * blocked and then granted via completion ast, we have to fetch
4801 * layout here. Please note that we can't use the LVB buffer in
4802 * completion AST because it doesn't have a large enough buffer */
4803 rc = ll_get_default_mdsize(sbi, &lmmsize);
4805 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4806 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4811 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4813 GOTO(out, rc = -EPROTO);
4815 lmmsize = body->mbo_eadatasize;
4816 if (lmmsize == 0) /* empty layout */
4819 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4821 GOTO(out, rc = -EFAULT);
/* copy the LOV EA into a private buffer and attach it to the lock */
4823 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4824 if (lvbdata == NULL)
4825 GOTO(out, rc = -ENOMEM);
4827 memcpy(lvbdata, lmm, lmmsize);
4828 lock_res_and_lock(lock);
/* only install our buffer if nobody raced us and attached one first */
4829 if (unlikely(lock->l_lvb_data == NULL)) {
4830 lock->l_lvb_type = LVB_T_LAYOUT;
4831 lock->l_lvb_data = lvbdata;
4832 lock->l_lvb_len = lmmsize;
4835 unlock_res_and_lock(lock);
4838 OBD_FREE_LARGE(lvbdata, lmmsize);
4843 ptlrpc_req_finished(req);
4848 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set(): fetch the layout carried by the granted layout
 * lock @lockh, configure the inode's cl_object with it, then drop the
 * lock reference; if the old layout is still busy, wait for in-flight
 * IO to drain before returning.
 */
4851 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4852 struct inode *inode)
4854 struct ll_inode_info *lli = ll_i2info(inode);
4855 struct ll_sb_info *sbi = ll_i2sbi(inode);
4856 struct ldlm_lock *lock;
4857 struct cl_object_conf conf;
4860 bool wait_layout = false;
4863 LASSERT(lustre_handle_is_used(lockh));
4865 lock = ldlm_handle2lock(lockh);
4866 LASSERT(lock != NULL);
4867 LASSERT(ldlm_has_layout(lock));
4869 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4870 PFID(&lli->lli_fid), inode);
4872 /* in case this is a caching lock and reinstate with new inode */
4873 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4875 lock_res_and_lock(lock);
4876 lvb_ready = ldlm_is_lvb_ready(lock);
4877 unlock_res_and_lock(lock);
4879 /* checking lvb_ready is racy but this is okay. The worst case is
4880 * that multi processes may configure the file on the same time. */
4884 rc = ll_layout_fetch(inode, lock);
4888 /* for layout lock, lmm is stored in lock's lvb.
4889 * lvb_data is immutable if the lock is held so it's safe to access it
4892 * set layout to file. Unlikely this will fail as old layout was
4893 * surely eliminated */
4894 memset(&conf, 0, sizeof conf);
4895 conf.coc_opc = OBJECT_CONF_SET;
4896 conf.coc_inode = inode;
4897 conf.coc_lock = lock;
4898 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4899 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4900 rc = ll_layout_conf(inode, &conf);
4902 /* refresh layout failed, need to wait */
4903 wait_layout = rc == -EBUSY;
4906 LDLM_LOCK_PUT(lock);
4907 ldlm_lock_decref(lockh, mode);
4909 /* wait for IO to complete if it's still being used. */
4911 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4912 ll_get_fsname(inode->i_sb, NULL, 0),
4913 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO drops the old layout */
4915 memset(&conf, 0, sizeof conf);
4916 conf.coc_opc = OBJECT_CONF_WAIT;
4917 conf.coc_inode = inode;
4918 rc = ll_layout_conf(inode, &conf);
4922 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4923 ll_get_fsname(inode->i_sb, NULL, 0),
4924 PFID(&lli->lli_fid), rc);
4930 * Issue layout intent RPC to MDS.
4931 * \param inode [in] file inode
4932 * \param intent [in] layout intent
4934 * \retval 0 on success
4935 * \retval < 0 error code
4937 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4939 struct ll_inode_info *lli = ll_i2info(inode);
4940 struct ll_sb_info *sbi = ll_i2sbi(inode);
4941 struct md_op_data *op_data;
4942 struct lookup_intent it;
4943 struct ptlrpc_request *req;
4947 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4948 0, 0, LUSTRE_OPC_ANY, NULL);
4949 if (IS_ERR(op_data))
4950 RETURN(PTR_ERR(op_data));
/* the layout intent itself rides in the op_data payload */
4952 op_data->op_data = intent;
4953 op_data->op_data_size = sizeof(*intent);
4955 memset(&it, 0, sizeof(it));
4956 it.it_op = IT_LAYOUT;
/* write/truncate intents need a write-mode layout lock */
4957 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4958 intent->li_opc == LAYOUT_INTENT_TRUNC)
4959 it.it_flags = FMODE_WRITE;
4961 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4962 ll_get_fsname(inode->i_sb, NULL, 0),
4963 PFID(&lli->lli_fid), inode);
4965 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4966 &ll_md_blocking_ast, 0);
4967 if (it.it_request != NULL)
4968 ptlrpc_req_finished(it.it_request);
4969 it.it_request = NULL;
4971 ll_finish_md_op_data(op_data);
4973 /* set lock data in case this is a new lock */
4975 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4977 ll_intent_drop_lock(&it);
4983 * This function checks if there exists a LAYOUT lock on the client side,
4984 * or enqueues it if it doesn't have one in cache.
4986 * This function will not hold layout lock so it may be revoked any time after
4987 * this function returns. Any operations depend on layout should be redone
4990 * This function should be called before lov_io_init() to get an uptodate
4991 * layout version, the caller should save the version number and after IO
4992 * is finished, this function should be called again to verify that layout
4993 * is not changed during IO time.
4995 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4997 struct ll_inode_info *lli = ll_i2info(inode);
4998 struct ll_sb_info *sbi = ll_i2sbi(inode);
4999 struct lustre_handle lockh;
5000 struct layout_intent intent = {
5001 .li_opc = LAYOUT_INTENT_ACCESS,
5003 enum ldlm_mode mode;
/* fast path: layout locking disabled, or a valid generation cached */
5007 *gen = ll_layout_version_get(lli);
5008 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5012 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5013 LASSERT(S_ISREG(inode->i_mode));
5015 /* take layout lock mutex to enqueue layout lock exclusively. */
5016 mutex_lock(&lli->lli_layout_mutex);
5019 /* mostly layout lock is caching on the local side, so try to
5020 * match it before grabbing layout lock mutex. */
5021 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5022 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5023 if (mode != 0) { /* hit cached lock */
5024 rc = ll_layout_lock_set(&lockh, mode, inode);
/* cache miss: ask the MDS for the layout via an intent RPC */
5030 rc = ll_layout_intent(inode, &intent);
5036 *gen = ll_layout_version_get(lli);
5037 mutex_unlock(&lli->lli_layout_mutex);
5043 * Issue layout intent RPC indicating where in a file an IO is about to write.
5045 * \param[in] inode file inode.
5046 * \param[in] ext write range with start offset of file in bytes where
5047 * an IO is about to write, and exclusive end offset in
5050 * \retval 0 on success
5051 * \retval < 0 error code
5053 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5054 struct lu_extent *ext)
5056 struct layout_intent intent = {
5058 .li_extent.e_start = ext->e_start,
5059 .li_extent.e_end = ext->e_end,
5064 rc = ll_layout_intent(inode, &intent);
5070 * This function send a restore request to the MDT
5072 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5074 struct hsm_user_request *hur;
5078 len = sizeof(struct hsm_user_request) +
5079 sizeof(struct hsm_user_item);
5080 OBD_ALLOC(hur, len);
5084 hur->hur_request.hr_action = HUA_RESTORE;
5085 hur->hur_request.hr_archive_id = 0;
5086 hur->hur_request.hr_flags = 0;
5087 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5088 sizeof(hur->hur_user_item[0].hui_fid));
5089 hur->hur_user_item[0].hui_extent.offset = offset;
5090 hur->hur_user_item[0].hui_extent.length = length;
5091 hur->hur_request.hr_itemcount = 1;
5092 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,