4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
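/* Allocate a per-open-file ll_file_data from its slab; GFP_NOFS avoids
 * recursing back into the filesystem under memory pressure. */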
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111		/* For HSM: if the inode data has been modified, pack it so that
112		 * the MDT can set the data-dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145	/* We leak the openhandle and request here on error, but not much can
146	 * be done in the OOM case since the app won't retry the close either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
191 op_data->op_bias |= MDS_HSM_RELEASE;
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
198 LASSERT(data == NULL);
202 rc = md_close(md_exp, op_data, och->och_mod, &req);
203 if (rc != 0 && rc != -EINTR)
204 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
205 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
207 if (rc == 0 && op_data->op_bias & bias) {
208 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
215 ll_finish_md_op_data(op_data);
219 md_clear_open_replay_data(md_exp, och);
220 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
223 ptlrpc_req_finished(req); /* This is close request */
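/* Close the MDS open handle that matches @fmode (read, write or exec),
 * but only if this was the last local user of that handle. */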
227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
229 struct ll_inode_info *lli = ll_i2info(inode);
230 struct obd_client_handle **och_p;
231 struct obd_client_handle *och;
236 if (fmode & FMODE_WRITE) {
237 och_p = &lli->lli_mds_write_och;
238 och_usecount = &lli->lli_open_fd_write_count;
239 } else if (fmode & FMODE_EXEC) {
240 och_p = &lli->lli_mds_exec_och;
241 och_usecount = &lli->lli_open_fd_exec_count;
243 LASSERT(fmode & FMODE_READ);
244 och_p = &lli->lli_mds_read_och;
245 och_usecount = &lli->lli_open_fd_read_count;
248 mutex_lock(&lli->lli_och_mutex);
249 if (*och_usecount > 0) {
250 /* There are still users of this handle, so skip
252 mutex_unlock(&lli->lli_och_mutex);
258 mutex_unlock(&lli->lli_och_mutex);
261 /* There might be a race and this handle may already
263 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
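/* Per-descriptor close: drop any group lock or lease held through this
 * file, decrement the open-handle use count for the descriptor's open
 * mode, and talk to the MDS only if no matching OPEN lock is cached. */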
269 static int ll_md_close(struct inode *inode, struct file *file)
271 union ldlm_policy_data policy = {
272 .l_inodebits = { MDS_INODELOCK_OPEN },
274 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
276 struct ll_inode_info *lli = ll_i2info(inode);
277 struct lustre_handle lockh;
278 enum ldlm_mode lockmode;
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
286 if (fd->fd_lease_och != NULL) {
289		/* Usually the lease is not released when the
290		 * application crashes, so we need to release it here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
304	/* Let's see if we have a good enough OPEN lock on the file and
305	 * can skip talking to the MDS. */
306 mutex_lock(&lli->lli_och_mutex);
307 if (fd->fd_omode & FMODE_WRITE) {
309 LASSERT(lli->lli_open_fd_write_count);
310 lli->lli_open_fd_write_count--;
311 } else if (fd->fd_omode & FMODE_EXEC) {
313 LASSERT(lli->lli_open_fd_exec_count);
314 lli->lli_open_fd_exec_count--;
317 LASSERT(lli->lli_open_fd_read_count);
318 lli->lli_open_fd_read_count--;
320 mutex_unlock(&lli->lli_och_mutex);
322 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
323 LDLM_IBITS, &policy, lockmode, &lockh))
324 rc = ll_md_real_close(inode, fd->fd_omode);
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
333 /* While this returns an error code, the caller (fput()) ignores it, so we
334  * need to make every effort to clean up all of our state here. Also,
335  * applications rarely check close errors, and even if an error is returned
336  * they will not retry the close call.
338 int ll_file_release(struct inode *inode, struct file *file)
340 struct ll_file_data *fd;
341 struct ll_sb_info *sbi = ll_i2sbi(inode);
342 struct ll_inode_info *lli = ll_i2info(inode);
346 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
347 PFID(ll_inode2fid(inode)), inode);
349 if (inode->i_sb->s_root != file_dentry(file))
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354	/* The last ref on @file, maybe not from the owner pid of statahead,
355	 * because parent and child processes can share the same file handle. */
356 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
357 ll_deauthorize_statahead(inode, fd);
359 if (inode->i_sb->s_root == file_dentry(file)) {
360 LUSTRE_FPRIVATE(file) = NULL;
361 ll_file_data_put(fd);
365 if (!S_ISDIR(inode->i_mode)) {
366 if (lli->lli_clob != NULL)
367 lov_read_and_clear_async_rc(lli->lli_clob);
368 lli->lli_async_rc = 0;
371 rc = ll_md_close(inode, file);
373 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
374 libcfs_debug_dumplog();
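/* Send an IT_OPEN intent to the MDS for @de, packing the name only when
 * the server does not support open-by-fid, and set up the inode and lock
 * data from the reply. */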
379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
380 struct lookup_intent *itp)
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394	/* If the server supports open-by-fid, or the file name is invalid, don't
395	 * pack the name in the open request. */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413	/* Reason for keeping our own exit path: don't flood the log
414	 * with -ESTALE error messages.
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
440		/* We did open by FID, but by the time we got to the server
441		 * the object had disappeared. If this is a create, we cannot really
442		 * tell userspace that the file it was trying to create
443		 * does not exist. Instead return -ESTALE, and the VFS will
444		 * retry the create with LOOKUP_REVAL, which we are going to catch
445		 * in ll_revalidate_dentry() and fall back to lookup.
447 if (rc == -ENOENT && itp->it_op & IT_CREAT)
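/* Fill an obd_client_handle from the OPEN reply in @it and register it
 * for open replay in case of MDS recovery. */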
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 struct obd_client_handle *och)
456 struct mdt_body *body;
458 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
459 och->och_fh = body->mbo_handle;
460 och->och_fid = body->mbo_fid1;
461 och->och_lease_handle.cookie = it->it_lock_handle;
462 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
463 och->och_flags = it->it_flags;
465 return md_set_open_replay_data(md_exp, och, it);
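/* Attach @fd to the struct file and record the effective open mode;
 * when @och is given it is filled from the intent's OPEN reply. */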
468 static int ll_local_open(struct file *file, struct lookup_intent *it,
469 struct ll_file_data *fd, struct obd_client_handle *och)
471 struct inode *inode = file_inode(file);
474 LASSERT(!LUSTRE_FPRIVATE(file));
481 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
486 LUSTRE_FPRIVATE(file) = fd;
487 ll_readahead_init(inode, &fd->fd_ras);
488 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
490 /* ll_cl_context initialize */
491 rwlock_init(&fd->fd_lock);
492 INIT_LIST_HEAD(&fd->fd_lccs);
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
522 PFID(ll_inode2fid(inode)), inode, file->f_flags);
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_nofiledata, rc = -ENOMEM);
532 if (S_ISDIR(inode->i_mode))
533 ll_authorize_statahead(inode, fd);
535 if (inode->i_sb->s_root == file_dentry(file)) {
536 LUSTRE_FPRIVATE(file) = fd;
540 if (!it || !it->it_disposition) {
541 /* Convert f_flags into access mode. We cannot use file->f_mode,
542 * because everything but O_ACCMODE mask was stripped from
544 if ((oit.it_flags + 1) & O_ACCMODE)
546 if (file->f_flags & O_TRUNC)
547 oit.it_flags |= FMODE_WRITE;
549		/* The kernel only calls f_op->open() in dentry_open(). filp_open()
550		 * calls dentry_open() after open_namei() has checked permissions.
551		 * Only nfsd_open() calls dentry_open() directly without checking
552		 * permissions, and because of that the code below is safe. */
553 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
554 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556 /* We do not want O_EXCL here, presumably we opened the file
557 * already? XXX - NFS implications? */
558 oit.it_flags &= ~O_EXCL;
560		/* bug 20584: if "it_flags" contains O_CREAT, the file will be
561		 * created if necessary, so "IT_CREAT" should be set to stay
562		 * consistent with it. */
563 if (oit.it_flags & O_CREAT)
564 oit.it_op |= IT_CREAT;
570 /* Let's see if we have file open on MDS already. */
571 if (it->it_flags & FMODE_WRITE) {
572 och_p = &lli->lli_mds_write_och;
573 och_usecount = &lli->lli_open_fd_write_count;
574 } else if (it->it_flags & FMODE_EXEC) {
575 och_p = &lli->lli_mds_exec_och;
576 och_usecount = &lli->lli_open_fd_exec_count;
578 och_p = &lli->lli_mds_read_och;
579 och_usecount = &lli->lli_open_fd_read_count;
582 mutex_lock(&lli->lli_och_mutex);
583 if (*och_p) { /* Open handle is present */
584 if (it_disposition(it, DISP_OPEN_OPEN)) {
585			/* Well, there's an extra open request that we do not need,
586			 * so close it somehow. This will decref the request. */
587 rc = it_open_error(DISP_OPEN_OPEN, it);
589 mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 ll_release_openhandle(file_dentry(file), it);
597 rc = ll_local_open(file, it, fd, NULL);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
604 LASSERT(*och_usecount == 0);
605 if (!it->it_disposition) {
606 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
607			/* We cannot just request the lock handle now: the new ELC code
608			 * means that one of the other OPEN locks for this file
609			 * could be cancelled, and since the blocking AST handler
610			 * would attempt to grab och_mutex as well, that would
611			 * result in a deadlock. */
612 mutex_unlock(&lli->lli_och_mutex);
614 * Normally called under two situations:
616 * 2. A race/condition on MDS resulting in no open
617 * handle to be returned from LOOKUP|OPEN request,
618 * for example if the target entry was a symlink.
620 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
621 * marked by a bit set in ll_iget_for_nfs. Clear the
622			 * bit so that it does not confuse later callers.
624			 * NB: when ldd is NULL, it must have come via the normal
625 * lookup path only, since ll_iget_for_nfs always calls
628 if (ldd && ldd->lld_nfs_dentry) {
629 ldd->lld_nfs_dentry = 0;
630 it->it_flags |= MDS_OPEN_LOCK;
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
641 GOTO(out_openerr, rc);
645 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
647 GOTO(out_och_free, rc = -ENOMEM);
651 /* md_intent_lock() didn't get a request ref if there was an
652 * open error, so don't do cleanup on the request here
654	/* XXX (green): Shouldn't we bail out on any error here, not
655	 * just an open error? */
656 rc = it_open_error(DISP_OPEN_OPEN, it);
658 GOTO(out_och_free, rc);
660 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
661 "inode %p: disposition %x, status %d\n", inode,
662 it_disposition(it, ~0), it->it_status);
664 rc = ll_local_open(file, it, fd, *och_p);
666 GOTO(out_och_free, rc);
668 mutex_unlock(&lli->lli_och_mutex);
671	/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
672	 * a different kind of OPEN lock for this same inode gets cancelled
673	 * by ldlm_cancel_lru. */
674 if (!S_ISREG(inode->i_mode))
675 GOTO(out_och_free, rc);
677 cl_lov_delay_create_clear(&file->f_flags);
678 GOTO(out_och_free, rc);
682 if (och_p && *och_p) {
683 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
684 *och_p = NULL; /* OBD_FREE writes some magic there */
687 mutex_unlock(&lli->lli_och_mutex);
690 if (lli->lli_opendir_key == fd)
691 ll_deauthorize_statahead(inode, fd);
693 ll_file_data_put(fd);
695 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
699 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
700 ptlrpc_req_finished(it->it_request);
701 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
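/* Blocking AST for lease DLM locks: cancel the lock asynchronously when it
 * is blocked; the actual lease cleanup happens when the lease is closed. */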
707 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
708 struct ldlm_lock_desc *desc, void *data, int flag)
711 struct lustre_handle lockh;
715 case LDLM_CB_BLOCKING:
716 ldlm_lock2handle(lock, &lockh);
717 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
719 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
723 case LDLM_CB_CANCELING:
731 * When setting a lease on a file, we take ownership of the lli_mds_*_och
732  * and save it as fd->fd_och so as to force the client to reopen the file
733  * even if it already has an open lock in cache.
735 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
736 struct lustre_handle *old_handle)
738 struct ll_inode_info *lli = ll_i2info(inode);
739 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
740 struct obd_client_handle **och_p;
745 /* Get the openhandle of the file */
746 mutex_lock(&lli->lli_och_mutex);
747 if (fd->fd_lease_och != NULL)
748 GOTO(out_unlock, rc = -EBUSY);
750 if (fd->fd_och == NULL) {
751 if (file->f_mode & FMODE_WRITE) {
752 LASSERT(lli->lli_mds_write_och != NULL);
753 och_p = &lli->lli_mds_write_och;
754 och_usecount = &lli->lli_open_fd_write_count;
756 LASSERT(lli->lli_mds_read_och != NULL);
757 och_p = &lli->lli_mds_read_och;
758 och_usecount = &lli->lli_open_fd_read_count;
761 if (*och_usecount > 1)
762 GOTO(out_unlock, rc = -EBUSY);
769 *old_handle = fd->fd_och->och_fh;
773 mutex_unlock(&lli->lli_och_mutex);
778 * Release ownership on lli_mds_*_och when putting back a file lease.
780 static int ll_lease_och_release(struct inode *inode, struct file *file)
782 struct ll_inode_info *lli = ll_i2info(inode);
783 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
784 struct obd_client_handle **och_p;
785 struct obd_client_handle *old_och = NULL;
790 mutex_lock(&lli->lli_och_mutex);
791 if (file->f_mode & FMODE_WRITE) {
792 och_p = &lli->lli_mds_write_och;
793 och_usecount = &lli->lli_open_fd_write_count;
795 och_p = &lli->lli_mds_read_och;
796 och_usecount = &lli->lli_open_fd_read_count;
799	/* The file may have been opened by another process (broken lease), so
800	 * *och_p is not NULL. In this case we should simply increase the usecount
803 if (*och_p != NULL) {
804 old_och = fd->fd_och;
811 mutex_unlock(&lli->lli_och_mutex);
814 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
820 * Acquire a lease and open the file.
822 static struct obd_client_handle *
823 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
826 struct lookup_intent it = { .it_op = IT_OPEN };
827 struct ll_sb_info *sbi = ll_i2sbi(inode);
828 struct md_op_data *op_data;
829 struct ptlrpc_request *req = NULL;
830 struct lustre_handle old_handle = { 0 };
831 struct obd_client_handle *och = NULL;
836 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
837 RETURN(ERR_PTR(-EINVAL));
840 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
841 RETURN(ERR_PTR(-EPERM));
843 rc = ll_lease_och_acquire(inode, file, &old_handle);
850 RETURN(ERR_PTR(-ENOMEM));
852 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
853 LUSTRE_OPC_ANY, NULL);
855 GOTO(out, rc = PTR_ERR(op_data));
857 /* To tell the MDT this openhandle is from the same owner */
858 op_data->op_handle = old_handle;
860 it.it_flags = fmode | open_flags;
861 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
862 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
863 &ll_md_blocking_lease_ast,
864			    /* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list,
865			     * otherwise it can be cancelled, which may mislead applications that the lease is
867			     * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
868			     * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
869			     * doesn't deal with the openhandle, a normal openhandle would be leaked. */
870 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
871 ll_finish_md_op_data(op_data);
872 ptlrpc_req_finished(req);
874 GOTO(out_release_it, rc);
876 if (it_disposition(&it, DISP_LOOKUP_NEG))
877 GOTO(out_release_it, rc = -ENOENT);
879 rc = it_open_error(DISP_OPEN_OPEN, &it);
881 GOTO(out_release_it, rc);
883 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
884 ll_och_fill(sbi->ll_md_exp, &it, och);
886 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
887 GOTO(out_close, rc = -EOPNOTSUPP);
889	/* lease already acquired, handle the lease lock */
890 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
891 if (it.it_lock_mode == 0 ||
892 it.it_lock_bits != MDS_INODELOCK_OPEN) {
893 /* open lock must return for lease */
894 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
895 PFID(ll_inode2fid(inode)), it.it_lock_mode,
897 GOTO(out_close, rc = -EPROTO);
900 ll_intent_release(&it);
904 /* Cancel open lock */
905 if (it.it_lock_mode != 0) {
906 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
909 och->och_lease_handle.cookie = 0ULL;
911 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
913 CERROR("%s: error closing file "DFID": %d\n",
914 ll_get_fsname(inode->i_sb, NULL, 0),
915 PFID(&ll_i2info(inode)->lli_fid), rc2);
916 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
918 ll_intent_release(&it);
926 * Check whether a layout swap can be done between two inodes.
928 * \param[in] inode1 First inode to check
929 * \param[in] inode2 Second inode to check
931 * \retval 0 on success, layout swap can be performed between both inodes
932 * \retval negative error code if requirements are not met
934 static int ll_check_swap_layouts_validity(struct inode *inode1,
935 struct inode *inode2)
937 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
940 if (inode_permission(inode1, MAY_WRITE) ||
941 inode_permission(inode2, MAY_WRITE))
944 if (inode1->i_sb != inode2->i_sb)
950 static int ll_swap_layouts_close(struct obd_client_handle *och,
951 struct inode *inode, struct inode *inode2)
953 const struct lu_fid *fid1 = ll_inode2fid(inode);
954 const struct lu_fid *fid2;
958 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
959 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
961 rc = ll_check_swap_layouts_validity(inode, inode2);
963 GOTO(out_free_och, rc);
965 /* We now know that inode2 is a lustre inode */
966 fid2 = ll_inode2fid(inode2);
968 rc = lu_fid_cmp(fid1, fid2);
970 GOTO(out_free_och, rc = -EINVAL);
972 /* Close the file and {swap,merge} layouts between inode & inode2.
973 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
974 * because we still need it to pack l_remote_handle to MDT. */
975 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
978 och = NULL; /* freed in ll_close_inode_openhandle() */
988 * Release lease and close the file.
989  * It will check whether the lease has ever been broken.
991 static int ll_lease_close_intent(struct obd_client_handle *och,
993 bool *lease_broken, enum mds_op_bias bias,
996 struct ldlm_lock *lock;
997 bool cancelled = true;
1001 lock = ldlm_handle2lock(&och->och_lease_handle);
1003 lock_res_and_lock(lock);
1004 cancelled = ldlm_is_cancel(lock);
1005 unlock_res_and_lock(lock);
1006 LDLM_LOCK_PUT(lock);
1009 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1010 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1012 if (lease_broken != NULL)
1013 *lease_broken = cancelled;
1015 if (!cancelled && !bias)
1016 ldlm_cli_cancel(&och->och_lease_handle, 0);
1018	if (cancelled) { /* no need to execute intent */
1023 rc = ll_close_inode_openhandle(inode, och, bias, data);
1027 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1030 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1034 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1036 static int ll_lease_file_resync(struct obd_client_handle *och,
1037 struct inode *inode)
1039 struct ll_sb_info *sbi = ll_i2sbi(inode);
1040 struct md_op_data *op_data;
1041 __u64 data_version_unused;
1045 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1046 LUSTRE_OPC_ANY, NULL);
1047 if (IS_ERR(op_data))
1048 RETURN(PTR_ERR(op_data));
1050	/* Before starting file resync, it's necessary to clean up the page cache
1051	 * in client memory, otherwise once the layout version is increased,
1052	 * writing back cached data will be denied by the OSTs. */
1053 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1057 op_data->op_handle = och->och_lease_handle;
1058 rc = md_file_resync(sbi->ll_md_exp, op_data);
1064 ll_finish_md_op_data(op_data);
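/* Merge the size, block count and timestamps cached by the MDS with the
 * attributes obtained from the OSTs through cl_object_attr_get(). */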
1068 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1070 struct ll_inode_info *lli = ll_i2info(inode);
1071 struct cl_object *obj = lli->lli_clob;
1072 struct cl_attr *attr = vvp_env_thread_attr(env);
1080 ll_inode_size_lock(inode);
1082 /* Merge timestamps the most recently obtained from MDS with
1083 * timestamps obtained from OSTs.
1085 * Do not overwrite atime of inode because it may be refreshed
1086 * by file_accessed() function. If the read was served by cache
1087 * data, there is no RPC to be sent so that atime may not be
1088 * transferred to OSTs at all. MDT only updates atime at close time
1089 * if it's at least 'mdd.*.atime_diff' older.
1090 * All in all, the atime in Lustre does not strictly comply with
1091	 * POSIX. Solving this problem would require sending an RPC to the MDT
1092	 * for each read, which would hurt performance. */
1093 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1094 LTIME_S(inode->i_atime) = lli->lli_atime;
1095 lli->lli_update_atime = 0;
1097 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1098 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1100 atime = LTIME_S(inode->i_atime);
1101 mtime = LTIME_S(inode->i_mtime);
1102 ctime = LTIME_S(inode->i_ctime);
1104 cl_object_attr_lock(obj);
1105 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1108 rc = cl_object_attr_get(env, obj, attr);
1109 cl_object_attr_unlock(obj);
1112 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1114 if (atime < attr->cat_atime)
1115 atime = attr->cat_atime;
1117 if (ctime < attr->cat_ctime)
1118 ctime = attr->cat_ctime;
1120 if (mtime < attr->cat_mtime)
1121 mtime = attr->cat_mtime;
1123 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1124 PFID(&lli->lli_fid), attr->cat_size);
1126 i_size_write(inode, attr->cat_size);
1127 inode->i_blocks = attr->cat_blocks;
1129 LTIME_S(inode->i_atime) = atime;
1130 LTIME_S(inode->i_mtime) = mtime;
1131 LTIME_S(inode->i_ctime) = ctime;
1134 ll_inode_size_unlock(inode);
1140 * Set designated mirror for I/O.
1142 * So far only read, write, and truncate can issue I/O to a
1143 * designated mirror.
1145 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1147 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1149	/* clear the layout version for generic (non-resync) I/O in case it
1150	 * carries a stale layout version due to an I/O restart */
1151 io->ci_layout_version = 0;
1153 /* FLR: disable non-delay for designated mirror I/O because obviously
1154 * only one mirror is available */
1155 if (fd->fd_designated_mirror > 0) {
1157 io->ci_designated_mirror = fd->fd_designated_mirror;
1158 io->ci_layout_version = fd->fd_layout_version;
1159 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1163	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1164 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1167 static bool file_is_noatime(const struct file *file)
1169 const struct vfsmount *mnt = file->f_path.mnt;
1170 const struct inode *inode = file_inode((struct file *)file);
1172 /* Adapted from file_accessed() and touch_atime().*/
1173 if (file->f_flags & O_NOATIME)
1176 if (inode->i_flags & S_NOATIME)
1179 if (IS_NOATIME(inode))
1182 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1185 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1188 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1194 static int ll_file_io_ptask(struct cfs_ptask *ptask);
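/* Initialize a cl_io for a read or write on @file: set up the embedded
 * kiocb, the O_APPEND/O_SYNC/O_NONBLOCK behaviour, the DLM locking policy
 * and noatime handling, and select the FLR mirror to use. */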
1196 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1198 struct inode *inode = file_inode(file);
1199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1201 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1202 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1203 io->u.ci_rw.rw_file = file;
1204 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1205 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1206 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1208 if (iot == CIT_WRITE) {
1209 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1210 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1211 file->f_flags & O_DIRECT ||
1214 io->ci_obj = ll_i2info(inode)->lli_clob;
1215 io->ci_lockreq = CILR_MAYBE;
1216 if (ll_file_nolock(file)) {
1217 io->ci_lockreq = CILR_NEVER;
1218 io->ci_no_srvlock = 1;
1219 } else if (file->f_flags & O_APPEND) {
1220 io->ci_lockreq = CILR_MANDATORY;
1222 io->ci_noatime = file_is_noatime(file);
1223 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1224 io->ci_pio = !io->u.ci_rw.rw_append;
1228 /* FLR: only use non-delay I/O for read as there is only one
1229	 * available mirror for write. */
1230 io->ci_ndelay = !(iot == CIT_WRITE);
1232 ll_io_set_mirror(io, file);
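/* Parallel-task worker for split I/O: run one chunk of a read or write in
 * its own cl_io and advance the shared iov_iter/kiocb by the bytes done. */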
1235 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1237 struct cl_io_pt *pt = ptask->pt_cbdata;
1238 struct file *file = pt->cip_file;
1241 loff_t pos = pt->cip_pos;
1246 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1247 file_dentry(file)->d_name.name,
1248 pt->cip_iot == CIT_READ ? "read" : "write",
1249 pos, pos + pt->cip_count);
1251 env = cl_env_get(&refcheck);
1253 RETURN(PTR_ERR(env));
1255 io = vvp_env_thread_io(env);
1256 ll_io_init(io, file, pt->cip_iot);
1257 io->u.ci_rw.rw_iter = pt->cip_iter;
1258 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1259 io->ci_pio = 0; /* It's already in parallel task */
1261 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1262 pt->cip_count - pt->cip_result);
1264 struct vvp_io *vio = vvp_env_io(env);
1266 vio->vui_io_subtype = IO_NORMAL;
1267 vio->vui_fd = LUSTRE_FPRIVATE(file);
1269 ll_cl_add(file, env, io, LCC_RW);
1270 rc = cl_io_loop(env, io);
1271 ll_cl_remove(file, env);
1273 /* cl_io_rw_init() handled IO */
1277 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1283 if (io->ci_nob > 0) {
1284 pt->cip_result += io->ci_nob;
1285 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1287 pt->cip_iocb.ki_pos = pos;
1288 #ifdef HAVE_KIOCB_KI_LEFT
1289 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1290 #elif defined(HAVE_KI_NBYTES)
1291 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1295 cl_io_fini(env, io);
1296 cl_env_put(env, &refcheck);
1298 pt->cip_need_restart = io->ci_need_restart;
1300 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1301 file_dentry(file)->d_name.name,
1302 pt->cip_iot == CIT_READ ? "read" : "write",
1303 pt->cip_result, rc);
1305 RETURN(pt->cip_result > 0 ? 0 : rc);
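/* Common entry point for reads and writes: build the cl_io, take the range
 * lock where needed (see LU-6227), run cl_io_loop(), and restart the whole
 * I/O if the layout changed underneath it (io->ci_need_restart). */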
1309 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1310 struct file *file, enum cl_io_type iot,
1311 loff_t *ppos, size_t count)
1313 struct range_lock range;
1314 struct vvp_io *vio = vvp_env_io(env);
1315 struct inode *inode = file_inode(file);
1316 struct ll_inode_info *lli = ll_i2info(inode);
1317 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1322 unsigned retried = 0;
1323 bool restarted = false;
1327 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1328 file_dentry(file)->d_name.name,
1329 iot == CIT_READ ? "read" : "write", pos, pos + count);
1332 io = vvp_env_thread_io(env);
1333 ll_io_init(io, file, iot);
1334 if (args->via_io_subtype == IO_NORMAL) {
1335 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1336 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1338 if (args->via_io_subtype != IO_NORMAL || restarted)
1340 io->ci_ndelay_tried = retried;
1342 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1343 bool range_locked = false;
1345 if (file->f_flags & O_APPEND)
1346 range_lock_init(&range, 0, LUSTRE_EOF);
1348 range_lock_init(&range, pos, pos + count - 1);
1350 vio->vui_fd = LUSTRE_FPRIVATE(file);
1351 vio->vui_io_subtype = args->via_io_subtype;
1353 switch (vio->vui_io_subtype) {
1355 /* Direct IO reads must also take range lock,
1356 * or multiple reads will try to work on the same pages
1357 * See LU-6227 for details. */
1358 if (((iot == CIT_WRITE) ||
1359 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1360 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1361 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1363 rc = range_lock(&lli->lli_write_tree, &range);
1367 range_locked = true;
1371 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1372 vio->u.splice.vui_flags = args->u.splice.via_flags;
1375 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1379 ll_cl_add(file, env, io, LCC_RW);
1380 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1381 !lli->lli_inode_locked) {
1383 lli->lli_inode_locked = 1;
1385 rc = cl_io_loop(env, io);
1386 if (lli->lli_inode_locked) {
1387 lli->lli_inode_locked = 0;
1388 inode_unlock(inode);
1390 ll_cl_remove(file, env);
1393 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1395 range_unlock(&lli->lli_write_tree, &range);
1398 /* cl_io_rw_init() handled IO */
1402 if (io->ci_nob > 0) {
1403 result += io->ci_nob;
1404 count -= io->ci_nob;
1406 if (args->via_io_subtype == IO_NORMAL) {
1407 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1409 /* CLIO is too complicated. See LU-11069. */
1410 if (cl_io_is_append(io))
1411 pos = io->u.ci_rw.rw_iocb.ki_pos;
1415 args->u.normal.via_iocb->ki_pos = pos;
1416 #ifdef HAVE_KIOCB_KI_LEFT
1417 args->u.normal.via_iocb->ki_left = count;
1418 #elif defined(HAVE_KI_NBYTES)
1419 args->u.normal.via_iocb->ki_nbytes = count;
1423 pos = io->u.ci_rw.rw_range.cir_pos;
1427 cl_io_fini(env, io);
1430 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1431 file->f_path.dentry->d_name.name,
1432 iot, rc, result, io->ci_need_restart);
1434 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1436 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1437 file_dentry(file)->d_name.name,
1438 iot == CIT_READ ? "read" : "write",
1439 pos, pos + count, result, rc);
1440 /* preserve the tried count for FLR */
1441 retried = io->ci_ndelay_tried;
1446 if (iot == CIT_READ) {
1448 ll_stats_ops_tally(ll_i2sbi(inode),
1449 LPROC_LL_READ_BYTES, result);
1450 } else if (iot == CIT_WRITE) {
1452 ll_stats_ops_tally(ll_i2sbi(inode),
1453 LPROC_LL_WRITE_BYTES, result);
1454 fd->fd_write_failed = false;
1455 } else if (result == 0 && rc == 0) {
1458 fd->fd_write_failed = true;
1460 fd->fd_write_failed = false;
1461 } else if (rc != -ERESTARTSYS) {
1462 fd->fd_write_failed = true;
1466 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1467 file_dentry(file)->d_name.name,
1468 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1472 RETURN(result > 0 ? result : rc);
1476 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1477 * especially for small I/O.
1479 * To serve a read request, CLIO has to create and initialize a cl_io and
1480 * then request a DLM lock. This has turned out to have significant overhead
1481 * and affects the performance of small I/O dramatically.
1483 * It's not necessary to create a cl_io for each I/O. Under the help of read
1484 * ahead, most of the pages being read are already in memory cache and we can
1485 * read those pages directly because if the pages exist, the corresponding DLM
1486 * lock must exist so that page content must be valid.
1488 * In the fast read implementation, llite speculatively finds and reads pages
1489 * in the memory cache. There are three scenarios for fast read:
1490 * - If the page exists and is uptodate, kernel VM will provide the data and
1491 * CLIO won't be intervened;
1492 * - If the page was brought into memory by read ahead, it will be exported
1493 * and read ahead parameters will be updated;
1494 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1495 * it will go back and invoke normal read, i.e., a cl_io will be created
1496 * and DLM lock will be requested.
1498 * POSIX compliance: posix standard states that read is intended to be atomic.
1499 * Lustre read implementation is in line with Linux kernel read implementation
1500 * and neither of them complies with POSIX standard in this matter. Fast read
1501 * doesn't make the situation worse on single node but it may interleave write
1502 * results from multiple nodes due to short read handling in ll_file_aio_read().
1504 * \param env - lu_env
1505 * \param iocb - kiocb from kernel
1506 * \param iter - user space buffers where the data will be copied
1508 * \retval - number of bytes read, or an error code if an error occurred.
1511 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1515 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1518 /* NB: we can't do direct IO for fast read because it will need a lock
1519 * to make IO engine happy. */
1520 if (iocb->ki_filp->f_flags & O_DIRECT)
1523 result = generic_file_read_iter(iocb, iter);
1525	/* If the first page is not in the cache, generic_file_aio_read()
1526	 * will return -ENODATA.
1527 * See corresponding code in ll_readpage(). */
1528 if (result == -ENODATA)
1532 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1533 LPROC_LL_READ_BYTES, result);
1539 * Read from a file (through the page cache).
1541 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1544 struct vvp_io_args *args;
1549 result = ll_do_fast_read(iocb, to);
1550 if (result < 0 || iov_iter_count(to) == 0)
1553 env = cl_env_get(&refcheck);
1555 return PTR_ERR(env);
1557 args = ll_env_args(env, IO_NORMAL);
1558 args->u.normal.via_iter = to;
1559 args->u.normal.via_iocb = iocb;
1561 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1562 &iocb->ki_pos, iov_iter_count(to));
1565 else if (result == 0)
1568 cl_env_put(env, &refcheck);
1574 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1575 * If a page is already in the page cache and dirty (and some other things -
1576 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1577 * write to it without doing a full I/O, because Lustre already knows about it
1578 * and will write it out. This saves a lot of processing time.
1580 * All writes here are within one page, so exclusion is handled by the page
1581 * lock on the vm page. We do not do tiny writes for writes which touch
1582 * multiple pages because it's very unlikely that multiple sequential pages
1583 * are already dirty.
1585 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1586 * and are unlikely to target already-dirty pages.
1588 * Attribute updates are important here, we do them in ll_tiny_write_end.
1590 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1592 ssize_t count = iov_iter_count(iter);
1593 struct file *file = iocb->ki_filp;
1594 struct inode *inode = file_inode(file);
1599	/* Restrict writes to a single page and < PAGE_SIZE. See the comment at
1600	 * the top of this function for why.
1602 if (count >= PAGE_SIZE ||
1603 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1606 result = __generic_file_write_iter(iocb, iter);
1608 /* If the page is not already dirty, ll_tiny_write_begin returns
1609 * -ENODATA. We continue on to normal write.
1611 if (result == -ENODATA)
1615 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1617 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1620 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1626 * Write to a file (through the page cache).
1628 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1630 struct vvp_io_args *args;
1632 ssize_t rc_tiny = 0, rc_normal;
1637 /* NB: we can't do direct IO for tiny writes because they use the page
1638 * cache, we can't do sync writes because tiny writes can't flush
1639 * pages, and we can't do append writes because we can't guarantee the
1640 * required DLM locks are held to protect file size.
1642 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1643 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1644 rc_tiny = ll_do_tiny_write(iocb, from);
1646	/* In case of error, go on and try the normal write; only stop if the
1647	 * tiny write completed the I/O.
1649 if (iov_iter_count(from) == 0)
1650 GOTO(out, rc_normal = rc_tiny);
1652 env = cl_env_get(&refcheck);
1654 return PTR_ERR(env);
1656 args = ll_env_args(env, IO_NORMAL);
1657 args->u.normal.via_iter = from;
1658 args->u.normal.via_iocb = iocb;
1660 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1661 &iocb->ki_pos, iov_iter_count(from));
1663 /* On success, combine bytes written. */
1664 if (rc_tiny >= 0 && rc_normal > 0)
1665 rc_normal += rc_tiny;
1666 /* On error, only return error from normal write if tiny write did not
1667 * write any bytes. Otherwise return bytes written by tiny write.
1669 else if (rc_tiny > 0)
1670 rc_normal = rc_tiny;
1672 cl_env_put(env, &refcheck);
1677 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1679 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1681 static int ll_file_get_iov_count(const struct iovec *iov,
1682 unsigned long *nr_segs, size_t *count)
1687 for (seg = 0; seg < *nr_segs; seg++) {
1688 const struct iovec *iv = &iov[seg];
1691 * If any segment has a negative length, or the cumulative
1692 * length ever wraps negative then return -EINVAL.
1695 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1697 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1702 cnt -= iv->iov_len; /* This segment is no good */
1709 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1710 unsigned long nr_segs, loff_t pos)
1717 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1721 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1722 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1723 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1724 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1725 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1727 result = ll_file_read_iter(iocb, &to);
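/* read(2) path for kernels without ->read_iter(): wrap the user buffer in
 * an iovec and a synchronous kiocb, then reuse ll_file_aio_read(). */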
1732 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1735 struct iovec iov = { .iov_base = buf, .iov_len = count };
1740 init_sync_kiocb(&kiocb, file);
1741 kiocb.ki_pos = *ppos;
1742 #ifdef HAVE_KIOCB_KI_LEFT
1743 kiocb.ki_left = count;
1744 #elif defined(HAVE_KI_NBYTES)
1745	kiocb.ki_nbytes = count;
1748 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1749 *ppos = kiocb.ki_pos;
1755 * Write to a file (through the page cache).
1758 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1759 unsigned long nr_segs, loff_t pos)
1761 struct iov_iter from;
1766 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1770 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1771 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1772 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1773 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1774 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1776 result = ll_file_write_iter(iocb, &from);
1781 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1782 size_t count, loff_t *ppos)
1784 struct iovec iov = { .iov_base = (void __user *)buf,
1791 init_sync_kiocb(&kiocb, file);
1792 kiocb.ki_pos = *ppos;
1793 #ifdef HAVE_KIOCB_KI_LEFT
1794 kiocb.ki_left = count;
1795 #elif defined(HAVE_KI_NBYTES)
1796 kiocb.ki_nbytes = count;
1799 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1800 *ppos = kiocb.ki_pos;
1804 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1807 * Send file content (through pagecache) somewhere with helper
1809 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1810 struct pipe_inode_info *pipe, size_t count,
1814 struct vvp_io_args *args;
1819 env = cl_env_get(&refcheck);
1821 RETURN(PTR_ERR(env));
1823 args = ll_env_args(env, IO_SPLICE);
1824 args->u.splice.via_pipe = pipe;
1825 args->u.splice.via_flags = flags;
1827 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1828 cl_env_put(env, &refcheck);
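/* Apply the layout in @lum to @inode by packing it into an open intent and
 * replaying the open against the MDS. */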
1832 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1833 __u64 flags, struct lov_user_md *lum, int lum_size)
1835 struct lookup_intent oit = {
1837 .it_flags = flags | MDS_OPEN_BY_FID,
1842 ll_inode_size_lock(inode);
1843 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1845 GOTO(out_unlock, rc);
1847 ll_release_openhandle(dentry, &oit);
1850 ll_inode_size_unlock(inode);
1851 ll_intent_release(&oit);
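/* Look up @filename under @inode and return its LOV EA (layout), swabbed
 * to host endianness; the caller must release *request when done. */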
1856 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1857 struct lov_mds_md **lmmp, int *lmm_size,
1858 struct ptlrpc_request **request)
1860 struct ll_sb_info *sbi = ll_i2sbi(inode);
1861 struct mdt_body *body;
1862 struct lov_mds_md *lmm = NULL;
1863 struct ptlrpc_request *req = NULL;
1864 struct md_op_data *op_data;
1867 rc = ll_get_default_mdsize(sbi, &lmmsize);
1871 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1872 strlen(filename), lmmsize,
1873 LUSTRE_OPC_ANY, NULL);
1874 if (IS_ERR(op_data))
1875 RETURN(PTR_ERR(op_data));
1877 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1878 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1879 ll_finish_md_op_data(op_data);
1881 CDEBUG(D_INFO, "md_getattr_name failed "
1882 "on %s: rc %d\n", filename, rc);
1886 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1887 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1889 lmmsize = body->mbo_eadatasize;
1891 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1893 GOTO(out, rc = -ENODATA);
1896 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1897 LASSERT(lmm != NULL);
1899 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1900 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1901 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1902 GOTO(out, rc = -EPROTO);
1905 * This is coming from the MDS, so is probably in
1906 * little endian. We convert it to host endian before
1907 * passing it to userspace.
1909 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1912 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1913 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1914 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1915 if (le32_to_cpu(lmm->lmm_pattern) &
1916 LOV_PATTERN_F_RELEASED)
1920			/* if the function is called for a directory, we should
1921			 * avoid swabbing non-existent LSM objects */
1922 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1923 lustre_swab_lov_user_md_v1(
1924 (struct lov_user_md_v1 *)lmm);
1925 if (S_ISREG(body->mbo_mode))
1926 lustre_swab_lov_user_md_objects(
1927 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1929 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1930 lustre_swab_lov_user_md_v3(
1931 (struct lov_user_md_v3 *)lmm);
1932 if (S_ISREG(body->mbo_mode))
1933 lustre_swab_lov_user_md_objects(
1934 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1936 } else if (lmm->lmm_magic ==
1937 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1938 lustre_swab_lov_comp_md_v1(
1939 (struct lov_comp_md_v1 *)lmm);
1945 *lmm_size = lmmsize;
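/* LL_IOC_LOV_SETEA: copy a lov_user_md with pre-created objects from
 * userspace and apply it; restricted to CAP_SYS_ADMIN. */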
1950 static int ll_lov_setea(struct inode *inode, struct file *file,
1953 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1954 struct lov_user_md *lump;
1955 int lum_size = sizeof(struct lov_user_md) +
1956 sizeof(struct lov_user_ost_data);
1960 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1963 OBD_ALLOC_LARGE(lump, lum_size);
1967 if (copy_from_user(lump, arg, lum_size))
1968 GOTO(out_lump, rc = -EFAULT);
1970 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1972 cl_lov_delay_create_clear(&file->f_flags);
1975 OBD_FREE_LARGE(lump, lum_size);
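/* Copy the file layout out to the userspace buffer @lum via
 * cl_object_getstripe(). */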
1979 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1986 env = cl_env_get(&refcheck);
1988 RETURN(PTR_ERR(env));
1990 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1991 cl_env_put(env, &refcheck);
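/* LL_IOC_LOV_SETSTRIPE: copy the requested layout from userspace, apply it,
 * then refresh the layout and return the instantiated one to the caller. */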
1995 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1998 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1999 struct lov_user_md *klum;
2001 __u64 flags = FMODE_WRITE;
2004 rc = ll_copy_user_md(lum, &klum);
2009 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2014 rc = put_user(0, &lum->lmm_stripe_count);
2018 rc = ll_layout_refresh(inode, &gen);
2022 rc = ll_file_getstripe(inode, arg, lum_size);
2024 cl_lov_delay_create_clear(&file->f_flags);
2027 OBD_FREE(klum, lum_size);
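/* Take a group lock with group id @arg on behalf of this descriptor; for
 * composite (PFL) layouts all components are instantiated first so the
 * lock covers every OST object. */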
2032 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2034 struct ll_inode_info *lli = ll_i2info(inode);
2035 struct cl_object *obj = lli->lli_clob;
2036 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2037 struct ll_grouplock grouplock;
2042 CWARN("group id for group lock must not be 0\n");
2046 if (ll_file_nolock(file))
2047 RETURN(-EOPNOTSUPP);
2049 spin_lock(&lli->lli_lock);
2050 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2051 CWARN("group lock already existed with gid %lu\n",
2052 fd->fd_grouplock.lg_gid);
2053 spin_unlock(&lli->lli_lock);
2056 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2057 spin_unlock(&lli->lli_lock);
2060	 * XXX: the group lock needs to protect all OST objects, while PFL
2061	 * can add new OST objects during the I/O, so we instantiate
2062	 * all OST objects before taking the group lock.
2067 struct cl_layout cl = {
2068 .cl_is_composite = false,
2070 struct lu_extent ext = {
2072 .e_end = OBD_OBJECT_EOF,
2075 env = cl_env_get(&refcheck);
2077 RETURN(PTR_ERR(env));
2079 rc = cl_object_layout_get(env, obj, &cl);
2080 if (!rc && cl.cl_is_composite)
2081 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2084 cl_env_put(env, &refcheck);
2089 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2090 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2094 spin_lock(&lli->lli_lock);
2095 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2096 spin_unlock(&lli->lli_lock);
2097 CERROR("another thread just won the race\n");
2098 cl_put_grouplock(&grouplock);
2102 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2103 fd->fd_grouplock = grouplock;
2104 spin_unlock(&lli->lli_lock);
2106 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2110 static int ll_put_grouplock(struct inode *inode, struct file *file,
2113 struct ll_inode_info *lli = ll_i2info(inode);
2114 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2115 struct ll_grouplock grouplock;
2118 spin_lock(&lli->lli_lock);
2119 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2120 spin_unlock(&lli->lli_lock);
2121 CWARN("no group lock held\n");
2125 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2127 if (fd->fd_grouplock.lg_gid != arg) {
2128 CWARN("group lock %lu doesn't match current id %lu\n",
2129 arg, fd->fd_grouplock.lg_gid);
2130 spin_unlock(&lli->lli_lock);
2134 grouplock = fd->fd_grouplock;
2135 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2136 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2137 spin_unlock(&lli->lli_lock);
2139 cl_put_grouplock(&grouplock);
2140 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2145 * Close inode open handle
2147 * \param dentry [in] dentry which contains the inode
2148 * \param it [in,out] intent which contains open info and result
2151 * \retval <0 failure
2153 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2155 struct inode *inode = dentry->d_inode;
2156 struct obd_client_handle *och;
2162 /* Root ? Do nothing. */
2163 if (dentry->d_inode->i_sb->s_root == dentry)
2166 /* No open handle to close? Move away */
2167 if (!it_disposition(it, DISP_OPEN_OPEN))
2170 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2172 OBD_ALLOC(och, sizeof(*och));
2174 GOTO(out, rc = -ENOMEM);
2176 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2178 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2180 /* this one is in place of ll_file_open */
2181 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2182 ptlrpc_req_finished(it->it_request);
2183 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2189 * Get size for inode for which FIEMAP mapping is requested.
2190 * Make the FIEMAP get_info call and return the result.
2191 * \param fiemap	kernel buffer to hold extents
2192 * \param num_bytes kernel buffer size
2194 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2200 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2203 /* Checks for fiemap flags */
2204 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2205 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2209 /* Check for FIEMAP_FLAG_SYNC */
2210 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2211 rc = filemap_fdatawrite(inode->i_mapping);
2216 env = cl_env_get(&refcheck);
2218 RETURN(PTR_ERR(env));
2220 if (i_size_read(inode) == 0) {
2221 rc = ll_glimpse_size(inode);
2226 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2227 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2228 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2230 /* If filesize is 0, then there would be no objects for mapping */
2231 if (fmkey.lfik_oa.o_size == 0) {
2232 fiemap->fm_mapped_extents = 0;
2236 fmkey.lfik_fiemap = *fiemap;
2238 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2239 &fmkey, fiemap, &num_bytes);
2241 cl_env_put(env, &refcheck);
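/* LL_IOC_FID2PATH: resolve a FID to a path by forwarding the request to
 * the MDC; allowed for non-privileged users only when LL_SBI_USER_FID2PATH
 * is set for this mount. */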
2245 int ll_fid2path(struct inode *inode, void __user *arg)
2247 struct obd_export *exp = ll_i2mdexp(inode);
2248 const struct getinfo_fid2path __user *gfin = arg;
2250 struct getinfo_fid2path *gfout;
2256 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2257 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2260 /* Only need to get the buflen */
2261 if (get_user(pathlen, &gfin->gf_pathlen))
2264 if (pathlen > PATH_MAX)
2267 outsize = sizeof(*gfout) + pathlen;
2268 OBD_ALLOC(gfout, outsize);
2272 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2273 GOTO(gf_free, rc = -EFAULT);
2274	/* Append the root FID after gfout to let the MDT know the root FID so
2275	 * that it can look up the correct path; this is mainly for filesets.
2276	 * Old servers without fileset mount support will ignore this. */
2277 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2279 /* Call mdc_iocontrol */
2280 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2284 if (copy_to_user(arg, gfout, outsize))
2288 OBD_FREE(gfout, outsize);
2293 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2295 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2303 ioc->idv_version = 0;
2304 ioc->idv_layout_version = UINT_MAX;
2306 /* If no file object is initialized, we consider its version to be 0. */
2310 env = cl_env_get(&refcheck);
2312 RETURN(PTR_ERR(env));
2314 io = vvp_env_thread_io(env);
2316 io->u.ci_data_version.dv_data_version = 0;
2317 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2318 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2321 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2322 result = cl_io_loop(env, io);
2324 result = io->ci_result;
2326 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2327 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2329 cl_io_fini(env, io);
2331 if (unlikely(io->ci_need_restart))
2334 cl_env_put(env, &refcheck);
2340 * Read the data_version for inode.
2342 * This value is computed using stripe object version on OST.
2343 * Version is computed using server side locking.
2345 * @param flags whether to sync on the OST side;
2347 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2348 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2350 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2352 struct ioc_data_version ioc = { .idv_flags = flags };
2355 rc = ll_ioc_data_version(inode, &ioc);
2357 *data_version = ioc.idv_version;
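/*
 * Illustrative userspace sketch (editor's addition): the data version
 * computed above is exposed through the LL_IOC_DATA_VERSION ioctl. Struct,
 * field and flag names are taken from this file; the surrounding open()
 * code and headers are assumed.
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) == 0)
 *		printf("data version: %llu\n",
 *		       (unsigned long long)idv.idv_version);
 */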
2363 * Trigger a HSM release request for the provided inode.
2365 int ll_hsm_release(struct inode *inode)
2368 struct obd_client_handle *och = NULL;
2369 __u64 data_version = 0;
2374 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2375 ll_get_fsname(inode->i_sb, NULL, 0),
2376 PFID(&ll_i2info(inode)->lli_fid));
2378 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2380 GOTO(out, rc = PTR_ERR(och));
2382 /* Grab latest data_version and [am]time values */
2383 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2387 env = cl_env_get(&refcheck);
2389 GOTO(out, rc = PTR_ERR(env));
2391 rc = ll_merge_attr(env, inode);
2392 cl_env_put(env, &refcheck);
2394 /* If an error happens, we have the wrong size for the file.
2400 /* Release the file.
2401 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2402 * we still need it to pack l_remote_handle to MDT. */
2403 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2409 if (och != NULL && !IS_ERR(och)) /* close the file */
2410 ll_lease_close(och, inode, NULL);
2415 struct ll_swap_stack {
2418 struct inode *inode1;
2419 struct inode *inode2;
2424 static int ll_swap_layouts(struct file *file1, struct file *file2,
2425 struct lustre_swap_layouts *lsl)
2427 struct mdc_swap_layouts msl;
2428 struct md_op_data *op_data;
2431 struct ll_swap_stack *llss = NULL;
2434 OBD_ALLOC_PTR(llss);
2438 llss->inode1 = file_inode(file1);
2439 llss->inode2 = file_inode(file2);
2441 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2445 /* we use 2 bools because they are easier to swap than 2 bits */
2446 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2447 llss->check_dv1 = true;
2449 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2450 llss->check_dv2 = true;
2452 /* we cannot use lsl->sl_dvX directly because we may swap them */
2453 llss->dv1 = lsl->sl_dv1;
2454 llss->dv2 = lsl->sl_dv2;
2456 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2457 if (rc == 0) /* same file, done! */
2460 if (rc < 0) { /* sequentialize it */
2461 swap(llss->inode1, llss->inode2);
2463 swap(llss->dv1, llss->dv2);
2464 swap(llss->check_dv1, llss->check_dv2);
2468 if (gid != 0) { /* application asks to flush dirty cache */
2469 rc = ll_get_grouplock(llss->inode1, file1, gid);
2473 rc = ll_get_grouplock(llss->inode2, file2, gid);
2475 ll_put_grouplock(llss->inode1, file1, gid);
2480 /* ultimate check: before swapping the layouts we check whether
2481 * the data version has changed (if requested) */
2482 if (llss->check_dv1) {
2483 rc = ll_data_version(llss->inode1, &dv, 0);
2486 if (dv != llss->dv1)
2487 GOTO(putgl, rc = -EAGAIN);
2490 if (llss->check_dv2) {
2491 rc = ll_data_version(llss->inode2, &dv, 0);
2494 if (dv != llss->dv2)
2495 GOTO(putgl, rc = -EAGAIN);
2498 /* struct md_op_data is used to send the swap args to the mdt;
2499 * only the flags field is missing, so we pass struct mdc_swap_layouts
2500 * through md_op_data->op_data */
2501 /* flags from user space have to be converted before they are sent to
2502 * the server; no flag is sent today, they are only used on the client */
2505 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2506 0, LUSTRE_OPC_ANY, &msl);
2507 if (IS_ERR(op_data))
2508 GOTO(free, rc = PTR_ERR(op_data));
2510 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2511 sizeof(*op_data), op_data, NULL);
2512 ll_finish_md_op_data(op_data);
2519 ll_put_grouplock(llss->inode2, file2, gid);
2520 ll_put_grouplock(llss->inode1, file1, gid);
2530 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2532 struct md_op_data *op_data;
2536 /* Detect out-of-range masks */
2537 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2540 /* Non-root users are forbidden to set or clear flags which are
2541 * NOT defined in HSM_USER_MASK. */
2542 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2543 !cfs_capable(CFS_CAP_SYS_ADMIN))
2546 /* Detect out-of-range archive id */
2547 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2548 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2551 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2552 LUSTRE_OPC_ANY, hss);
2553 if (IS_ERR(op_data))
2554 RETURN(PTR_ERR(op_data));
2556 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2557 sizeof(*op_data), op_data, NULL);
2559 ll_finish_md_op_data(op_data);
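/*
 * Illustrative userspace sketch (editor's addition): setting HSM flags from
 * an application goes through the LL_IOC_HSM_STATE_SET ioctl with the same
 * struct hsm_state_set validated above. HS_DIRTY is assumed to be one of the
 * user-settable HSM flags; headers and the open() of fd are assumed.
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid	= HSS_SETMASK,
 *		.hss_setmask	= HS_DIRTY,
 *	};
 *
 *	if (ioctl(fd, LL_IOC_HSM_STATE_SET, &hss) < 0)
 *		perror("LL_IOC_HSM_STATE_SET");
 */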
2564 static int ll_hsm_import(struct inode *inode, struct file *file,
2565 struct hsm_user_import *hui)
2567 struct hsm_state_set *hss = NULL;
2568 struct iattr *attr = NULL;
2572 if (!S_ISREG(inode->i_mode))
2578 GOTO(out, rc = -ENOMEM);
2580 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2581 hss->hss_archive_id = hui->hui_archive_id;
2582 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2583 rc = ll_hsm_state_set(inode, hss);
2587 OBD_ALLOC_PTR(attr);
2589 GOTO(out, rc = -ENOMEM);
2591 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2592 attr->ia_mode |= S_IFREG;
2593 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2594 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2595 attr->ia_size = hui->hui_size;
2596 attr->ia_mtime.tv_sec = hui->hui_mtime;
2597 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2598 attr->ia_atime.tv_sec = hui->hui_atime;
2599 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2601 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2602 ATTR_UID | ATTR_GID |
2603 ATTR_MTIME | ATTR_MTIME_SET |
2604 ATTR_ATIME | ATTR_ATIME_SET;
2608 rc = ll_setattr_raw(file_dentry(file), attr, true);
2612 inode_unlock(inode);
2624 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2626 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2627 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2630 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2632 struct inode *inode = file_inode(file);
2634 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2635 ATTR_MTIME | ATTR_MTIME_SET |
2636 ATTR_CTIME | ATTR_CTIME_SET,
2638 .tv_sec = lfu->lfu_atime_sec,
2639 .tv_nsec = lfu->lfu_atime_nsec,
2642 .tv_sec = lfu->lfu_mtime_sec,
2643 .tv_nsec = lfu->lfu_mtime_nsec,
2646 .tv_sec = lfu->lfu_ctime_sec,
2647 .tv_nsec = lfu->lfu_ctime_nsec,
2653 if (!capable(CAP_SYS_ADMIN))
2656 if (!S_ISREG(inode->i_mode))
2660 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2661 inode_unlock(inode);
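/*
 * Illustrative userspace sketch (editor's addition): the atime/mtime/ctime
 * triple handled above is set with the LL_IOC_FUTIMES_3 ioctl on a regular
 * file, and requires CAP_SYS_ADMIN as checked above. Struct and field names
 * are taken from this file; the timestamp values are arbitrary.
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = 1500000000, .lfu_atime_nsec = 0,
 *		.lfu_mtime_sec = 1500000000, .lfu_mtime_nsec = 0,
 *		.lfu_ctime_sec = 1500000000, .lfu_ctime_nsec = 0,
 *	};
 *
 *	if (ioctl(fd, LL_IOC_FUTIMES_3, &lfu) < 0)
 *		perror("LL_IOC_FUTIMES_3");
 */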
2666 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2669 case MODE_READ_USER:
2671 case MODE_WRITE_USER:
2678 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2680 /* Used to allow the upper layers of the client to request an LDLM lock
2681 * without doing an actual read or write.
2683 * Used for ladvise lockahead to manually request specific locks.
2685 * \param[in] file file this ladvise lock request is on
2686 * \param[in] ladvise ladvise struct describing this lock request
2688 * \retval 0 success, no detailed result available (sync requests
2689 * and requests sent to the server [not handled locally]
2690 * cannot return detailed results)
2691 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2692 * see definitions for details.
2693 * \retval negative negative errno on error
2695 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2697 struct lu_env *env = NULL;
2698 struct cl_io *io = NULL;
2699 struct cl_lock *lock = NULL;
2700 struct cl_lock_descr *descr = NULL;
2701 struct dentry *dentry = file->f_path.dentry;
2702 struct inode *inode = dentry->d_inode;
2703 enum cl_lock_mode cl_mode;
2704 off_t start = ladvise->lla_start;
2705 off_t end = ladvise->lla_end;
2711 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2712 "start=%llu, end=%llu\n", dentry->d_name.len,
2713 dentry->d_name.name, dentry->d_inode,
2714 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2717 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2719 GOTO(out, result = cl_mode);
2721 /* Get IO environment */
2722 result = cl_io_get(inode, &env, &io, &refcheck);
2726 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2729 * nothing to do for this io. This currently happens when
2730 * stripe sub-objects are not yet created.
2732 result = io->ci_result;
2733 } else if (result == 0) {
2734 lock = vvp_env_lock(env);
2735 descr = &lock->cll_descr;
2737 descr->cld_obj = io->ci_obj;
2738 /* Convert byte offsets to pages */
2739 descr->cld_start = cl_index(io->ci_obj, start);
2740 descr->cld_end = cl_index(io->ci_obj, end);
2741 descr->cld_mode = cl_mode;
2742 /* CEF_MUST is used because we do not want to convert a
2743 * lockahead request to a lockless lock */
2744 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2747 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2748 descr->cld_enq_flags |= CEF_SPECULATIVE;
2750 result = cl_lock_request(env, io, lock);
2752 /* On success, we need to release the lock */
2754 cl_lock_release(env, lock);
2756 cl_io_fini(env, io);
2757 cl_env_put(env, &refcheck);
2759 /* -ECANCELED indicates a matching lock with a different extent
2760 * was already present, and -EEXIST indicates a matching lock
2761 * on exactly the same extent was already present.
2762 * We convert them to positive values for userspace to make
2763 * recognizing true errors easier.
2764 * Note we can only return these detailed results on async requests,
2765 * as sync requests look the same as i/o requests for locking. */
2766 if (result == -ECANCELED)
2767 result = LLA_RESULT_DIFFERENT;
2768 else if (result == -EEXIST)
2769 result = LLA_RESULT_SAME;
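/*
 * Illustrative userspace sketch (editor's addition): a lockahead request for
 * the function above is submitted through LL_IOC_LADVISE with advice
 * LU_LADVISE_LOCKAHEAD. Struct, field and flag names are taken from this
 * file; the byte range, headers and allocation style are assumptions.
 *
 *	size_t len = offsetof(struct llapi_ladvise_hdr, lah_advise[1]);
 *	struct llapi_ladvise_hdr *hdr = calloc(1, len);
 *
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice	       = LU_LADVISE_LOCKAHEAD;
 *	hdr->lah_advise[0].lla_lockahead_mode  = MODE_WRITE_USER;
 *	hdr->lah_advise[0].lla_peradvice_flags = LF_ASYNC;
 *	hdr->lah_advise[0].lla_start = 0;
 *	hdr->lah_advise[0].lla_end   = 1 << 20;
 *	if (ioctl(fd, LL_IOC_LADVISE, hdr) == 0)
 *		printf("result: %d\n",
 *		       hdr->lah_advise[0].lla_lockahead_result);
 */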
2774 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2776 static int ll_ladvise_sanity(struct inode *inode,
2777 struct llapi_lu_ladvise *ladvise)
2779 enum lu_ladvise_type advice = ladvise->lla_advice;
2780 /* Note the per-advice flags field is 32 bits wide, so per-advice flags must
2781 * be in the first 32 bits of enum ladvise_flags */
2782 __u32 flags = ladvise->lla_peradvice_flags;
2783 /* 3 lines at 80 characters per line, should be plenty */
2786 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2788 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2789 "last supported advice is %s (value '%d'): rc = %d\n",
2790 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2791 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2795 /* Per-advice checks */
2797 case LU_LADVISE_LOCKNOEXPAND:
2798 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2800 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2802 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2803 ladvise_names[advice], rc);
2807 case LU_LADVISE_LOCKAHEAD:
2808 /* Currently only READ and WRITE modes can be requested */
2809 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2810 ladvise->lla_lockahead_mode == 0) {
2812 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2814 ll_get_fsname(inode->i_sb, NULL, 0),
2815 ladvise->lla_lockahead_mode,
2816 ladvise_names[advice], rc);
2819 case LU_LADVISE_WILLREAD:
2820 case LU_LADVISE_DONTNEED:
2822 /* Note fall through above - These checks apply to all advices
2823 * except LOCKNOEXPAND */
2824 if (flags & ~LF_DEFAULT_MASK) {
2826 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2828 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2829 ladvise_names[advice], rc);
2832 if (ladvise->lla_start >= ladvise->lla_end) {
2834 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2835 "for %s: rc = %d\n",
2836 ll_get_fsname(inode->i_sb, NULL, 0),
2837 ladvise->lla_start, ladvise->lla_end,
2838 ladvise_names[advice], rc);
2850 * Give file access advices
2852 * The ladvise interface is similar to the Linux fadvise() system call, except
2853 * that it forwards the advice directly from the Lustre client to the server.
2854 * The server-side code will apply the appropriate read-ahead and caching
2855 * techniques for the corresponding files.
2857 * A typical workload for ladvise is, for example, many different clients
2858 * doing small random reads of a file, so prefetching pages into the OSS cache
2859 * with big linear reads before the random IO is a net benefit. Fetching
2860 * all that data into each client cache with fadvise() may not be, due to
2861 * much more data being sent to the client.
2863 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2864 struct llapi_lu_ladvise *ladvise)
2868 struct cl_ladvise_io *lio;
2873 env = cl_env_get(&refcheck);
2875 RETURN(PTR_ERR(env));
2877 io = vvp_env_thread_io(env);
2878 io->ci_obj = ll_i2info(inode)->lli_clob;
2880 /* initialize parameters for ladvise */
2881 lio = &io->u.ci_ladvise;
2882 lio->li_start = ladvise->lla_start;
2883 lio->li_end = ladvise->lla_end;
2884 lio->li_fid = ll_inode2fid(inode);
2885 lio->li_advice = ladvise->lla_advice;
2886 lio->li_flags = flags;
2888 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2889 rc = cl_io_loop(env, io);
2893 cl_io_fini(env, io);
2894 cl_env_put(env, &refcheck);
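/*
 * Illustrative userspace sketch (editor's addition): asking the servers to
 * prefetch a range, as described above, uses the same LL_IOC_LADVISE ioctl
 * with LU_LADVISE_WILLREAD. Only names that appear in this file are used;
 * the range, allocation and headers are assumptions.
 *
 *	size_t len = offsetof(struct llapi_ladvise_hdr, lah_advise[1]);
 *	struct llapi_ladvise_hdr *hdr = calloc(1, len);
 *
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = 16 << 20;
 *	if (ioctl(fd, LL_IOC_LADVISE, hdr) < 0)
 *		perror("LL_IOC_LADVISE");
 */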
2898 static int ll_lock_noexpand(struct file *file, int flags)
2900 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2902 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2907 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2910 struct fsxattr fsxattr;
2912 if (copy_from_user(&fsxattr,
2913 (const struct fsxattr __user *)arg,
2917 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2918 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2919 if (copy_to_user((struct fsxattr __user *)arg,
2920 &fsxattr, sizeof(fsxattr)))
2926 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2930 struct md_op_data *op_data;
2931 struct ptlrpc_request *req = NULL;
2933 struct fsxattr fsxattr;
2934 struct cl_object *obj;
2936 /* only root can change the project ID */
2937 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2940 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2941 LUSTRE_OPC_ANY, NULL);
2942 if (IS_ERR(op_data))
2943 RETURN(PTR_ERR(op_data));
2945 if (copy_from_user(&fsxattr,
2946 (const struct fsxattr __user *)arg,
2948 GOTO(out_fsxattr1, rc = -EFAULT);
2950 op_data->op_attr_flags = fsxattr.fsx_xflags;
2951 op_data->op_projid = fsxattr.fsx_projid;
2952 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2953 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2955 ptlrpc_req_finished(req);
2957 obj = ll_i2info(inode)->lli_clob;
2961 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2962 OBD_ALLOC_PTR(attr);
2964 GOTO(out_fsxattr1, rc = -ENOMEM);
2965 attr->ia_valid = ATTR_ATTR_FLAG;
2966 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2971 ll_finish_md_op_data(op_data);
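/*
 * Illustrative userspace sketch (editor's addition): the project ID handled
 * by the two helpers above is read and written with the LL_IOC_FSGETXATTR /
 * LL_IOC_FSSETXATTR ioctls and the standard struct fsxattr. The new project
 * ID value and headers are assumptions; setting it requires admin privilege
 * as checked above.
 *
 *	struct fsxattr fsx;
 *
 *	if (ioctl(fd, LL_IOC_FSGETXATTR, &fsx) == 0) {
 *		fsx.fsx_projid = 1000;
 *		if (ioctl(fd, LL_IOC_FSSETXATTR, &fsx) < 0)
 *			perror("LL_IOC_FSSETXATTR");
 *	}
 */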
2975 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2978 struct inode *inode = file_inode(file);
2979 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2980 struct ll_inode_info *lli = ll_i2info(inode);
2981 struct obd_client_handle *och = NULL;
2982 struct split_param sp;
2985 enum mds_op_bias bias = 0;
2986 struct file *layout_file = NULL;
2988 size_t data_size = 0;
2992 mutex_lock(&lli->lli_och_mutex);
2993 if (fd->fd_lease_och != NULL) {
2994 och = fd->fd_lease_och;
2995 fd->fd_lease_och = NULL;
2997 mutex_unlock(&lli->lli_och_mutex);
3000 GOTO(out, rc = -ENOLCK);
3002 fmode = och->och_flags;
3004 switch (ioc->lil_flags) {
3005 case LL_LEASE_RESYNC_DONE:
3006 if (ioc->lil_count > IOC_IDS_MAX)
3007 GOTO(out, rc = -EINVAL);
3009 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3010 OBD_ALLOC(data, data_size);
3012 GOTO(out, rc = -ENOMEM);
3014 if (copy_from_user(data, (void __user *)arg, data_size))
3015 GOTO(out, rc = -EFAULT);
3017 bias = MDS_CLOSE_RESYNC_DONE;
3019 case LL_LEASE_LAYOUT_MERGE: {
3022 if (ioc->lil_count != 1)
3023 GOTO(out, rc = -EINVAL);
3025 arg += sizeof(*ioc);
3026 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3027 GOTO(out, rc = -EFAULT);
3029 layout_file = fget(fd);
3031 GOTO(out, rc = -EBADF);
3033 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3034 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3035 GOTO(out, rc = -EPERM);
3037 data = file_inode(layout_file);
3038 bias = MDS_CLOSE_LAYOUT_MERGE;
3041 case LL_LEASE_LAYOUT_SPLIT: {
3045 if (ioc->lil_count != 2)
3046 GOTO(out, rc = -EINVAL);
3048 arg += sizeof(*ioc);
3049 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3050 GOTO(out, rc = -EFAULT);
3052 arg += sizeof(__u32);
3053 if (copy_from_user(&mirror_id, (void __user *)arg,
3055 GOTO(out, rc = -EFAULT);
3057 layout_file = fget(fdv);
3059 GOTO(out, rc = -EBADF);
3061 sp.sp_inode = file_inode(layout_file);
3062 sp.sp_mirror_id = (__u16)mirror_id;
3064 bias = MDS_CLOSE_LAYOUT_SPLIT;
3068 /* without close intent */
3072 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3076 rc = ll_lease_och_release(inode, file);
3085 switch (ioc->lil_flags) {
3086 case LL_LEASE_RESYNC_DONE:
3088 OBD_FREE(data, data_size);
3090 case LL_LEASE_LAYOUT_MERGE:
3091 case LL_LEASE_LAYOUT_SPLIT:
3098 rc = ll_lease_type_from_fmode(fmode);
3102 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3105 struct inode *inode = file_inode(file);
3106 struct ll_inode_info *lli = ll_i2info(inode);
3107 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3108 struct obd_client_handle *och = NULL;
3109 __u64 open_flags = 0;
3115 switch (ioc->lil_mode) {
3116 case LL_LEASE_WRLCK:
3117 if (!(file->f_mode & FMODE_WRITE))
3119 fmode = FMODE_WRITE;
3121 case LL_LEASE_RDLCK:
3122 if (!(file->f_mode & FMODE_READ))
3126 case LL_LEASE_UNLCK:
3127 RETURN(ll_file_unlock_lease(file, ioc, arg));
3132 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3134 /* apply for lease */
3135 if (ioc->lil_flags & LL_LEASE_RESYNC)
3136 open_flags = MDS_OPEN_RESYNC;
3137 och = ll_lease_open(inode, file, fmode, open_flags);
3139 RETURN(PTR_ERR(och));
3141 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3142 rc = ll_lease_file_resync(och, inode);
3144 ll_lease_close(och, inode, NULL);
3147 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3149 ll_lease_close(och, inode, NULL);
3155 mutex_lock(&lli->lli_och_mutex);
3156 if (fd->fd_lease_och == NULL) {
3157 fd->fd_lease_och = och;
3160 mutex_unlock(&lli->lli_och_mutex);
3162 /* should not happen while only exclusive leases are supported */
3163 ll_lease_close(och, inode, &lease_broken);
3170 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3172 struct inode *inode = file_inode(file);
3173 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3177 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3178 PFID(ll_inode2fid(inode)), inode, cmd);
3179 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3181 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3182 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3186 case LL_IOC_GETFLAGS:
3187 /* Get the current value of the file flags */
3188 return put_user(fd->fd_flags, (int __user *)arg);
3189 case LL_IOC_SETFLAGS:
3190 case LL_IOC_CLRFLAGS:
3191 /* Set or clear specific file flags */
3192 /* XXX This probably needs checks to ensure the flags are
3193 * not abused, and to handle any flag side effects.
3195 if (get_user(flags, (int __user *) arg))
3198 if (cmd == LL_IOC_SETFLAGS) {
3199 if ((flags & LL_FILE_IGNORE_LOCK) &&
3200 !(file->f_flags & O_DIRECT)) {
3201 CERROR("%s: unable to disable locking on "
3202 "non-O_DIRECT file\n", current->comm);
3206 fd->fd_flags |= flags;
3208 fd->fd_flags &= ~flags;
3211 case LL_IOC_LOV_SETSTRIPE:
3212 case LL_IOC_LOV_SETSTRIPE_NEW:
3213 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3214 case LL_IOC_LOV_SETEA:
3215 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3216 case LL_IOC_LOV_SWAP_LAYOUTS: {
3218 struct lustre_swap_layouts lsl;
3220 if (copy_from_user(&lsl, (char __user *)arg,
3221 sizeof(struct lustre_swap_layouts)))
3224 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3227 file2 = fget(lsl.sl_fd);
3231 /* O_WRONLY or O_RDWR */
3232 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3233 GOTO(out, rc = -EPERM);
3235 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3236 struct inode *inode2;
3237 struct ll_inode_info *lli;
3238 struct obd_client_handle *och = NULL;
3240 lli = ll_i2info(inode);
3241 mutex_lock(&lli->lli_och_mutex);
3242 if (fd->fd_lease_och != NULL) {
3243 och = fd->fd_lease_och;
3244 fd->fd_lease_och = NULL;
3246 mutex_unlock(&lli->lli_och_mutex);
3248 GOTO(out, rc = -ENOLCK);
3249 inode2 = file_inode(file2);
3250 rc = ll_swap_layouts_close(och, inode, inode2);
3252 rc = ll_swap_layouts(file, file2, &lsl);
3258 case LL_IOC_LOV_GETSTRIPE:
3259 case LL_IOC_LOV_GETSTRIPE_NEW:
3260 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3261 case FS_IOC_GETFLAGS:
3262 case FS_IOC_SETFLAGS:
3263 RETURN(ll_iocontrol(inode, file, cmd, arg));
3264 case FSFILT_IOC_GETVERSION:
3265 case FS_IOC_GETVERSION:
3266 RETURN(put_user(inode->i_generation, (int __user *)arg));
3267 /* We need to special case any other ioctls we want to handle,
3268 * to send them to the MDS/OST as appropriate and to properly
3269 * network encode the arg field. */
3270 case FS_IOC_SETVERSION:
3273 case LL_IOC_GROUP_LOCK:
3274 RETURN(ll_get_grouplock(inode, file, arg));
3275 case LL_IOC_GROUP_UNLOCK:
3276 RETURN(ll_put_grouplock(inode, file, arg));
3277 case IOC_OBD_STATFS:
3278 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3280 case LL_IOC_FLUSHCTX:
3281 RETURN(ll_flush_ctx(inode));
3282 case LL_IOC_PATH2FID: {
3283 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3284 sizeof(struct lu_fid)))
3289 case LL_IOC_GETPARENT:
3290 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3292 case OBD_IOC_FID2PATH:
3293 RETURN(ll_fid2path(inode, (void __user *)arg));
3294 case LL_IOC_DATA_VERSION: {
3295 struct ioc_data_version idv;
3298 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3301 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3302 rc = ll_ioc_data_version(inode, &idv);
3305 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3311 case LL_IOC_GET_MDTIDX: {
3314 mdtidx = ll_get_mdt_idx(inode);
3318 if (put_user((int)mdtidx, (int __user *)arg))
3323 case OBD_IOC_GETDTNAME:
3324 case OBD_IOC_GETMDNAME:
3325 RETURN(ll_get_obd_name(inode, cmd, arg));
3326 case LL_IOC_HSM_STATE_GET: {
3327 struct md_op_data *op_data;
3328 struct hsm_user_state *hus;
3335 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3336 LUSTRE_OPC_ANY, hus);
3337 if (IS_ERR(op_data)) {
3339 RETURN(PTR_ERR(op_data));
3342 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3345 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3348 ll_finish_md_op_data(op_data);
3352 case LL_IOC_HSM_STATE_SET: {
3353 struct hsm_state_set *hss;
3360 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3365 rc = ll_hsm_state_set(inode, hss);
3370 case LL_IOC_HSM_ACTION: {
3371 struct md_op_data *op_data;
3372 struct hsm_current_action *hca;
3379 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3380 LUSTRE_OPC_ANY, hca);
3381 if (IS_ERR(op_data)) {
3383 RETURN(PTR_ERR(op_data));
3386 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3389 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3392 ll_finish_md_op_data(op_data);
3396 case LL_IOC_SET_LEASE_OLD: {
3397 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3399 RETURN(ll_file_set_lease(file, &ioc, 0));
3401 case LL_IOC_SET_LEASE: {
3402 struct ll_ioc_lease ioc;
3404 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3407 RETURN(ll_file_set_lease(file, &ioc, arg));
3409 case LL_IOC_GET_LEASE: {
3410 struct ll_inode_info *lli = ll_i2info(inode);
3411 struct ldlm_lock *lock = NULL;
3414 mutex_lock(&lli->lli_och_mutex);
3415 if (fd->fd_lease_och != NULL) {
3416 struct obd_client_handle *och = fd->fd_lease_och;
3418 lock = ldlm_handle2lock(&och->och_lease_handle);
3420 lock_res_and_lock(lock);
3421 if (!ldlm_is_cancel(lock))
3422 fmode = och->och_flags;
3424 unlock_res_and_lock(lock);
3425 LDLM_LOCK_PUT(lock);
3428 mutex_unlock(&lli->lli_och_mutex);
3430 RETURN(ll_lease_type_from_fmode(fmode));
3432 case LL_IOC_HSM_IMPORT: {
3433 struct hsm_user_import *hui;
3439 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3444 rc = ll_hsm_import(inode, file, hui);
3449 case LL_IOC_FUTIMES_3: {
3450 struct ll_futimes_3 lfu;
3452 if (copy_from_user(&lfu,
3453 (const struct ll_futimes_3 __user *)arg,
3457 RETURN(ll_file_futimes_3(file, &lfu));
3459 case LL_IOC_LADVISE: {
3460 struct llapi_ladvise_hdr *k_ladvise_hdr;
3461 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3464 int alloc_size = sizeof(*k_ladvise_hdr);
3467 u_ladvise_hdr = (void __user *)arg;
3468 OBD_ALLOC_PTR(k_ladvise_hdr);
3469 if (k_ladvise_hdr == NULL)
3472 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3473 GOTO(out_ladvise, rc = -EFAULT);
3475 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3476 k_ladvise_hdr->lah_count < 1)
3477 GOTO(out_ladvise, rc = -EINVAL);
3479 num_advise = k_ladvise_hdr->lah_count;
3480 if (num_advise >= LAH_COUNT_MAX)
3481 GOTO(out_ladvise, rc = -EFBIG);
3483 OBD_FREE_PTR(k_ladvise_hdr);
3484 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3485 lah_advise[num_advise]);
3486 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3487 if (k_ladvise_hdr == NULL)
3491 * TODO: submit multiple advices to one server in a single RPC
3493 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3494 GOTO(out_ladvise, rc = -EFAULT);
3496 for (i = 0; i < num_advise; i++) {
3497 struct llapi_lu_ladvise *k_ladvise =
3498 &k_ladvise_hdr->lah_advise[i];
3499 struct llapi_lu_ladvise __user *u_ladvise =
3500 &u_ladvise_hdr->lah_advise[i];
3502 rc = ll_ladvise_sanity(inode, k_ladvise);
3504 GOTO(out_ladvise, rc);
3506 switch (k_ladvise->lla_advice) {
3507 case LU_LADVISE_LOCKNOEXPAND:
3508 rc = ll_lock_noexpand(file,
3509 k_ladvise->lla_peradvice_flags);
3510 GOTO(out_ladvise, rc);
3511 case LU_LADVISE_LOCKAHEAD:
3513 rc = ll_file_lock_ahead(file, k_ladvise);
3516 GOTO(out_ladvise, rc);
3519 &u_ladvise->lla_lockahead_result))
3520 GOTO(out_ladvise, rc = -EFAULT);
3523 rc = ll_ladvise(inode, file,
3524 k_ladvise_hdr->lah_flags,
3527 GOTO(out_ladvise, rc);
3534 OBD_FREE(k_ladvise_hdr, alloc_size);
3537 case LL_IOC_FLR_SET_MIRROR: {
3538 /* mirror I/O must be direct to avoid polluting page cache
3540 if (!(file->f_flags & O_DIRECT))
3543 fd->fd_designated_mirror = (__u32)arg;
3546 case LL_IOC_FSGETXATTR:
3547 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3548 case LL_IOC_FSSETXATTR:
3549 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3551 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3553 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3554 (void __user *)arg));
3558 #ifndef HAVE_FILE_LLSEEK_SIZE
3559 static inline loff_t
3560 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3562 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3564 if (offset > maxsize)
3567 if (offset != file->f_pos) {
3568 file->f_pos = offset;
3569 file->f_version = 0;
3575 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3576 loff_t maxsize, loff_t eof)
3578 struct inode *inode = file_inode(file);
3586 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3587 * position-querying operation. Avoid rewriting the "same"
3588 * f_pos value back to the file because a concurrent read(),
3589 * write() or lseek() might have altered it
3594 * f_lock protects against read/modify/write race with other
3595 * SEEK_CURs. Note that parallel writes and reads behave
3599 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3600 inode_unlock(inode);
3604 * In the generic case the entire file is data, so as long as
3605 * offset isn't at the end of the file then the offset is data.
3612 * There is a virtual hole at the end of the file, so as long as
3613 * offset isn't i_size or larger, return i_size.
3621 return llseek_execute(file, offset, maxsize);
3625 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3627 struct inode *inode = file_inode(file);
3628 loff_t retval, eof = 0;
3631 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3632 (origin == SEEK_CUR) ? file->f_pos : 0);
3633 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3634 PFID(ll_inode2fid(inode)), inode, retval, retval,
3636 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3638 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3639 retval = ll_glimpse_size(inode);
3642 eof = i_size_read(inode);
3645 retval = ll_generic_file_llseek_size(file, offset, origin,
3646 ll_file_maxbytes(inode), eof);
3650 static int ll_flush(struct file *file, fl_owner_t id)
3652 struct inode *inode = file_inode(file);
3653 struct ll_inode_info *lli = ll_i2info(inode);
3654 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3657 LASSERT(!S_ISDIR(inode->i_mode));
3659 /* catch async errors that were recorded back when async writeback
3660 * failed for pages in this mapping. */
3661 rc = lli->lli_async_rc;
3662 lli->lli_async_rc = 0;
3663 if (lli->lli_clob != NULL) {
3664 err = lov_read_and_clear_async_rc(lli->lli_clob);
3669 /* The application has been told write failure already.
3670 * Do not report failure again. */
3671 if (fd->fd_write_failed)
3673 return rc ? -EIO : 0;
3677 * Called to make sure a portion of file has been written out.
3678 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3680 * Return how many pages have been written.
3682 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3683 enum cl_fsync_mode mode, int ignore_layout)
3687 struct cl_fsync_io *fio;
3692 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3693 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3696 env = cl_env_get(&refcheck);
3698 RETURN(PTR_ERR(env));
3700 io = vvp_env_thread_io(env);
3701 io->ci_obj = ll_i2info(inode)->lli_clob;
3702 io->ci_ignore_layout = ignore_layout;
3704 /* initialize parameters for sync */
3705 fio = &io->u.ci_fsync;
3706 fio->fi_start = start;
3708 fio->fi_fid = ll_inode2fid(inode);
3709 fio->fi_mode = mode;
3710 fio->fi_nr_written = 0;
3712 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3713 result = cl_io_loop(env, io);
3715 result = io->ci_result;
3717 result = fio->fi_nr_written;
3718 cl_io_fini(env, io);
3719 cl_env_put(env, &refcheck);
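/*
 * Illustrative kernel-side sketch (editor's addition): a caller that wants
 * every dirty page of an inode written out and committed on the OSTs could
 * use the helper above roughly as follows. Using LLONG_MAX to cover the
 * whole file is an assumption for the example.
 *
 *	int nr_written = cl_sync_file_range(inode, 0, LLONG_MAX,
 *					    CL_FSYNC_ALL, 0);
 *	if (nr_written < 0)
 *		CERROR("sync failed: rc = %d\n", nr_written);
 */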
3725 * When dentry is provided (the 'else' case), file_dentry() may be
3726 * null and dentry must be used directly rather than pulled from
3727 * file_dentry() as is done otherwise.
3730 #ifdef HAVE_FILE_FSYNC_4ARGS
3731 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3733 struct dentry *dentry = file_dentry(file);
3735 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3736 int ll_fsync(struct file *file, int datasync)
3738 struct dentry *dentry = file_dentry(file);
3740 loff_t end = LLONG_MAX;
3742 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3745 loff_t end = LLONG_MAX;
3747 struct inode *inode = dentry->d_inode;
3748 struct ll_inode_info *lli = ll_i2info(inode);
3749 struct ptlrpc_request *req;
3753 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3754 PFID(ll_inode2fid(inode)), inode);
3755 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3757 #ifdef HAVE_FILE_FSYNC_4ARGS
3758 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3759 lock_inode = !lli->lli_inode_locked;
3763 /* fsync's caller has already called _fdata{sync,write}, we want
3764 * that IO to finish before calling the osc and mdc sync methods */
3765 rc = filemap_fdatawait(inode->i_mapping);
3768 /* catch async errors that were recorded back when async writeback
3769 * failed for pages in this mapping. */
3770 if (!S_ISDIR(inode->i_mode)) {
3771 err = lli->lli_async_rc;
3772 lli->lli_async_rc = 0;
3775 if (lli->lli_clob != NULL) {
3776 err = lov_read_and_clear_async_rc(lli->lli_clob);
3782 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3786 ptlrpc_req_finished(req);
3788 if (S_ISREG(inode->i_mode)) {
3789 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3791 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3792 if (rc == 0 && err < 0)
3795 fd->fd_write_failed = true;
3797 fd->fd_write_failed = false;
3800 #ifdef HAVE_FILE_FSYNC_4ARGS
3802 inode_unlock(inode);
3808 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3810 struct inode *inode = file_inode(file);
3811 struct ll_sb_info *sbi = ll_i2sbi(inode);
3812 struct ldlm_enqueue_info einfo = {
3813 .ei_type = LDLM_FLOCK,
3814 .ei_cb_cp = ldlm_flock_completion_ast,
3815 .ei_cbdata = file_lock,
3817 struct md_op_data *op_data;
3818 struct lustre_handle lockh = { 0 };
3819 union ldlm_policy_data flock = { { 0 } };
3820 int fl_type = file_lock->fl_type;
3826 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3827 PFID(ll_inode2fid(inode)), file_lock);
3829 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3831 if (file_lock->fl_flags & FL_FLOCK) {
3832 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3833 /* flocks are whole-file locks */
3834 flock.l_flock.end = OFFSET_MAX;
3835 /* For flocks the owner is determined by the local file descriptor */
3836 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3837 } else if (file_lock->fl_flags & FL_POSIX) {
3838 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3839 flock.l_flock.start = file_lock->fl_start;
3840 flock.l_flock.end = file_lock->fl_end;
3844 flock.l_flock.pid = file_lock->fl_pid;
3846 /* Somewhat ugly workaround for svc lockd.
3847 * lockd installs custom fl_lmops->lm_compare_owner that checks
3848 * for the fl_owner to be the same (which it always is on local node
3849 * I guess between lockd processes) and then compares pid.
3850 * As such we assign pid to the owner field to make it all work,
3851 * conflict with normal locks is unlikely since pid space and
3852 * pointer space for current->files do not intersect */
3853 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3854 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3858 einfo.ei_mode = LCK_PR;
3861 /* An unlock request may or may not have any relation to
3862 * existing locks so we may not be able to pass a lock handle
3863 * via a normal ldlm_lock_cancel() request. The request may even
3864 * unlock a byte range in the middle of an existing lock. In
3865 * order to process an unlock request we need all of the same
3866 * information that is given with a normal read or write record
3867 * lock request. To avoid creating another ldlm unlock (cancel)
3868 * message we'll treat a LCK_NL flock request as an unlock. */
3869 einfo.ei_mode = LCK_NL;
3872 einfo.ei_mode = LCK_PW;
3875 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3890 flags = LDLM_FL_BLOCK_NOWAIT;
3896 flags = LDLM_FL_TEST_LOCK;
3899 CERROR("unknown fcntl lock command: %d\n", cmd);
3903 /* Save the old mode so that if the mode in the lock changes we
3904 * can decrement the appropriate reader or writer refcount. */
3905 file_lock->fl_type = einfo.ei_mode;
3907 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3908 LUSTRE_OPC_ANY, NULL);
3909 if (IS_ERR(op_data))
3910 RETURN(PTR_ERR(op_data));
3912 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3913 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3914 flock.l_flock.pid, flags, einfo.ei_mode,
3915 flock.l_flock.start, flock.l_flock.end);
3917 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3920 /* Restore the file lock type if not TEST lock. */
3921 if (!(flags & LDLM_FL_TEST_LOCK))
3922 file_lock->fl_type = fl_type;
3924 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3925 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3926 !(flags & LDLM_FL_TEST_LOCK))
3927 rc2 = locks_lock_file_wait(file, file_lock);
3929 if ((file_lock->fl_flags & FL_FLOCK) &&
3930 (rc == 0 || file_lock->fl_type == F_UNLCK))
3931 rc2 = flock_lock_file_wait(file, file_lock);
3932 if ((file_lock->fl_flags & FL_POSIX) &&
3933 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3934 !(flags & LDLM_FL_TEST_LOCK))
3935 rc2 = posix_lock_file_wait(file, file_lock);
3936 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3938 if (rc2 && file_lock->fl_type != F_UNLCK) {
3939 einfo.ei_mode = LCK_NL;
3940 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3945 ll_finish_md_op_data(op_data);
3950 int ll_get_fid_by_name(struct inode *parent, const char *name,
3951 int namelen, struct lu_fid *fid,
3952 struct inode **inode)
3954 struct md_op_data *op_data = NULL;
3955 struct mdt_body *body;
3956 struct ptlrpc_request *req;
3960 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3961 LUSTRE_OPC_ANY, NULL);
3962 if (IS_ERR(op_data))
3963 RETURN(PTR_ERR(op_data));
3965 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3966 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3967 ll_finish_md_op_data(op_data);
3971 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3973 GOTO(out_req, rc = -EFAULT);
3975 *fid = body->mbo_fid1;
3978 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3980 ptlrpc_req_finished(req);
3984 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3985 const char *name, int namelen)
3987 struct dentry *dchild = NULL;
3988 struct inode *child_inode = NULL;
3989 struct md_op_data *op_data;
3990 struct ptlrpc_request *request = NULL;
3991 struct obd_client_handle *och = NULL;
3993 struct mdt_body *body;
3995 __u64 data_version = 0;
3998 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3999 name, PFID(ll_inode2fid(parent)), mdtidx);
4001 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4002 0, LUSTRE_OPC_ANY, NULL);
4003 if (IS_ERR(op_data))
4004 RETURN(PTR_ERR(op_data));
4006 /* Get child FID first */
4007 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4010 dchild = d_lookup(file_dentry(file), &qstr);
4011 if (dchild != NULL) {
4012 if (dchild->d_inode != NULL)
4013 child_inode = igrab(dchild->d_inode);
4017 if (child_inode == NULL) {
4018 rc = ll_get_fid_by_name(parent, name, namelen,
4019 &op_data->op_fid3, &child_inode);
4024 if (child_inode == NULL)
4025 GOTO(out_free, rc = -EINVAL);
4028 * lfs migrate command needs to be blocked on the client
4029 * by checking the migrate FID against the FID of the
4032 if (child_inode == parent->i_sb->s_root->d_inode)
4033 GOTO(out_iput, rc = -EINVAL);
4035 inode_lock(child_inode);
4036 op_data->op_fid3 = *ll_inode2fid(child_inode);
4037 if (!fid_is_sane(&op_data->op_fid3)) {
4038 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4039 ll_get_fsname(parent->i_sb, NULL, 0), name,
4040 PFID(&op_data->op_fid3));
4041 GOTO(out_unlock, rc = -EINVAL);
4044 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4046 GOTO(out_unlock, rc);
4049 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4050 PFID(&op_data->op_fid3), mdtidx);
4051 GOTO(out_unlock, rc = 0);
4054 if (S_ISREG(child_inode->i_mode)) {
4055 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4059 GOTO(out_unlock, rc);
4062 rc = ll_data_version(child_inode, &data_version,
4065 GOTO(out_close, rc);
4067 op_data->op_handle = och->och_fh;
4068 op_data->op_data = och->och_mod;
4069 op_data->op_data_version = data_version;
4070 op_data->op_lease_handle = och->och_lease_handle;
4071 op_data->op_bias |= MDS_RENAME_MIGRATE;
4074 op_data->op_mds = mdtidx;
4075 op_data->op_cli_flags = CLI_MIGRATE;
4076 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4077 namelen, name, namelen, &request);
4079 LASSERT(request != NULL);
4080 ll_update_times(request, parent);
4082 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4083 LASSERT(body != NULL);
4085 /* If the server does release layout lock, then we cleanup
4086 * the client och here, otherwise release it in out_close: */
4088 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4089 obd_mod_put(och->och_mod);
4090 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4092 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4098 if (request != NULL) {
4099 ptlrpc_req_finished(request);
4103 /* Try again if the file layout has changed. */
4104 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4108 if (och != NULL) /* close the file */
4109 ll_lease_close(och, child_inode, NULL);
4111 clear_nlink(child_inode);
4113 inode_unlock(child_inode);
4117 ll_finish_md_op_data(op_data);
4122 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4130 * test if some locks matching bits and l_req_mode are acquired
4131 * - bits can be in different locks
4132 * - if found clear the common lock bits in *bits
4133 * - the bits not found, are kept in *bits
4135 * \param bits [IN] searched lock bits
4136 * \param l_req_mode [IN] searched lock mode
4137 * \retval boolean, true iff all bits are found
4139 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4141 struct lustre_handle lockh;
4142 union ldlm_policy_data policy;
4143 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4144 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4153 fid = &ll_i2info(inode)->lli_fid;
4154 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4155 ldlm_lockname[mode]);
4157 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4158 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4159 policy.l_inodebits.bits = *bits & (1 << i);
4160 if (policy.l_inodebits.bits == 0)
4163 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4164 &policy, mode, &lockh)) {
4165 struct ldlm_lock *lock;
4167 lock = ldlm_handle2lock(&lockh);
4170 ~(lock->l_policy_data.l_inodebits.bits);
4171 LDLM_LOCK_PUT(lock);
4173 *bits &= ~policy.l_inodebits.bits;
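/*
 * Illustrative kernel-side sketch (editor's addition): a typical use of the
 * helper above is to check whether cached metadata protected by specific
 * inodebits is still covered by granted locks. MDS_INODELOCK_LOOKUP and
 * MDS_INODELOCK_UPDATE are the usual inodelock bits and are assumptions
 * here, as they are not referenced elsewhere in this section. On return,
 * 'bits' holds only the bits that were NOT found.
 *
 *	__u64 bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
 *	int covered = ll_have_md_lock(inode, &bits, LCK_MINMODE);
 */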
4180 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4181 struct lustre_handle *lockh, __u64 flags,
4182 enum ldlm_mode mode)
4184 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4189 fid = &ll_i2info(inode)->lli_fid;
4190 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4192 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4193 fid, LDLM_IBITS, &policy, mode, lockh);
4198 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4200 /* Already unlinked. Just update nlink and return success */
4201 if (rc == -ENOENT) {
4203 /* If it is a striped directory and there is a bad stripe,
4204 * let's revalidate the dentry again instead of returning
4206 if (S_ISDIR(inode->i_mode) &&
4207 ll_i2info(inode)->lli_lsm_md != NULL)
4210 /* This path cannot be hit for regular files unless in
4211 * case of obscure races, so no need to validate
4213 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4215 } else if (rc != 0) {
4216 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4217 "%s: revalidate FID "DFID" error: rc = %d\n",
4218 ll_get_fsname(inode->i_sb, NULL, 0),
4219 PFID(ll_inode2fid(inode)), rc);
4225 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4227 struct inode *inode = dentry->d_inode;
4228 struct obd_export *exp = ll_i2mdexp(inode);
4229 struct lookup_intent oit = {
4232 struct ptlrpc_request *req = NULL;
4233 struct md_op_data *op_data;
4237 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4238 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4240 /* Call getattr by fid, so do not provide name at all. */
4241 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4242 LUSTRE_OPC_ANY, NULL);
4243 if (IS_ERR(op_data))
4244 RETURN(PTR_ERR(op_data));
4246 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4247 ll_finish_md_op_data(op_data);
4249 rc = ll_inode_revalidate_fini(inode, rc);
4253 rc = ll_revalidate_it_finish(req, &oit, dentry);
4255 ll_intent_release(&oit);
4259 /* Unlinked? Unhash dentry, so it is not picked up later by
4260 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4261 * here to preserve get_cwd functionality on 2.6.
4263 if (!dentry->d_inode->i_nlink) {
4264 ll_lock_dcache(inode);
4265 d_lustre_invalidate(dentry, 0);
4266 ll_unlock_dcache(inode);
4269 ll_lookup_finish_locks(&oit, dentry);
4271 ptlrpc_req_finished(req);
4276 static int ll_merge_md_attr(struct inode *inode)
4278 struct cl_attr attr = { 0 };
4281 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4282 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4283 &attr, ll_md_blocking_ast);
4287 set_nlink(inode, attr.cat_nlink);
4288 inode->i_blocks = attr.cat_blocks;
4289 i_size_write(inode, attr.cat_size);
4291 ll_i2info(inode)->lli_atime = attr.cat_atime;
4292 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4293 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4298 static inline dev_t ll_compat_encode_dev(dev_t dev)
4300 /* The compat_sys_*stat*() syscalls will fail unless the
4301 * device majors and minors are both less than 256. Note that
4302 * the value returned here will be passed through
4303 * old_encode_dev() in cp_compat_stat(). And so we are not
4304 * trying to return a valid compat (u16) device number, just
4305 * one that will pass the old_valid_dev() check. */
4307 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4310 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4311 int ll_getattr(const struct path *path, struct kstat *stat,
4312 u32 request_mask, unsigned int flags)
4314 struct dentry *de = path->dentry;
4316 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4319 struct inode *inode = de->d_inode;
4320 struct ll_sb_info *sbi = ll_i2sbi(inode);
4321 struct ll_inode_info *lli = ll_i2info(inode);
4324 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4326 rc = ll_inode_revalidate(de, IT_GETATTR);
4330 if (S_ISREG(inode->i_mode)) {
4331 /* In case of restore, the MDT has the right size and has
4332 * already sent it back without granting the layout lock,
4333 * inode is up-to-date so glimpse is useless.
4334 * Also to glimpse we need the layout, in case of a running
4335 * restore the MDT holds the layout lock so the glimpse will
4336 * block up to the end of restore (getattr will block)
4338 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4339 rc = ll_glimpse_size(inode);
4344 /* If the object isn't a regular file then don't validate its size. */
4345 if (S_ISDIR(inode->i_mode) &&
4346 lli->lli_lsm_md != NULL) {
4347 rc = ll_merge_md_attr(inode);
4352 LTIME_S(inode->i_atime) = lli->lli_atime;
4353 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4354 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4357 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4359 if (ll_need_32bit_api(sbi)) {
4360 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4361 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4362 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4364 stat->ino = inode->i_ino;
4365 stat->dev = inode->i_sb->s_dev;
4366 stat->rdev = inode->i_rdev;
4369 stat->mode = inode->i_mode;
4370 stat->uid = inode->i_uid;
4371 stat->gid = inode->i_gid;
4372 stat->atime = inode->i_atime;
4373 stat->mtime = inode->i_mtime;
4374 stat->ctime = inode->i_ctime;
4375 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4377 stat->nlink = inode->i_nlink;
4378 stat->size = i_size_read(inode);
4379 stat->blocks = inode->i_blocks;
4384 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4385 __u64 start, __u64 len)
4389 struct fiemap *fiemap;
4390 unsigned int extent_count = fieinfo->fi_extents_max;
4392 num_bytes = sizeof(*fiemap) + (extent_count *
4393 sizeof(struct fiemap_extent));
4394 OBD_ALLOC_LARGE(fiemap, num_bytes);
4399 fiemap->fm_flags = fieinfo->fi_flags;
4400 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4401 fiemap->fm_start = start;
4402 fiemap->fm_length = len;
4403 if (extent_count > 0 &&
4404 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4405 sizeof(struct fiemap_extent)) != 0)
4406 GOTO(out, rc = -EFAULT);
4408 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4410 fieinfo->fi_flags = fiemap->fm_flags;
4411 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4412 if (extent_count > 0 &&
4413 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4414 fiemap->fm_mapped_extents *
4415 sizeof(struct fiemap_extent)) != 0)
4416 GOTO(out, rc = -EFAULT);
4418 OBD_FREE_LARGE(fiemap, num_bytes);
4422 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4424 struct ll_inode_info *lli = ll_i2info(inode);
4425 struct posix_acl *acl = NULL;
4428 spin_lock(&lli->lli_lock);
4429 /* VFS' acl_permission_check->check_acl will release the refcount */
4430 acl = posix_acl_dup(lli->lli_posix_acl);
4431 spin_unlock(&lli->lli_lock);
4436 #ifdef HAVE_IOP_SET_ACL
4437 #ifdef CONFIG_FS_POSIX_ACL
4438 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4440 struct ll_sb_info *sbi = ll_i2sbi(inode);
4441 struct ptlrpc_request *req = NULL;
4442 const char *name = NULL;
4444 size_t value_size = 0;
4449 case ACL_TYPE_ACCESS:
4450 name = XATTR_NAME_POSIX_ACL_ACCESS;
4452 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4455 case ACL_TYPE_DEFAULT:
4456 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4457 if (!S_ISDIR(inode->i_mode))
4458 rc = acl ? -EACCES : 0;
4469 value_size = posix_acl_xattr_size(acl->a_count);
4470 value = kmalloc(value_size, GFP_NOFS);
4472 GOTO(out, rc = -ENOMEM);
4474 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4476 GOTO(out_value, rc);
4479 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4480 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4481 name, value, value_size, 0, 0, &req);
4483 ptlrpc_req_finished(req);
4488 forget_cached_acl(inode, type);
4490 set_cached_acl(inode, type, acl);
4493 #endif /* CONFIG_FS_POSIX_ACL */
4494 #endif /* HAVE_IOP_SET_ACL */
4496 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4498 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4499 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4501 ll_check_acl(struct inode *inode, int mask)
4504 # ifdef CONFIG_FS_POSIX_ACL
4505 struct posix_acl *acl;
4509 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4510 if (flags & IPERM_FLAG_RCU)
4513 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4518 rc = posix_acl_permission(inode, acl, mask);
4519 posix_acl_release(acl);
4522 # else /* !CONFIG_FS_POSIX_ACL */
4524 # endif /* CONFIG_FS_POSIX_ACL */
4526 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4528 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4529 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4531 # ifdef HAVE_INODE_PERMISION_2ARGS
4532 int ll_inode_permission(struct inode *inode, int mask)
4534 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4539 struct ll_sb_info *sbi;
4540 struct root_squash_info *squash;
4541 struct cred *cred = NULL;
4542 const struct cred *old_cred = NULL;
4544 bool squash_id = false;
4547 #ifdef MAY_NOT_BLOCK
4548 if (mask & MAY_NOT_BLOCK)
4550 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4551 if (flags & IPERM_FLAG_RCU)
4555 /* as the root inode is NOT validated in the lookup operation,
4556 * we need to do it before the permission check. */
4558 if (inode == inode->i_sb->s_root->d_inode) {
4559 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4564 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4565 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4567 /* squash fsuid/fsgid if needed */
4568 sbi = ll_i2sbi(inode);
4569 squash = &sbi->ll_squash;
4570 if (unlikely(squash->rsi_uid != 0 &&
4571 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4572 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4576 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4577 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4578 squash->rsi_uid, squash->rsi_gid);
4580 /* update current process's credentials
4581 * and FS capability */
4582 cred = prepare_creds();
4586 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4587 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4588 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4589 if ((1 << cap) & CFS_CAP_FS_MASK)
4590 cap_lower(cred->cap_effective, cap);
4592 old_cred = override_creds(cred);
4595 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4596 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4597 /* restore current process's credentials and FS capability */
4599 revert_creds(old_cred);
4606 /* -o localflock - only provides locally consistent flock locks */
4607 struct file_operations ll_file_operations = {
4608 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4609 # ifdef HAVE_SYNC_READ_WRITE
4610 .read = new_sync_read,
4611 .write = new_sync_write,
4613 .read_iter = ll_file_read_iter,
4614 .write_iter = ll_file_write_iter,
4615 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4616 .read = ll_file_read,
4617 .aio_read = ll_file_aio_read,
4618 .write = ll_file_write,
4619 .aio_write = ll_file_aio_write,
4620 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4621 .unlocked_ioctl = ll_file_ioctl,
4622 .open = ll_file_open,
4623 .release = ll_file_release,
4624 .mmap = ll_file_mmap,
4625 .llseek = ll_file_seek,
4626 .splice_read = ll_file_splice_read,
4631 struct file_operations ll_file_operations_flock = {
4632 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4633 # ifdef HAVE_SYNC_READ_WRITE
4634 .read = new_sync_read,
4635 .write = new_sync_write,
4636 # endif /* HAVE_SYNC_READ_WRITE */
4637 .read_iter = ll_file_read_iter,
4638 .write_iter = ll_file_write_iter,
4639 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4640 .read = ll_file_read,
4641 .aio_read = ll_file_aio_read,
4642 .write = ll_file_write,
4643 .aio_write = ll_file_aio_write,
4644 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4645 .unlocked_ioctl = ll_file_ioctl,
4646 .open = ll_file_open,
4647 .release = ll_file_release,
4648 .mmap = ll_file_mmap,
4649 .llseek = ll_file_seek,
4650 .splice_read = ll_file_splice_read,
4653 .flock = ll_file_flock,
4654 .lock = ll_file_flock
4657 /* These are for -o noflock - to return ENOSYS on flock calls */
4658 struct file_operations ll_file_operations_noflock = {
4659 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4660 # ifdef HAVE_SYNC_READ_WRITE
4661 .read = new_sync_read,
4662 .write = new_sync_write,
4663 # endif /* HAVE_SYNC_READ_WRITE */
4664 .read_iter = ll_file_read_iter,
4665 .write_iter = ll_file_write_iter,
4666 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4667 .read = ll_file_read,
4668 .aio_read = ll_file_aio_read,
4669 .write = ll_file_write,
4670 .aio_write = ll_file_aio_write,
4671 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4672 .unlocked_ioctl = ll_file_ioctl,
4673 .open = ll_file_open,
4674 .release = ll_file_release,
4675 .mmap = ll_file_mmap,
4676 .llseek = ll_file_seek,
4677 .splice_read = ll_file_splice_read,
4680 .flock = ll_file_noflock,
4681 .lock = ll_file_noflock
4684 struct inode_operations ll_file_inode_operations = {
4685 .setattr = ll_setattr,
4686 .getattr = ll_getattr,
4687 .permission = ll_inode_permission,
4688 #ifdef HAVE_IOP_XATTR
4689 .setxattr = ll_setxattr,
4690 .getxattr = ll_getxattr,
4691 .removexattr = ll_removexattr,
4693 .listxattr = ll_listxattr,
4694 .fiemap = ll_fiemap,
4695 #ifdef HAVE_IOP_GET_ACL
4696 .get_acl = ll_get_acl,
4698 #ifdef HAVE_IOP_SET_ACL
4699 .set_acl = ll_set_acl,
4703 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4705 struct ll_inode_info *lli = ll_i2info(inode);
4706 struct cl_object *obj = lli->lli_clob;
4715 env = cl_env_get(&refcheck);
4717 RETURN(PTR_ERR(env));
4719 rc = cl_conf_set(env, lli->lli_clob, conf);
4723 if (conf->coc_opc == OBJECT_CONF_SET) {
4724 struct ldlm_lock *lock = conf->coc_lock;
4725 struct cl_layout cl = {
4729 LASSERT(lock != NULL);
4730 LASSERT(ldlm_has_layout(lock));
		/* The lock can only be allowed to match after the layout is
		 * applied to the inode, otherwise a stale layout would be
		 * seen. Applying the layout should happen before dropping
		 * the intent lock. */
4736 ldlm_lock_allow_match(lock);
4738 rc = cl_object_layout_get(env, obj, &cl);
4743 DFID": layout version change: %u -> %u\n",
4744 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4746 ll_layout_version_set(lli, cl.cl_layout_gen);
4750 cl_env_put(env, &refcheck);
4755 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4756 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4759 struct ll_sb_info *sbi = ll_i2sbi(inode);
4760 struct ptlrpc_request *req;
4761 struct mdt_body *body;
4768 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4769 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4770 lock->l_lvb_data, lock->l_lvb_len);
4772 if (lock->l_lvb_data != NULL)
	/* If the layout lock was granted right away, the layout is returned
	 * within the DLM_LVB of the DLM reply; otherwise, if the lock was ever
	 * blocked and then granted via completion AST, we have to fetch the
	 * layout here. Note that we cannot use the LVB buffer from the
	 * completion AST because it is not large enough. */
4780 rc = ll_get_default_mdsize(sbi, &lmmsize);
4782 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4783 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4787 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4789 GOTO(out, rc = -EPROTO);
4791 lmmsize = body->mbo_eadatasize;
4792 if (lmmsize == 0) /* empty layout */
4795 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4797 GOTO(out, rc = -EFAULT);
4799 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4800 if (lvbdata == NULL)
4801 GOTO(out, rc = -ENOMEM);
4803 memcpy(lvbdata, lmm, lmmsize);
4804 lock_res_and_lock(lock);
4805 if (unlikely(lock->l_lvb_data == NULL)) {
4806 lock->l_lvb_type = LVB_T_LAYOUT;
4807 lock->l_lvb_data = lvbdata;
4808 lock->l_lvb_len = lmmsize;
4811 unlock_res_and_lock(lock);
4814 OBD_FREE_LARGE(lvbdata, lmmsize);
4819 ptlrpc_req_finished(req);
 * Apply the layout to the inode. The layout lock is held on entry and will be
 * released before return.
4827 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4828 struct inode *inode)
4830 struct ll_inode_info *lli = ll_i2info(inode);
4831 struct ll_sb_info *sbi = ll_i2sbi(inode);
4832 struct ldlm_lock *lock;
4833 struct cl_object_conf conf;
4836 bool wait_layout = false;
4839 LASSERT(lustre_handle_is_used(lockh));
4841 lock = ldlm_handle2lock(lockh);
4842 LASSERT(lock != NULL);
4843 LASSERT(ldlm_has_layout(lock));
4845 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4846 PFID(&lli->lli_fid), inode);
	/* in case this is a cached lock, reinstate it with the new inode */
4849 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4851 lock_res_and_lock(lock);
4852 lvb_ready = ldlm_is_lvb_ready(lock);
4853 unlock_res_and_lock(lock);
	/* Checking lvb_ready is racy, but that is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
4860 rc = ll_layout_fetch(inode, lock);
	/* For the layout lock, the lmm is stored in the lock's LVB.
	 * lvb_data is immutable while the lock is held, so it is safe to
	 * access it without taking the resource lock.
	 *
	 * Set the layout on the file. This is unlikely to fail since the old
	 * layout has surely been eliminated. */
4870 memset(&conf, 0, sizeof conf);
4871 conf.coc_opc = OBJECT_CONF_SET;
4872 conf.coc_inode = inode;
4873 conf.coc_lock = lock;
4874 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4875 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4876 rc = ll_layout_conf(inode, &conf);
4878 /* refresh layout failed, need to wait */
4879 wait_layout = rc == -EBUSY;
4882 LDLM_LOCK_PUT(lock);
4883 ldlm_lock_decref(lockh, mode);
	/* wait for IO to complete if the old layout is still in use */
4887 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4888 ll_get_fsname(inode->i_sb, NULL, 0),
4889 PFID(&lli->lli_fid), inode);
4891 memset(&conf, 0, sizeof conf);
4892 conf.coc_opc = OBJECT_CONF_WAIT;
4893 conf.coc_inode = inode;
4894 rc = ll_layout_conf(inode, &conf);
4898 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4899 ll_get_fsname(inode->i_sb, NULL, 0),
4900 PFID(&lli->lli_fid), rc);
4906 * Issue layout intent RPC to MDS.
4907 * \param inode [in] file inode
4908 * \param intent [in] layout intent
4910 * \retval 0 on success
4911 * \retval < 0 error code
4913 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4915 struct ll_inode_info *lli = ll_i2info(inode);
4916 struct ll_sb_info *sbi = ll_i2sbi(inode);
4917 struct md_op_data *op_data;
4918 struct lookup_intent it;
4919 struct ptlrpc_request *req;
4923 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4924 0, 0, LUSTRE_OPC_ANY, NULL);
4925 if (IS_ERR(op_data))
4926 RETURN(PTR_ERR(op_data));
4928 op_data->op_data = intent;
4929 op_data->op_data_size = sizeof(*intent);
4931 memset(&it, 0, sizeof(it));
4932 it.it_op = IT_LAYOUT;
4933 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4934 intent->li_opc == LAYOUT_INTENT_TRUNC)
4935 it.it_flags = FMODE_WRITE;
4937 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4938 ll_get_fsname(inode->i_sb, NULL, 0),
4939 PFID(&lli->lli_fid), inode);
4941 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4942 &ll_md_blocking_ast, 0);
4943 if (it.it_request != NULL)
4944 ptlrpc_req_finished(it.it_request);
4945 it.it_request = NULL;
4947 ll_finish_md_op_data(op_data);
4949 /* set lock data in case this is a new lock */
4951 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4953 ll_intent_drop_lock(&it);
4959 * This function checks if there exists a LAYOUT lock on the client side,
4960 * or enqueues it if it doesn't have one in cache.
 * This function does not hold the layout lock, so the lock may be revoked at
 * any time after this function returns. Any operation that depends on the
 * layout should be redone afterwards.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after IO
 * is finished call this function again to verify that the layout has not
 * changed in the meantime.
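/*
 * Illustrative calling pattern (a sketch only; the IO step in the middle is a
 * placeholder, not a helper defined in this file):
 *
 *	__u32 gen, gen2;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen);
 *	... submit IO against the layout tagged with "gen" ...
 *	rc = ll_layout_refresh(inode, &gen2);
 *	if (rc == 0 && gen2 != gen)
 *		... the layout changed while IO was in flight, redo ...
 */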
4971 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4973 struct ll_inode_info *lli = ll_i2info(inode);
4974 struct ll_sb_info *sbi = ll_i2sbi(inode);
4975 struct lustre_handle lockh;
4976 struct layout_intent intent = {
4977 .li_opc = LAYOUT_INTENT_ACCESS,
4979 enum ldlm_mode mode;
4983 *gen = ll_layout_version_get(lli);
4984 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4988 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4989 LASSERT(S_ISREG(inode->i_mode));
4991 /* take layout lock mutex to enqueue layout lock exclusively. */
4992 mutex_lock(&lli->lli_layout_mutex);
	/* The layout lock is usually cached on the local side, so try to
	 * match an existing lock before enqueueing a new one. */
4997 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4998 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4999 if (mode != 0) { /* hit cached lock */
5000 rc = ll_layout_lock_set(&lockh, mode, inode);
5006 rc = ll_layout_intent(inode, &intent);
5012 *gen = ll_layout_version_get(lli);
5013 mutex_unlock(&lli->lli_layout_mutex);
5019 * Issue layout intent RPC indicating where in a file an IO is about to write.
5021 * \param[in] inode file inode.
 * \param[in] ext	write range with the start offset of the file in bytes
 *			where an IO is about to write, and the exclusive end
 *			offset in bytes
5026 * \retval 0 on success
5027 * \retval < 0 error code
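/*
 * Illustrative call (a sketch only): request a writable layout covering the
 * first 1MiB of the file before writing into that range:
 *
 *	struct lu_extent ext = { .e_start = 0, .e_end = 1 << 20 };
 *	int rc;
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */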
5029 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5030 struct lu_extent *ext)
5032 struct layout_intent intent = {
5034 .li_extent.e_start = ext->e_start,
5035 .li_extent.e_end = ext->e_end,
5040 rc = ll_layout_intent(inode, &intent);
 * This function sends a restore request to the MDT.
5048 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5050 struct hsm_user_request *hur;
5054 len = sizeof(struct hsm_user_request) +
5055 sizeof(struct hsm_user_item);
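	/* the request buffer holds one hsm_user_request header followed by a
	 * single hsm_user_item describing the byte range to restore */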
5056 OBD_ALLOC(hur, len);
5060 hur->hur_request.hr_action = HUA_RESTORE;
5061 hur->hur_request.hr_archive_id = 0;
5062 hur->hur_request.hr_flags = 0;
5063 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5064 sizeof(hur->hur_user_item[0].hui_fid));
5065 hur->hur_user_item[0].hui_extent.offset = offset;
5066 hur->hur_user_item[0].hui_extent.length = length;
5067 hur->hur_request.hr_itemcount = 1;
5068 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,