/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
struct split_param {
	struct inode	*sp_inode;
	__u16		 sp_mirror_id;
};

static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);

static struct ll_file_data *ll_file_data_get(void)
{
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	if (fd == NULL)
		return NULL;

	fd->fd_write_failed = false;
	return fd;
}

static void ll_file_data_put(struct ll_file_data *fd)
{
	if (fd != NULL)
		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
}
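
/*
 * ll_file_data is the per-open-file private state kept in file->private_data
 * (LUSTRE_FPRIVATE(file)); it is allocated in ll_file_open() and released in
 * ll_file_release().
 */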
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode
 * to swap layout with.
 */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
191 op_data->op_bias |= MDS_HSM_RELEASE;
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
198 LASSERT(data == NULL);
202 rc = md_close(md_exp, op_data, och->och_mod, &req);
203 if (rc != 0 && rc != -EINTR)
204 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
205 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
207 if (rc == 0 && op_data->op_bias & bias) {
208 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
215 ll_finish_md_op_data(op_data);
219 md_clear_open_replay_data(md_exp, och);
220 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
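	/* Poison the handle so any later use of this stale och is easy to
	 * spot. */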
223 ptlrpc_req_finished(req); /* This is close request */
227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
229 struct ll_inode_info *lli = ll_i2info(inode);
230 struct obd_client_handle **och_p;
231 struct obd_client_handle *och;
236 if (fmode & FMODE_WRITE) {
237 och_p = &lli->lli_mds_write_och;
238 och_usecount = &lli->lli_open_fd_write_count;
239 } else if (fmode & FMODE_EXEC) {
240 och_p = &lli->lli_mds_exec_och;
241 och_usecount = &lli->lli_open_fd_exec_count;
243 LASSERT(fmode & FMODE_READ);
244 och_p = &lli->lli_mds_read_och;
245 och_usecount = &lli->lli_open_fd_read_count;
248 mutex_lock(&lli->lli_och_mutex);
249 if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
252 mutex_unlock(&lli->lli_och_mutex);
258 mutex_unlock(&lli->lli_och_mutex);
	/* There might be a race and this handle may already
	 * be closed. */
263 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
269 static int ll_md_close(struct inode *inode, struct file *file)
271 union ldlm_policy_data policy = {
272 .l_inodebits = { MDS_INODELOCK_OPEN },
274 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
276 struct ll_inode_info *lli = ll_i2info(inode);
277 struct lustre_handle lockh;
278 enum ldlm_mode lockmode;
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
286 if (fd->fd_lease_och != NULL) {
		/* Usually the lease is not released when the
		 * application crashes, so we need to release it here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
	/* Let's see if we have a good enough OPEN lock on the file and
	   whether we can skip talking to the MDS */
306 mutex_lock(&lli->lli_och_mutex);
307 if (fd->fd_omode & FMODE_WRITE) {
309 LASSERT(lli->lli_open_fd_write_count);
310 lli->lli_open_fd_write_count--;
311 } else if (fd->fd_omode & FMODE_EXEC) {
313 LASSERT(lli->lli_open_fd_exec_count);
314 lli->lli_open_fd_exec_count--;
317 LASSERT(lli->lli_open_fd_read_count);
318 lli->lli_open_fd_read_count--;
320 mutex_unlock(&lli->lli_och_mutex);
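
	/* If a matching OPEN ibits lock is still cached, the close RPC can be
	 * deferred: ll_md_real_close() will be called from the blocking AST
	 * when that lock is eventually cancelled. */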
322 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
323 LDLM_IBITS, &policy, lockmode, &lockh))
324 rc = ll_md_real_close(inode, fd->fd_omode);
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
/* While this returns an error code, the caller (fput()) does not check it,
 * so we need to make every effort to clean up all of our state here.  Also,
 * applications rarely check close errors and even if an error is returned
 * they will not re-try the close call.
 */
338 int ll_file_release(struct inode *inode, struct file *file)
340 struct ll_file_data *fd;
341 struct ll_sb_info *sbi = ll_i2sbi(inode);
342 struct ll_inode_info *lli = ll_i2info(inode);
346 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
347 PFID(ll_inode2fid(inode)), inode);
349 if (inode->i_sb->s_root != file_dentry(file))
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
	/* This may be the last ref on @file, but not necessarily from the
	 * owner PID of statahead, because parent and child processes can
	 * share the same file handle. */
356 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
357 ll_deauthorize_statahead(inode, fd);
359 if (inode->i_sb->s_root == file_dentry(file)) {
360 LUSTRE_FPRIVATE(file) = NULL;
361 ll_file_data_put(fd);
365 if (!S_ISDIR(inode->i_mode)) {
366 if (lli->lli_clob != NULL)
367 lov_read_and_clear_async_rc(lli->lli_clob);
368 lli->lli_async_rc = 0;
371 rc = ll_md_close(inode, file);
373 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
374 libcfs_debug_dumplog();
379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
380 struct lookup_intent *itp)
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
	/* If the server supports open-by-fid, or the file name is invalid,
	 * don't pack the name in the open request. */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
	/* The reason for keeping our own exit path here is to avoid flooding
	 * the log with -ESTALE error messages.
	 */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
440 /* We did open by fid, but by the time we got to the server,
441 * the object disappeared. If this is a create, we cannot really
442 * tell the userspace that the file it was trying to create
443 * does not exist. Instead let's return -ESTALE, and the VFS will
444 * retry the create with LOOKUP_REVAL that we are going to catch
445 * in ll_revalidate_dentry() and use lookup then.
447 if (rc == -ENOENT && itp->it_op & IT_CREAT)
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 struct obd_client_handle *och)
456 struct mdt_body *body;
458 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
459 och->och_fh = body->mbo_handle;
460 och->och_fid = body->mbo_fid1;
461 och->och_lease_handle.cookie = it->it_lock_handle;
462 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
463 och->och_flags = it->it_flags;
465 return md_set_open_replay_data(md_exp, och, it);
468 static int ll_local_open(struct file *file, struct lookup_intent *it,
469 struct ll_file_data *fd, struct obd_client_handle *och)
471 struct inode *inode = file_inode(file);
474 LASSERT(!LUSTRE_FPRIVATE(file));
481 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
486 LUSTRE_FPRIVATE(file) = fd;
487 ll_readahead_init(inode, &fd->fd_ras);
488 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
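	/* Remember the access mode this descriptor was opened with; only the
	 * FMODE_READ/WRITE/EXEC bits of it_flags are kept. */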
490 /* ll_cl_context initialize */
491 rwlock_init(&fd->fd_lock);
492 INIT_LIST_HEAD(&fd->fd_lccs);
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
522 PFID(ll_inode2fid(inode)), inode, file->f_flags);
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_nofiledata, rc = -ENOMEM);
532 if (S_ISDIR(inode->i_mode))
533 ll_authorize_statahead(inode, fd);
535 if (inode->i_sb->s_root == file_dentry(file)) {
536 LUSTRE_FPRIVATE(file) = fd;
540 if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
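		/* The increment above maps O_RDONLY(0), O_WRONLY(1) and
		 * O_RDWR(2) onto FMODE_READ, FMODE_WRITE and
		 * FMODE_READ|FMODE_WRITE; the check only skips the invalid
		 * access mode value 3. */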
546 if (file->f_flags & O_TRUNC)
547 oit.it_flags |= FMODE_WRITE;
		/* The kernel only calls f_op->open() from dentry_open().
		 * filp_open() calls dentry_open() after open_namei() has
		 * checked permissions. Only nfsd_open() calls dentry_open()
		 * directly without checking permissions, and because of that
		 * the code below is safe. */
553 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
554 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556 /* We do not want O_EXCL here, presumably we opened the file
557 * already? XXX - NFS implications? */
558 oit.it_flags &= ~O_EXCL;
		/* bug 20584: if it_flags contains O_CREAT, the file will be
		 * created if necessary, so IT_CREAT should be set to stay
		 * consistent with it. */
563 if (oit.it_flags & O_CREAT)
564 oit.it_op |= IT_CREAT;
570 /* Let's see if we have file open on MDS already. */
571 if (it->it_flags & FMODE_WRITE) {
572 och_p = &lli->lli_mds_write_och;
573 och_usecount = &lli->lli_open_fd_write_count;
574 } else if (it->it_flags & FMODE_EXEC) {
575 och_p = &lli->lli_mds_exec_och;
576 och_usecount = &lli->lli_open_fd_exec_count;
578 och_p = &lli->lli_mds_read_och;
579 och_usecount = &lli->lli_open_fd_read_count;
582 mutex_lock(&lli->lli_och_mutex);
583 if (*och_p) { /* Open handle is present */
584 if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's an extra open request that we do not
			 * need; close it somehow. This will decref the
			 * request. */
587 rc = it_open_error(DISP_OPEN_OPEN, it);
589 mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 ll_release_openhandle(file_dentry(file), it);
597 rc = ll_local_open(file, it, fd, NULL);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
604 LASSERT(*och_usecount == 0);
605 if (!it->it_disposition) {
606 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
			/* We cannot just request a lock handle now; the new
			   ELC code means that one of the other OPEN locks for
			   this file could be cancelled, and since the blocking
			   AST handler would attempt to grab och_mutex as well,
			   that would result in a deadlock */
612 mutex_unlock(&lli->lli_och_mutex);
			 * Normally called under two situations:
			 * 1. NFS export.
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB: when ldd is NULL, it must have come via normal
			 * lookup path only, since ll_iget_for_nfs always calls
			 * ll_d_init().
628 if (ldd && ldd->lld_nfs_dentry) {
629 ldd->lld_nfs_dentry = 0;
630 it->it_flags |= MDS_OPEN_LOCK;
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
641 GOTO(out_openerr, rc);
645 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
647 GOTO(out_och_free, rc = -ENOMEM);
651 /* md_intent_lock() didn't get a request ref if there was an
652 * open error, so don't do cleanup on the request here
654 /* XXX (green): Should not we bail out on any error here, not
655 * just open error? */
656 rc = it_open_error(DISP_OPEN_OPEN, it);
658 GOTO(out_och_free, rc);
660 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
661 "inode %p: disposition %x, status %d\n", inode,
662 it_disposition(it, ~0), it->it_status);
664 rc = ll_local_open(file, it, fd, *och_p);
666 GOTO(out_och_free, rc);
668 mutex_unlock(&lli->lli_och_mutex);
671 /* Must do this outside lli_och_mutex lock to prevent deadlock where
672 different kind of OPEN lock for this same inode gets cancelled
673 by ldlm_cancel_lru */
674 if (!S_ISREG(inode->i_mode))
675 GOTO(out_och_free, rc);
677 cl_lov_delay_create_clear(&file->f_flags);
678 GOTO(out_och_free, rc);
682 if (och_p && *och_p) {
683 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
684 *och_p = NULL; /* OBD_FREE writes some magic there */
687 mutex_unlock(&lli->lli_och_mutex);
690 if (lli->lli_opendir_key == fd)
691 ll_deauthorize_statahead(inode, fd);
693 ll_file_data_put(fd);
695 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
699 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
700 ptlrpc_req_finished(it->it_request);
701 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
707 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
708 struct ldlm_lock_desc *desc, void *data, int flag)
711 struct lustre_handle lockh;
715 case LDLM_CB_BLOCKING:
716 ldlm_lock2handle(lock, &lockh);
717 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
719 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
723 case LDLM_CB_CANCELING:
731 * When setting a lease on a file, we take ownership of the lli_mds_*_och
732 * and save it as fd->fd_och so as to force client to reopen the file even
733 * if it has an open lock in cache already.
735 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
736 struct lustre_handle *old_handle)
738 struct ll_inode_info *lli = ll_i2info(inode);
739 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
740 struct obd_client_handle **och_p;
745 /* Get the openhandle of the file */
746 mutex_lock(&lli->lli_och_mutex);
747 if (fd->fd_lease_och != NULL)
748 GOTO(out_unlock, rc = -EBUSY);
750 if (fd->fd_och == NULL) {
751 if (file->f_mode & FMODE_WRITE) {
752 LASSERT(lli->lli_mds_write_och != NULL);
753 och_p = &lli->lli_mds_write_och;
754 och_usecount = &lli->lli_open_fd_write_count;
756 LASSERT(lli->lli_mds_read_och != NULL);
757 och_p = &lli->lli_mds_read_och;
758 och_usecount = &lli->lli_open_fd_read_count;
761 if (*och_usecount > 1)
762 GOTO(out_unlock, rc = -EBUSY);
769 *old_handle = fd->fd_och->och_fh;
773 mutex_unlock(&lli->lli_och_mutex);
778 * Release ownership on lli_mds_*_och when putting back a file lease.
780 static int ll_lease_och_release(struct inode *inode, struct file *file)
782 struct ll_inode_info *lli = ll_i2info(inode);
783 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
784 struct obd_client_handle **och_p;
785 struct obd_client_handle *old_och = NULL;
790 mutex_lock(&lli->lli_och_mutex);
791 if (file->f_mode & FMODE_WRITE) {
792 och_p = &lli->lli_mds_write_och;
793 och_usecount = &lli->lli_open_fd_write_count;
795 och_p = &lli->lli_mds_read_och;
796 och_usecount = &lli->lli_open_fd_read_count;
	/* The file may have been opened by another process (broken lease), so
	 * *och_p is not NULL. In this case we should simply increase the
	 * usecount and close fd_och.
	 */
	if (*och_p != NULL) {
804 old_och = fd->fd_och;
811 mutex_unlock(&lli->lli_och_mutex);
814 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
820 * Acquire a lease and open the file.
822 static struct obd_client_handle *
823 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
826 struct lookup_intent it = { .it_op = IT_OPEN };
827 struct ll_sb_info *sbi = ll_i2sbi(inode);
828 struct md_op_data *op_data;
829 struct ptlrpc_request *req = NULL;
830 struct lustre_handle old_handle = { 0 };
831 struct obd_client_handle *och = NULL;
836 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
837 RETURN(ERR_PTR(-EINVAL));
840 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
841 RETURN(ERR_PTR(-EPERM));
843 rc = ll_lease_och_acquire(inode, file, &old_handle);
850 RETURN(ERR_PTR(-ENOMEM));
852 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
853 LUSTRE_OPC_ANY, NULL);
855 GOTO(out, rc = PTR_ERR(op_data));
857 /* To tell the MDT this openhandle is from the same owner */
858 op_data->op_handle = old_handle;
860 it.it_flags = fmode | open_flags;
861 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
862 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
863 &ll_md_blocking_lease_ast,
			/* LDLM_FL_NO_LRU: To not put the lease lock into the LRU
			 * list, otherwise it can be cancelled, which may mislead
			 * applications into thinking the lease is broken;
			 * LDLM_FL_EXCL: Set this flag so that it won't be matched
			 * by a normal open in ll_md_blocking_ast(). Otherwise,
			 * since ll_md_blocking_lease_ast() doesn't deal with the
			 * openhandle, a normal openhandle would be leaked. */
870 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
871 ll_finish_md_op_data(op_data);
872 ptlrpc_req_finished(req);
874 GOTO(out_release_it, rc);
876 if (it_disposition(&it, DISP_LOOKUP_NEG))
877 GOTO(out_release_it, rc = -ENOENT);
879 rc = it_open_error(DISP_OPEN_OPEN, &it);
881 GOTO(out_release_it, rc);
883 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
884 ll_och_fill(sbi->ll_md_exp, &it, och);
886 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
887 GOTO(out_close, rc = -EOPNOTSUPP);
889 /* already get lease, handle lease lock */
890 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
891 if (it.it_lock_mode == 0 ||
892 it.it_lock_bits != MDS_INODELOCK_OPEN) {
893 /* open lock must return for lease */
894 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
895 PFID(ll_inode2fid(inode)), it.it_lock_mode,
897 GOTO(out_close, rc = -EPROTO);
900 ll_intent_release(&it);
904 /* Cancel open lock */
905 if (it.it_lock_mode != 0) {
906 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
909 och->och_lease_handle.cookie = 0ULL;
911 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
913 CERROR("%s: error closing file "DFID": %d\n",
914 ll_get_fsname(inode->i_sb, NULL, 0),
915 PFID(&ll_i2info(inode)->lli_fid), rc2);
916 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
918 ll_intent_release(&it);
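
/*
 * Lease open/close is the building block for the "biased" closes in this
 * file: HSM release (ll_hsm_release()), layout swap/merge and mirror resync
 * all take a lease with ll_lease_open() and later hand it to
 * ll_close_inode_openhandle() with the matching MDS_* bias.
 */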
926 * Check whether a layout swap can be done between two inodes.
928 * \param[in] inode1 First inode to check
929 * \param[in] inode2 Second inode to check
931 * \retval 0 on success, layout swap can be performed between both inodes
932 * \retval negative error code if requirements are not met
934 static int ll_check_swap_layouts_validity(struct inode *inode1,
935 struct inode *inode2)
937 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
940 if (inode_permission(inode1, MAY_WRITE) ||
941 inode_permission(inode2, MAY_WRITE))
944 if (inode1->i_sb != inode2->i_sb)
950 static int ll_swap_layouts_close(struct obd_client_handle *och,
951 struct inode *inode, struct inode *inode2)
953 const struct lu_fid *fid1 = ll_inode2fid(inode);
954 const struct lu_fid *fid2;
958 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
959 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
961 rc = ll_check_swap_layouts_validity(inode, inode2);
963 GOTO(out_free_och, rc);
965 /* We now know that inode2 is a lustre inode */
966 fid2 = ll_inode2fid(inode2);
968 rc = lu_fid_cmp(fid1, fid2);
970 GOTO(out_free_och, rc = -EINVAL);
972 /* Close the file and {swap,merge} layouts between inode & inode2.
973 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
974 * because we still need it to pack l_remote_handle to MDT. */
975 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
978 och = NULL; /* freed in ll_close_inode_openhandle() */
 * Release the lease and close the file.
 * It will check whether the lease was ever broken.
991 static int ll_lease_close_intent(struct obd_client_handle *och,
993 bool *lease_broken, enum mds_op_bias bias,
996 struct ldlm_lock *lock;
997 bool cancelled = true;
1001 lock = ldlm_handle2lock(&och->och_lease_handle);
1003 lock_res_and_lock(lock);
1004 cancelled = ldlm_is_cancel(lock);
1005 unlock_res_and_lock(lock);
1006 LDLM_LOCK_PUT(lock);
1009 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1010 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1012 if (lease_broken != NULL)
1013 *lease_broken = cancelled;
1015 if (!cancelled && !bias)
1016 ldlm_cli_cancel(&och->och_lease_handle, 0);
	if (cancelled) { /* no need to execute intent */
1023 rc = ll_close_inode_openhandle(inode, och, bias, data);
1027 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1030 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1034 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1036 static int ll_lease_file_resync(struct obd_client_handle *och,
1037 struct inode *inode)
1039 struct ll_sb_info *sbi = ll_i2sbi(inode);
1040 struct md_op_data *op_data;
1041 __u64 data_version_unused;
1045 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1046 LUSTRE_OPC_ANY, NULL);
1047 if (IS_ERR(op_data))
1048 RETURN(PTR_ERR(op_data));
	/* Before starting file resync, it's necessary to clean up the page
	 * cache in client memory, otherwise once the layout version is
	 * increased, writing back cached data will be denied by the OSTs. */
1053 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1057 op_data->op_handle = och->och_lease_handle;
1058 rc = md_file_resync(sbi->ll_md_exp, op_data);
1064 ll_finish_md_op_data(op_data);
1068 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1070 struct ll_inode_info *lli = ll_i2info(inode);
1071 struct cl_object *obj = lli->lli_clob;
1072 struct cl_attr *attr = vvp_env_thread_attr(env);
1080 ll_inode_size_lock(inode);
	/* Merge the timestamps most recently obtained from the MDS with
	 * the timestamps obtained from the OSTs.
	 *
	 * Do not overwrite the inode's atime because it may be refreshed
	 * by file_accessed(). If a read was served from cached data, no RPC
	 * is sent, so the atime may never be transferred to the OSTs at all.
	 * The MDT only updates atime at close time if it is at least
	 * 'mdd.*.atime_diff' older.
	 * All in all, atime in Lustre does not strictly comply with POSIX;
	 * solving this would require an RPC to the MDT for each read, which
	 * would hurt performance. */
1093 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1094 LTIME_S(inode->i_atime) = lli->lli_atime;
1095 lli->lli_update_atime = 0;
1097 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1098 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1100 atime = LTIME_S(inode->i_atime);
1101 mtime = LTIME_S(inode->i_mtime);
1102 ctime = LTIME_S(inode->i_ctime);
1104 cl_object_attr_lock(obj);
1105 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1108 rc = cl_object_attr_get(env, obj, attr);
1109 cl_object_attr_unlock(obj);
1112 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1114 if (atime < attr->cat_atime)
1115 atime = attr->cat_atime;
1117 if (ctime < attr->cat_ctime)
1118 ctime = attr->cat_ctime;
1120 if (mtime < attr->cat_mtime)
1121 mtime = attr->cat_mtime;
1123 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1124 PFID(&lli->lli_fid), attr->cat_size);
1126 i_size_write(inode, attr->cat_size);
1127 inode->i_blocks = attr->cat_blocks;
1129 LTIME_S(inode->i_atime) = atime;
1130 LTIME_S(inode->i_mtime) = mtime;
1131 LTIME_S(inode->i_ctime) = ctime;
1134 ll_inode_size_unlock(inode);
/**
 * Set the designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a designated
 * mirror.
 */
1145 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1147 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1149 /* clear layout version for generic(non-resync) I/O in case it carries
1150 * stale layout version due to I/O restart */
1151 io->ci_layout_version = 0;
1153 /* FLR: disable non-delay for designated mirror I/O because obviously
1154 * only one mirror is available */
1155 if (fd->fd_designated_mirror > 0) {
1157 io->ci_designated_mirror = fd->fd_designated_mirror;
1158 io->ci_layout_version = fd->fd_layout_version;
1159 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1164 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1167 static bool file_is_noatime(const struct file *file)
1169 const struct vfsmount *mnt = file->f_path.mnt;
1170 const struct inode *inode = file_inode((struct file *)file);
1172 /* Adapted from file_accessed() and touch_atime().*/
1173 if (file->f_flags & O_NOATIME)
1176 if (inode->i_flags & S_NOATIME)
1179 if (IS_NOATIME(inode))
1182 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1185 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1188 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1194 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1196 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1198 struct inode *inode = file_inode(file);
1199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1201 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1202 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1203 io->u.ci_rw.rw_file = file;
1204 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1205 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1206 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1208 if (iot == CIT_WRITE) {
1209 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1210 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1211 file->f_flags & O_DIRECT ||
1214 io->ci_obj = ll_i2info(inode)->lli_clob;
1215 io->ci_lockreq = CILR_MAYBE;
1216 if (ll_file_nolock(file)) {
1217 io->ci_lockreq = CILR_NEVER;
1218 io->ci_no_srvlock = 1;
1219 } else if (file->f_flags & O_APPEND) {
1220 io->ci_lockreq = CILR_MANDATORY;
1222 io->ci_noatime = file_is_noatime(file);
1223 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1224 io->ci_pio = !io->u.ci_rw.rw_append;
	/* FLR: only use non-delay I/O for read as there is only one
	 * available mirror for write. */
1230 io->ci_ndelay = !(iot == CIT_WRITE);
1232 ll_io_set_mirror(io, file);
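
/*
 * Body of one parallel I/O (PIO) task: when a large read or write is split
 * across cfs_ptasks, each task runs this callback with its own env and cl_io
 * to execute its sub-range; ci_pio is cleared here so the sub-range is not
 * split again.
 */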
1235 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1237 struct cl_io_pt *pt = ptask->pt_cbdata;
1238 struct file *file = pt->cip_file;
1241 loff_t pos = pt->cip_pos;
1246 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1247 file_dentry(file)->d_name.name,
1248 pt->cip_iot == CIT_READ ? "read" : "write",
1249 pos, pos + pt->cip_count);
1251 env = cl_env_get(&refcheck);
1253 RETURN(PTR_ERR(env));
1255 io = vvp_env_thread_io(env);
1256 ll_io_init(io, file, pt->cip_iot);
1257 io->u.ci_rw.rw_iter = pt->cip_iter;
1258 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1259 io->ci_pio = 0; /* It's already in parallel task */
1261 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1262 pt->cip_count - pt->cip_result);
1264 struct vvp_io *vio = vvp_env_io(env);
1266 vio->vui_io_subtype = IO_NORMAL;
1267 vio->vui_fd = LUSTRE_FPRIVATE(file);
1269 ll_cl_add(file, env, io, LCC_RW);
1270 rc = cl_io_loop(env, io);
1271 ll_cl_remove(file, env);
1273 /* cl_io_rw_init() handled IO */
1277 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1283 if (io->ci_nob > 0) {
1284 pt->cip_result += io->ci_nob;
1285 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1287 pt->cip_iocb.ki_pos = pos;
1288 #ifdef HAVE_KIOCB_KI_LEFT
1289 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1290 #elif defined(HAVE_KI_NBYTES)
1291 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1295 cl_io_fini(env, io);
1296 cl_env_put(env, &refcheck);
1298 pt->cip_need_restart = io->ci_need_restart;
1300 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1301 file_dentry(file)->d_name.name,
1302 pt->cip_iot == CIT_READ ? "read" : "write",
1303 pt->cip_result, rc);
1305 RETURN(pt->cip_result > 0 ? 0 : rc);
1309 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1310 struct file *file, enum cl_io_type iot,
1311 loff_t *ppos, size_t count)
1313 struct range_lock range;
1314 struct vvp_io *vio = vvp_env_io(env);
1315 struct inode *inode = file_inode(file);
1316 struct ll_inode_info *lli = ll_i2info(inode);
1317 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1322 unsigned retried = 0;
1323 bool restarted = false;
1327 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1328 file_dentry(file)->d_name.name,
1329 iot == CIT_READ ? "read" : "write", pos, pos + count);
1332 io = vvp_env_thread_io(env);
1333 ll_io_init(io, file, iot);
1334 if (args->via_io_subtype == IO_NORMAL) {
1335 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1336 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1338 if (args->via_io_subtype != IO_NORMAL || restarted)
1340 io->ci_ndelay_tried = retried;
1342 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1343 bool range_locked = false;
1345 if (file->f_flags & O_APPEND)
1346 range_lock_init(&range, 0, LUSTRE_EOF);
1348 range_lock_init(&range, pos, pos + count - 1);
1350 vio->vui_fd = LUSTRE_FPRIVATE(file);
1351 vio->vui_io_subtype = args->via_io_subtype;
1353 switch (vio->vui_io_subtype) {
		/* Direct IO reads must also take the range lock,
		 * or multiple reads will try to work on the same
		 * pages.  See LU-6227 for details. */
1358 if (((iot == CIT_WRITE) ||
1359 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1360 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1361 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1363 rc = range_lock(&lli->lli_write_tree, &range);
1367 range_locked = true;
1371 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1372 vio->u.splice.vui_flags = args->u.splice.via_flags;
1375 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1379 ll_cl_add(file, env, io, LCC_RW);
1380 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1381 !lli->lli_inode_locked) {
1383 lli->lli_inode_locked = 1;
1385 rc = cl_io_loop(env, io);
1386 if (lli->lli_inode_locked) {
1387 lli->lli_inode_locked = 0;
1388 inode_unlock(inode);
1390 ll_cl_remove(file, env);
1393 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1395 range_unlock(&lli->lli_write_tree, &range);
1398 /* cl_io_rw_init() handled IO */
1402 if (io->ci_nob > 0) {
1403 result += io->ci_nob;
1404 count -= io->ci_nob;
1406 if (args->via_io_subtype == IO_NORMAL) {
1407 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1409 args->u.normal.via_iocb->ki_pos = pos;
1410 #ifdef HAVE_KIOCB_KI_LEFT
1411 args->u.normal.via_iocb->ki_left = count;
1412 #elif defined(HAVE_KI_NBYTES)
1413 args->u.normal.via_iocb->ki_nbytes = count;
1417 pos = io->u.ci_rw.rw_range.cir_pos;
1421 cl_io_fini(env, io);
1424 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1425 file->f_path.dentry->d_name.name,
1426 iot, rc, result, io->ci_need_restart);
1428 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1430 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1431 file_dentry(file)->d_name.name,
1432 iot == CIT_READ ? "read" : "write",
1433 pos, pos + count, result, rc);
1434 /* preserve the tried count for FLR */
1435 retried = io->ci_ndelay_tried;
1440 if (iot == CIT_READ) {
1442 ll_stats_ops_tally(ll_i2sbi(inode),
1443 LPROC_LL_READ_BYTES, result);
1444 } else if (iot == CIT_WRITE) {
1446 ll_stats_ops_tally(ll_i2sbi(inode),
1447 LPROC_LL_WRITE_BYTES, result);
1448 fd->fd_write_failed = false;
1449 } else if (result == 0 && rc == 0) {
1452 fd->fd_write_failed = true;
1454 fd->fd_write_failed = false;
1455 } else if (rc != -ERESTARTSYS) {
1456 fd->fd_write_failed = true;
1460 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1461 file_dentry(file)->d_name.name,
1462 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1466 RETURN(result > 0 ? result : rc);
1470 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1471 * especially for small I/O.
1473 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
1477 * It's not necessary to create a cl_io for each I/O. Under the help of read
1478 * ahead, most of the pages being read are already in memory cache and we can
1479 * read those pages directly because if the pages exist, the corresponding DLM
1480 * lock must exist so that page content must be valid.
1482 * In fast read implementation, the llite speculatively finds and reads pages
1483 * in memory cache. There are three scenarios for fast read:
 * - If the page exists and is uptodate, the kernel VM will provide the data
 *   and CLIO won't be involved;
1486 * - If the page was brought into memory by read ahead, it will be exported
1487 * and read ahead parameters will be updated;
1488 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1489 * it will go back and invoke normal read, i.e., a cl_io will be created
1490 * and DLM lock will be requested.
1492 * POSIX compliance: posix standard states that read is intended to be atomic.
1493 * Lustre read implementation is in line with Linux kernel read implementation
1494 * and neither of them complies with POSIX standard in this matter. Fast read
1495 * doesn't make the situation worse on single node but it may interleave write
1496 * results from multiple nodes due to short read handling in ll_file_aio_read().
1498 * \param env - lu_env
1499 * \param iocb - kiocb from kernel
1500 * \param iter - user space buffers where the data will be copied
 * \retval - number of bytes read, or an error code if an error occurred.
1505 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1509 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1512 /* NB: we can't do direct IO for fast read because it will need a lock
1513 * to make IO engine happy. */
1514 if (iocb->ki_filp->f_flags & O_DIRECT)
1517 result = generic_file_read_iter(iocb, iter);
	/* If the first page is not in cache, generic_file_aio_read() will
	 * return -ENODATA.
	 * See the corresponding code in ll_readpage(). */
1522 if (result == -ENODATA)
1526 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1527 LPROC_LL_READ_BYTES, result);
1533 * Read from a file (through the page cache).
1535 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1538 struct vvp_io_args *args;
1543 result = ll_do_fast_read(iocb, to);
1544 if (result < 0 || iov_iter_count(to) == 0)
1547 env = cl_env_get(&refcheck);
1549 return PTR_ERR(env);
1551 args = ll_env_args(env, IO_NORMAL);
1552 args->u.normal.via_iter = to;
1553 args->u.normal.via_iocb = iocb;
1555 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1556 &iocb->ki_pos, iov_iter_count(to));
1559 else if (result == 0)
1562 cl_env_put(env, &refcheck);
1568 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1569 * If a page is already in the page cache and dirty (and some other things -
1570 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1571 * write to it without doing a full I/O, because Lustre already knows about it
1572 * and will write it out. This saves a lot of processing time.
1574 * All writes here are within one page, so exclusion is handled by the page
1575 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely that multiple sequential pages
 * are already dirty.
1579 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1580 * and are unlikely to be to already dirty pages.
1582 * Attribute updates are important here, we do them in ll_tiny_write_end.
1584 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1586 ssize_t count = iov_iter_count(iter);
1587 struct file *file = iocb->ki_filp;
1588 struct inode *inode = file_inode(file);
1593 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1594 * of function for why.
1596 if (count >= PAGE_SIZE ||
1597 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1600 result = __generic_file_write_iter(iocb, iter);
1602 /* If the page is not already dirty, ll_tiny_write_begin returns
1603 * -ENODATA. We continue on to normal write.
1605 if (result == -ENODATA)
1609 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1611 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1614 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1620 * Write to a file (through the page cache).
1622 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1624 struct vvp_io_args *args;
1626 ssize_t rc_tiny = 0, rc_normal;
1631 /* NB: we can't do direct IO for tiny writes because they use the page
1632 * cache, we can't do sync writes because tiny writes can't flush
1633 * pages, and we can't do append writes because we can't guarantee the
1634 * required DLM locks are held to protect file size.
1636 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1637 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1638 rc_tiny = ll_do_tiny_write(iocb, from);
1640 /* In case of error, go on and try normal write - Only stop if tiny
1641 * write completed I/O.
1643 if (iov_iter_count(from) == 0)
1644 GOTO(out, rc_normal = rc_tiny);
1646 env = cl_env_get(&refcheck);
1648 return PTR_ERR(env);
1650 args = ll_env_args(env, IO_NORMAL);
1651 args->u.normal.via_iter = from;
1652 args->u.normal.via_iocb = iocb;
1654 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1655 &iocb->ki_pos, iov_iter_count(from));
1657 /* On success, combine bytes written. */
1658 if (rc_tiny >= 0 && rc_normal > 0)
1659 rc_normal += rc_tiny;
1660 /* On error, only return error from normal write if tiny write did not
1661 * write any bytes. Otherwise return bytes written by tiny write.
1663 else if (rc_tiny > 0)
1664 rc_normal = rc_tiny;
1666 cl_env_put(env, &refcheck);
1671 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1673 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1675 static int ll_file_get_iov_count(const struct iovec *iov,
1676 unsigned long *nr_segs, size_t *count)
1681 for (seg = 0; seg < *nr_segs; seg++) {
1682 const struct iovec *iv = &iov[seg];
1685 * If any segment has a negative length, or the cumulative
1686 * length ever wraps negative then return -EINVAL.
1689 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1691 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1696 cnt -= iv->iov_len; /* This segment is no good */
1703 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1704 unsigned long nr_segs, loff_t pos)
1711 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1715 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1716 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1717 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1718 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1719 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1721 result = ll_file_read_iter(iocb, &to);
1726 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1729 struct iovec iov = { .iov_base = buf, .iov_len = count };
1734 init_sync_kiocb(&kiocb, file);
1735 kiocb.ki_pos = *ppos;
1736 #ifdef HAVE_KIOCB_KI_LEFT
1737 kiocb.ki_left = count;
1738 #elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
1742 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1743 *ppos = kiocb.ki_pos;
1749 * Write to a file (through the page cache).
1752 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1753 unsigned long nr_segs, loff_t pos)
1755 struct iov_iter from;
1760 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1764 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1765 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1766 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1767 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1768 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1770 result = ll_file_write_iter(iocb, &from);
1775 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1776 size_t count, loff_t *ppos)
1778 struct iovec iov = { .iov_base = (void __user *)buf,
1785 init_sync_kiocb(&kiocb, file);
1786 kiocb.ki_pos = *ppos;
1787 #ifdef HAVE_KIOCB_KI_LEFT
1788 kiocb.ki_left = count;
1789 #elif defined(HAVE_KI_NBYTES)
1790 kiocb.ki_nbytes = count;
1793 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1794 *ppos = kiocb.ki_pos;
1798 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1801 * Send file content (through pagecache) somewhere with helper
1803 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1804 struct pipe_inode_info *pipe, size_t count,
1808 struct vvp_io_args *args;
1813 env = cl_env_get(&refcheck);
1815 RETURN(PTR_ERR(env));
1817 args = ll_env_args(env, IO_SPLICE);
1818 args->u.splice.via_pipe = pipe;
1819 args->u.splice.via_flags = flags;
1821 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1822 cl_env_put(env, &refcheck);
1826 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1827 __u64 flags, struct lov_user_md *lum, int lum_size)
1829 struct lookup_intent oit = {
1831 .it_flags = flags | MDS_OPEN_BY_FID,
1836 ll_inode_size_lock(inode);
1837 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1839 GOTO(out_unlock, rc);
1841 ll_release_openhandle(dentry, &oit);
1844 ll_inode_size_unlock(inode);
1845 ll_intent_release(&oit);
1850 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1851 struct lov_mds_md **lmmp, int *lmm_size,
1852 struct ptlrpc_request **request)
1854 struct ll_sb_info *sbi = ll_i2sbi(inode);
1855 struct mdt_body *body;
1856 struct lov_mds_md *lmm = NULL;
1857 struct ptlrpc_request *req = NULL;
1858 struct md_op_data *op_data;
1861 rc = ll_get_default_mdsize(sbi, &lmmsize);
1865 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1866 strlen(filename), lmmsize,
1867 LUSTRE_OPC_ANY, NULL);
1868 if (IS_ERR(op_data))
1869 RETURN(PTR_ERR(op_data));
1871 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1872 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1873 ll_finish_md_op_data(op_data);
1875 CDEBUG(D_INFO, "md_getattr_name failed "
1876 "on %s: rc %d\n", filename, rc);
1880 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1881 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1883 lmmsize = body->mbo_eadatasize;
1885 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1887 GOTO(out, rc = -ENODATA);
1890 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1891 LASSERT(lmm != NULL);
1893 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1894 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1895 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1896 GOTO(out, rc = -EPROTO);
1899 * This is coming from the MDS, so is probably in
1900 * little endian. We convert it to host endian before
1901 * passing it to userspace.
1903 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1906 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1907 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1908 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1909 if (le32_to_cpu(lmm->lmm_pattern) &
1910 LOV_PATTERN_F_RELEASED)
			/* if the function was called for a directory we
			 * should avoid swabbing non-existent lsm objects */
1916 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1917 lustre_swab_lov_user_md_v1(
1918 (struct lov_user_md_v1 *)lmm);
1919 if (S_ISREG(body->mbo_mode))
1920 lustre_swab_lov_user_md_objects(
1921 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1923 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1924 lustre_swab_lov_user_md_v3(
1925 (struct lov_user_md_v3 *)lmm);
1926 if (S_ISREG(body->mbo_mode))
1927 lustre_swab_lov_user_md_objects(
1928 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1930 } else if (lmm->lmm_magic ==
1931 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1932 lustre_swab_lov_comp_md_v1(
1933 (struct lov_comp_md_v1 *)lmm);
1939 *lmm_size = lmmsize;
1944 static int ll_lov_setea(struct inode *inode, struct file *file,
1947 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1948 struct lov_user_md *lump;
1949 int lum_size = sizeof(struct lov_user_md) +
1950 sizeof(struct lov_user_ost_data);
1954 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1957 OBD_ALLOC_LARGE(lump, lum_size);
1961 if (copy_from_user(lump, arg, lum_size))
1962 GOTO(out_lump, rc = -EFAULT);
1964 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1966 cl_lov_delay_create_clear(&file->f_flags);
1969 OBD_FREE_LARGE(lump, lum_size);
1973 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1980 env = cl_env_get(&refcheck);
1982 RETURN(PTR_ERR(env));
1984 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1985 cl_env_put(env, &refcheck);
1989 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1992 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1993 struct lov_user_md *klum;
1995 __u64 flags = FMODE_WRITE;
1998 rc = ll_copy_user_md(lum, &klum);
2003 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2008 rc = put_user(0, &lum->lmm_stripe_count);
2012 rc = ll_layout_refresh(inode, &gen);
2016 rc = ll_file_getstripe(inode, arg, lum_size);
2018 cl_lov_delay_create_clear(&file->f_flags);
2021 OBD_FREE(klum, lum_size);
2026 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2028 struct ll_inode_info *lli = ll_i2info(inode);
2029 struct cl_object *obj = lli->lli_clob;
2030 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2031 struct ll_grouplock grouplock;
2036 CWARN("group id for group lock must not be 0\n");
2040 if (ll_file_nolock(file))
2041 RETURN(-EOPNOTSUPP);
2043 spin_lock(&lli->lli_lock);
2044 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2045 CWARN("group lock already existed with gid %lu\n",
2046 fd->fd_grouplock.lg_gid);
2047 spin_unlock(&lli->lli_lock);
2050 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2051 spin_unlock(&lli->lli_lock);
2054 * XXX: group lock needs to protect all OST objects while PFL
2055 * can add new OST objects during the IO, so we'd instantiate
2056 * all OST objects before getting its group lock.
2061 struct cl_layout cl = {
2062 .cl_is_composite = false,
2064 struct lu_extent ext = {
2066 .e_end = OBD_OBJECT_EOF,
2069 env = cl_env_get(&refcheck);
2071 RETURN(PTR_ERR(env));
2073 rc = cl_object_layout_get(env, obj, &cl);
2074 if (!rc && cl.cl_is_composite)
2075 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2078 cl_env_put(env, &refcheck);
2083 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2084 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2088 spin_lock(&lli->lli_lock);
2089 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2090 spin_unlock(&lli->lli_lock);
2091 CERROR("another thread just won the race\n");
2092 cl_put_grouplock(&grouplock);
2096 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2097 fd->fd_grouplock = grouplock;
2098 spin_unlock(&lli->lli_lock);
2100 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2104 static int ll_put_grouplock(struct inode *inode, struct file *file,
2107 struct ll_inode_info *lli = ll_i2info(inode);
2108 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2109 struct ll_grouplock grouplock;
2112 spin_lock(&lli->lli_lock);
2113 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2114 spin_unlock(&lli->lli_lock);
2115 CWARN("no group lock held\n");
2119 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2121 if (fd->fd_grouplock.lg_gid != arg) {
2122 CWARN("group lock %lu doesn't match current id %lu\n",
2123 arg, fd->fd_grouplock.lg_gid);
2124 spin_unlock(&lli->lli_lock);
2128 grouplock = fd->fd_grouplock;
2129 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2130 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2131 spin_unlock(&lli->lli_lock);
2133 cl_put_grouplock(&grouplock);
2134 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2139 * Close inode open handle
2141 * \param dentry [in] dentry which contains the inode
2142 * \param it [in,out] intent which contains open info and result
2145 * \retval <0 failure
2147 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2149 struct inode *inode = dentry->d_inode;
2150 struct obd_client_handle *och;
2156 /* Root ? Do nothing. */
2157 if (dentry->d_inode->i_sb->s_root == dentry)
2160 /* No open handle to close? Move away */
2161 if (!it_disposition(it, DISP_OPEN_OPEN))
2164 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2166 OBD_ALLOC(och, sizeof(*och));
2168 GOTO(out, rc = -ENOMEM);
2170 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2172 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2174 /* this one is in place of ll_file_open */
2175 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2176 ptlrpc_req_finished(it->it_request);
2177 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/**
 * Get the size of the inode for which the FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and return the result.
 *
 * \param fiemap	kernel buffer to hold extents
 * \param num_bytes	kernel buffer size
 */
2188 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2194 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2197 /* Checks for fiemap flags */
2198 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2199 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2203 /* Check for FIEMAP_FLAG_SYNC */
2204 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2205 rc = filemap_fdatawrite(inode->i_mapping);
2210 env = cl_env_get(&refcheck);
2212 RETURN(PTR_ERR(env));
2214 if (i_size_read(inode) == 0) {
2215 rc = ll_glimpse_size(inode);
2220 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2221 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2222 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2224 /* If filesize is 0, then there would be no objects for mapping */
2225 if (fmkey.lfik_oa.o_size == 0) {
2226 fiemap->fm_mapped_extents = 0;
2230 fmkey.lfik_fiemap = *fiemap;
2232 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2233 &fmkey, fiemap, &num_bytes);
2235 cl_env_put(env, &refcheck);
2239 int ll_fid2path(struct inode *inode, void __user *arg)
2241 struct obd_export *exp = ll_i2mdexp(inode);
2242 const struct getinfo_fid2path __user *gfin = arg;
2244 struct getinfo_fid2path *gfout;
2250 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2251 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2254 /* Only need to get the buflen */
2255 if (get_user(pathlen, &gfin->gf_pathlen))
2258 if (pathlen > PATH_MAX)
2261 outsize = sizeof(*gfout) + pathlen;
2262 OBD_ALLOC(gfout, outsize);
2266 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2267 GOTO(gf_free, rc = -EFAULT);
	/* append the root FID after gfout to let the MDT know the root FID so
	 * that it can look up the correct path; this is mainly for filesets.
	 * An old server without fileset mount support will ignore this. */
2271 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2273 /* Call mdc_iocontrol */
2274 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2278 if (copy_to_user(arg, gfout, outsize))
2282 OBD_FREE(gfout, outsize);
2287 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2289 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2297 ioc->idv_version = 0;
2298 ioc->idv_layout_version = UINT_MAX;
2300 /* If no file object is initialized, we consider its version to be 0. */
2304 env = cl_env_get(&refcheck);
2306 RETURN(PTR_ERR(env));
2308 io = vvp_env_thread_io(env);
2310 io->u.ci_data_version.dv_data_version = 0;
2311 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2312 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2315 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2316 result = cl_io_loop(env, io);
2318 result = io->ci_result;
2320 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2321 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2323 cl_io_fini(env, io);
2325 if (unlikely(io->ci_need_restart))
2328 cl_env_put(env, &refcheck);
2334 * Read the data_version for inode.
2336 * This value is computed using the stripe object versions on the OSTs.
2337 * The version is computed using server-side locking.
2339 * @param flags whether to sync on the OST side;
2341 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2342 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2344 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2346 struct ioc_data_version ioc = { .idv_flags = flags };
2349 rc = ll_ioc_data_version(inode, &ioc);
2351 *data_version = ioc.idv_version;
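/*
 * Illustrative sketch (editor's addition): a typical call fetches the current
 * data version after flushing dirty pages, e.g. to decide later whether the
 * file data changed.  "inode" and the error handling are assumed to exist in
 * the caller:
 *
 *	__u64 dv = 0;
 *	int rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
 *
 *	if (rc == 0)
 *		CDEBUG(D_INODE, "data version: %llu\n", dv);
 */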
2357 * Trigger a HSM release request for the provided inode.
2359 int ll_hsm_release(struct inode *inode)
2362 struct obd_client_handle *och = NULL;
2363 __u64 data_version = 0;
2368 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2369 ll_get_fsname(inode->i_sb, NULL, 0),
2370 PFID(&ll_i2info(inode)->lli_fid));
2372 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2374 GOTO(out, rc = PTR_ERR(och));
2376 /* Grab latest data_version and [am]time values */
2377 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2381 env = cl_env_get(&refcheck);
2383 GOTO(out, rc = PTR_ERR(env));
2385 rc = ll_merge_attr(env, inode);
2386 cl_env_put(env, &refcheck);
2388 /* If an error happens, we have the wrong size for the file.
2394 /* Release the file.
2395 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2396 * we still need it to pack l_remote_handle to MDT. */
2397 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2403 if (och != NULL && !IS_ERR(och)) /* close the file */
2404 ll_lease_close(och, inode, NULL);
2409 struct ll_swap_stack {
2412 struct inode *inode1;
2413 struct inode *inode2;
2418 static int ll_swap_layouts(struct file *file1, struct file *file2,
2419 struct lustre_swap_layouts *lsl)
2421 struct mdc_swap_layouts msl;
2422 struct md_op_data *op_data;
2425 struct ll_swap_stack *llss = NULL;
2428 OBD_ALLOC_PTR(llss);
2432 llss->inode1 = file_inode(file1);
2433 llss->inode2 = file_inode(file2);
2435 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2439 /* we use 2 bools because they are easier to swap than 2 bits */
2440 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2441 llss->check_dv1 = true;
2443 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2444 llss->check_dv2 = true;
2446 /* we cannot use lsl->sl_dvX directly because we may swap them */
2447 llss->dv1 = lsl->sl_dv1;
2448 llss->dv2 = lsl->sl_dv2;
2450 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2451 if (rc == 0) /* same file, done! */
2454 if (rc < 0) { /* sequentialize it */
2455 swap(llss->inode1, llss->inode2);
2457 swap(llss->dv1, llss->dv2);
2458 swap(llss->check_dv1, llss->check_dv2);
2462 if (gid != 0) { /* application asks to flush dirty cache */
2463 rc = ll_get_grouplock(llss->inode1, file1, gid);
2467 rc = ll_get_grouplock(llss->inode2, file2, gid);
2469 ll_put_grouplock(llss->inode1, file1, gid);
2474 /* ultimate check: before swapping the layouts we check whether the
2475 * data version has changed (if requested) */
2476 if (llss->check_dv1) {
2477 rc = ll_data_version(llss->inode1, &dv, 0);
2480 if (dv != llss->dv1)
2481 GOTO(putgl, rc = -EAGAIN);
2484 if (llss->check_dv2) {
2485 rc = ll_data_version(llss->inode2, &dv, 0);
2488 if (dv != llss->dv2)
2489 GOTO(putgl, rc = -EAGAIN);
2492 /* struct md_op_data is used to send the swap args to the MDT;
2493 * only the flags are missing, so we pass struct mdc_swap_layouts
2494 * through md_op_data->op_data */
2495 /* flags from user space have to be converted before they are sent to the
2496 * server; no flag is sent today, they are only used on the client */
2499 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2500 0, LUSTRE_OPC_ANY, &msl);
2501 if (IS_ERR(op_data))
2502 GOTO(free, rc = PTR_ERR(op_data));
2504 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2505 sizeof(*op_data), op_data, NULL);
2506 ll_finish_md_op_data(op_data);
2513 ll_put_grouplock(llss->inode2, file2, gid);
2514 ll_put_grouplock(llss->inode1, file1, gid);
2524 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2526 struct md_op_data *op_data;
2530 /* Detect out-of-range masks */
2531 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2534 /* Non-root users are forbidden to set or clear flags which are
2535 * NOT defined in HSM_USER_MASK. */
2536 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2537 !cfs_capable(CFS_CAP_SYS_ADMIN))
2540 /* Detect out-of-range archive id */
2541 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2542 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2545 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2546 LUSTRE_OPC_ANY, hss);
2547 if (IS_ERR(op_data))
2548 RETURN(PTR_ERR(op_data));
2550 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2551 sizeof(*op_data), op_data, NULL);
2553 ll_finish_md_op_data(op_data);
2558 static int ll_hsm_import(struct inode *inode, struct file *file,
2559 struct hsm_user_import *hui)
2561 struct hsm_state_set *hss = NULL;
2562 struct iattr *attr = NULL;
2566 if (!S_ISREG(inode->i_mode))
2572 GOTO(out, rc = -ENOMEM);
2574 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2575 hss->hss_archive_id = hui->hui_archive_id;
2576 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2577 rc = ll_hsm_state_set(inode, hss);
2581 OBD_ALLOC_PTR(attr);
2583 GOTO(out, rc = -ENOMEM);
2585 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2586 attr->ia_mode |= S_IFREG;
2587 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2588 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2589 attr->ia_size = hui->hui_size;
2590 attr->ia_mtime.tv_sec = hui->hui_mtime;
2591 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2592 attr->ia_atime.tv_sec = hui->hui_atime;
2593 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2595 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2596 ATTR_UID | ATTR_GID |
2597 ATTR_MTIME | ATTR_MTIME_SET |
2598 ATTR_ATIME | ATTR_ATIME_SET;
2602 rc = ll_setattr_raw(file_dentry(file), attr, true);
2606 inode_unlock(inode);
2618 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2620 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2621 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
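/*
 * Worked example (editor's addition): for an open handle holding
 * FMODE_READ | FMODE_WRITE, ll_lease_type_from_fmode() returns
 * LL_LEASE_RDLCK | LL_LEASE_WRLCK; a read-only lease yields just
 * LL_LEASE_RDLCK, and an fmode of 0 (no lease) yields 0.
 */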
2624 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2626 struct inode *inode = file_inode(file);
2628 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2629 ATTR_MTIME | ATTR_MTIME_SET |
2630 ATTR_CTIME | ATTR_CTIME_SET,
2632 .tv_sec = lfu->lfu_atime_sec,
2633 .tv_nsec = lfu->lfu_atime_nsec,
2636 .tv_sec = lfu->lfu_mtime_sec,
2637 .tv_nsec = lfu->lfu_mtime_nsec,
2640 .tv_sec = lfu->lfu_ctime_sec,
2641 .tv_nsec = lfu->lfu_ctime_nsec,
2647 if (!capable(CAP_SYS_ADMIN))
2650 if (!S_ISREG(inode->i_mode))
2654 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2655 inode_unlock(inode);
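/*
 * Illustrative sketch (editor's addition, hypothetical userspace caller): the
 * LL_IOC_FUTIMES_3 ioctl handled further below feeds this function.  A
 * privileged restore tool might fill the request roughly like this, where
 * "st" and "fd" are assumed to come from the caller and the *_nsec fields
 * default to zero:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = st.st_atime,
 *		.lfu_mtime_sec = st.st_mtime,
 *		.lfu_ctime_sec = st.st_ctime,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */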
2660 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2663 case MODE_READ_USER:
2665 case MODE_WRITE_USER:
2672 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2674 /* Used to allow the upper layers of the client to request an LDLM lock
2675 * without doing an actual read or write.
2677 * Used for ladvise lockahead to manually request specific locks.
2679 * \param[in] file file this ladvise lock request is on
2680 * \param[in] ladvise ladvise struct describing this lock request
2682 * \retval 0 success, no detailed result available (sync requests
2683 * and requests sent to the server [not handled locally]
2684 * cannot return detailed results)
2685 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2686 * see definitions for details.
2687 * \retval negative negative errno on error
2689 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2691 struct lu_env *env = NULL;
2692 struct cl_io *io = NULL;
2693 struct cl_lock *lock = NULL;
2694 struct cl_lock_descr *descr = NULL;
2695 struct dentry *dentry = file->f_path.dentry;
2696 struct inode *inode = dentry->d_inode;
2697 enum cl_lock_mode cl_mode;
2698 off_t start = ladvise->lla_start;
2699 off_t end = ladvise->lla_end;
2705 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2706 "start=%llu, end=%llu\n", dentry->d_name.len,
2707 dentry->d_name.name, dentry->d_inode,
2708 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2711 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2713 GOTO(out, result = cl_mode);
2715 /* Get IO environment */
2716 result = cl_io_get(inode, &env, &io, &refcheck);
2720 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2723 * nothing to do for this io. This currently happens when
2724 * stripe sub-objects are not yet created.
2726 result = io->ci_result;
2727 } else if (result == 0) {
2728 lock = vvp_env_lock(env);
2729 descr = &lock->cll_descr;
2731 descr->cld_obj = io->ci_obj;
2732 /* Convert byte offsets to pages */
2733 descr->cld_start = cl_index(io->ci_obj, start);
2734 descr->cld_end = cl_index(io->ci_obj, end);
2735 descr->cld_mode = cl_mode;
2736 /* CEF_MUST is used because we do not want to convert a
2737 * lockahead request to a lockless lock */
2738 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2741 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2742 descr->cld_enq_flags |= CEF_SPECULATIVE;
2744 result = cl_lock_request(env, io, lock);
2746 /* On success, we need to release the lock */
2748 cl_lock_release(env, lock);
2750 cl_io_fini(env, io);
2751 cl_env_put(env, &refcheck);
2753 /* -ECANCELED indicates a matching lock with a different extent
2754 * was already present, and -EEXIST indicates a matching lock
2755 * on exactly the same extent was already present.
2756 * We convert them to positive values for userspace to make
2757 * recognizing true errors easier.
2758 * Note we can only return these detailed results on async requests,
2759 * as sync requests look the same as i/o requests for locking. */
2760 if (result == -ECANCELED)
2761 result = LLA_RESULT_DIFFERENT;
2762 else if (result == -EEXIST)
2763 result = LLA_RESULT_SAME;
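/*
 * Illustrative sketch (editor's addition): how a lockahead advice could be
 * filled in and handed to ll_file_lock_ahead().  The field names all appear
 * in this file; the byte range and mode are arbitrary and "file" is assumed
 * to come from the caller:
 *
 *	struct llapi_lu_ladvise ladvise = {
 *		.lla_advice		= LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode	= MODE_WRITE_USER,
 *		.lla_peradvice_flags	= LF_ASYNC,
 *		.lla_start		= 0,
 *		.lla_end		= 1048576,
 *	};
 *
 *	rc = ll_file_lock_ahead(file, &ladvise);
 *
 * With LF_ASYNC set, the call may return LLA_RESULT_SAME or
 * LLA_RESULT_DIFFERENT instead of 0, as described above.
 */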
2768 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2770 static int ll_ladvise_sanity(struct inode *inode,
2771 struct llapi_lu_ladvise *ladvise)
2773 enum lu_ladvise_type advice = ladvise->lla_advice;
2774 /* Note the per-advice flags field is 32 bits, so per-advice flags must
2775 * be in the first 32 bits of enum ladvise_flags */
2776 __u32 flags = ladvise->lla_peradvice_flags;
2777 /* 3 lines at 80 characters per line, should be plenty */
2780 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2782 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2783 "last supported advice is %s (value '%d'): rc = %d\n",
2784 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2785 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2789 /* Per-advice checks */
2791 case LU_LADVISE_LOCKNOEXPAND:
2792 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2794 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2796 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2797 ladvise_names[advice], rc);
2801 case LU_LADVISE_LOCKAHEAD:
2802 /* Currently only READ and WRITE modes can be requested */
2803 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2804 ladvise->lla_lockahead_mode == 0) {
2806 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2808 ll_get_fsname(inode->i_sb, NULL, 0),
2809 ladvise->lla_lockahead_mode,
2810 ladvise_names[advice], rc);
2813 case LU_LADVISE_WILLREAD:
2814 case LU_LADVISE_DONTNEED:
2816 /* Note the fall-through above - these checks apply to all advice types
2817 * except LOCKNOEXPAND */
2818 if (flags & ~LF_DEFAULT_MASK) {
2820 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2822 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2823 ladvise_names[advice], rc);
2826 if (ladvise->lla_start >= ladvise->lla_end) {
2828 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2829 "for %s: rc = %d\n",
2830 ll_get_fsname(inode->i_sb, NULL, 0),
2831 ladvise->lla_start, ladvise->lla_end,
2832 ladvise_names[advice], rc);
2844 * Give file access advices
2846 * The ladvise interface is similar to the Linux fadvise() system call, except
2847 * it forwards the advice directly from the Lustre client to the server. The
2848 * server-side code will apply appropriate read-ahead and caching techniques
2849 * for the corresponding files.
2851 * A typical workload for ladvise is, e.g., a bunch of different clients
2852 * doing small random reads of a file, so prefetching pages into OSS cache
2853 * with big linear reads before the random IO is a net benefit. Fetching
2854 * all that data into each client cache with fadvise() may not be, due to
2855 * much more data being sent to the client.
2857 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2858 struct llapi_lu_ladvise *ladvise)
2862 struct cl_ladvise_io *lio;
2867 env = cl_env_get(&refcheck);
2869 RETURN(PTR_ERR(env));
2871 io = vvp_env_thread_io(env);
2872 io->ci_obj = ll_i2info(inode)->lli_clob;
2874 /* initialize parameters for ladvise */
2875 lio = &io->u.ci_ladvise;
2876 lio->li_start = ladvise->lla_start;
2877 lio->li_end = ladvise->lla_end;
2878 lio->li_fid = ll_inode2fid(inode);
2879 lio->li_advice = ladvise->lla_advice;
2880 lio->li_flags = flags;
2882 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2883 rc = cl_io_loop(env, io);
2887 cl_io_fini(env, io);
2888 cl_env_put(env, &refcheck);
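/*
 * Illustrative sketch (editor's addition): issuing a single WILLREAD advice
 * for a byte range, much as the LL_IOC_LADVISE handler below does for each
 * entry in the user-supplied header.  "inode", "file" and the range are
 * assumed to come from the caller:
 *
 *	struct llapi_lu_ladvise adv = {
 *		.lla_advice = LU_LADVISE_WILLREAD,
 *		.lla_start  = 0,
 *		.lla_end    = 4 << 20,
 *	};
 *
 *	rc = ll_ladvise(inode, file, 0, &adv);
 */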
2892 static int ll_lock_noexpand(struct file *file, int flags)
2894 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2896 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2901 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2904 struct fsxattr fsxattr;
2906 if (copy_from_user(&fsxattr,
2907 (const struct fsxattr __user *)arg,
2911 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2912 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2913 if (copy_to_user((struct fsxattr __user *)arg,
2914 &fsxattr, sizeof(fsxattr)))
2920 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2924 struct md_op_data *op_data;
2925 struct ptlrpc_request *req = NULL;
2927 struct fsxattr fsxattr;
2928 struct cl_object *obj;
2930 /* only root can change the project ID */
2931 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2934 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2935 LUSTRE_OPC_ANY, NULL);
2936 if (IS_ERR(op_data))
2937 RETURN(PTR_ERR(op_data));
2939 if (copy_from_user(&fsxattr,
2940 (const struct fsxattr __user *)arg,
2942 GOTO(out_fsxattr1, rc = -EFAULT);
2944 op_data->op_attr_flags = fsxattr.fsx_xflags;
2945 op_data->op_projid = fsxattr.fsx_projid;
2946 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2947 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2949 ptlrpc_req_finished(req);
2951 obj = ll_i2info(inode)->lli_clob;
2955 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2956 OBD_ALLOC_PTR(attr);
2958 GOTO(out_fsxattr1, rc = -ENOMEM);
2959 attr->ia_valid = ATTR_ATTR_FLAG;
2960 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2965 ll_finish_md_op_data(op_data);
2969 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2972 struct inode *inode = file_inode(file);
2973 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2974 struct ll_inode_info *lli = ll_i2info(inode);
2975 struct obd_client_handle *och = NULL;
2976 struct split_param sp;
2979 enum mds_op_bias bias = 0;
2980 struct file *layout_file = NULL;
2982 size_t data_size = 0;
2986 mutex_lock(&lli->lli_och_mutex);
2987 if (fd->fd_lease_och != NULL) {
2988 och = fd->fd_lease_och;
2989 fd->fd_lease_och = NULL;
2991 mutex_unlock(&lli->lli_och_mutex);
2994 GOTO(out, rc = -ENOLCK);
2996 fmode = och->och_flags;
2998 switch (ioc->lil_flags) {
2999 case LL_LEASE_RESYNC_DONE:
3000 if (ioc->lil_count > IOC_IDS_MAX)
3001 GOTO(out, rc = -EINVAL);
3003 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3004 OBD_ALLOC(data, data_size);
3006 GOTO(out, rc = -ENOMEM);
3008 if (copy_from_user(data, (void __user *)arg, data_size))
3009 GOTO(out, rc = -EFAULT);
3011 bias = MDS_CLOSE_RESYNC_DONE;
3013 case LL_LEASE_LAYOUT_MERGE: {
3016 if (ioc->lil_count != 1)
3017 GOTO(out, rc = -EINVAL);
3019 arg += sizeof(*ioc);
3020 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3021 GOTO(out, rc = -EFAULT);
3023 layout_file = fget(fd);
3025 GOTO(out, rc = -EBADF);
3027 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3028 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3029 GOTO(out, rc = -EPERM);
3031 data = file_inode(layout_file);
3032 bias = MDS_CLOSE_LAYOUT_MERGE;
3035 case LL_LEASE_LAYOUT_SPLIT: {
3039 if (ioc->lil_count != 2)
3040 GOTO(out, rc = -EINVAL);
3042 arg += sizeof(*ioc);
3043 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3044 GOTO(out, rc = -EFAULT);
3046 arg += sizeof(__u32);
3047 if (copy_from_user(&mirror_id, (void __user *)arg,
3049 GOTO(out, rc = -EFAULT);
3051 layout_file = fget(fdv);
3053 GOTO(out, rc = -EBADF);
3055 sp.sp_inode = file_inode(layout_file);
3056 sp.sp_mirror_id = (__u16)mirror_id;
3058 bias = MDS_CLOSE_LAYOUT_SPLIT;
3062 /* without close intent */
3066 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3070 rc = ll_lease_och_release(inode, file);
3079 switch (ioc->lil_flags) {
3080 case LL_LEASE_RESYNC_DONE:
3082 OBD_FREE(data, data_size);
3084 case LL_LEASE_LAYOUT_MERGE:
3085 case LL_LEASE_LAYOUT_SPLIT:
3092 rc = ll_lease_type_from_fmode(fmode);
3096 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3099 struct inode *inode = file_inode(file);
3100 struct ll_inode_info *lli = ll_i2info(inode);
3101 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3102 struct obd_client_handle *och = NULL;
3103 __u64 open_flags = 0;
3109 switch (ioc->lil_mode) {
3110 case LL_LEASE_WRLCK:
3111 if (!(file->f_mode & FMODE_WRITE))
3113 fmode = FMODE_WRITE;
3115 case LL_LEASE_RDLCK:
3116 if (!(file->f_mode & FMODE_READ))
3120 case LL_LEASE_UNLCK:
3121 RETURN(ll_file_unlock_lease(file, ioc, arg));
3126 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3128 /* apply for lease */
3129 if (ioc->lil_flags & LL_LEASE_RESYNC)
3130 open_flags = MDS_OPEN_RESYNC;
3131 och = ll_lease_open(inode, file, fmode, open_flags);
3133 RETURN(PTR_ERR(och));
3135 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3136 rc = ll_lease_file_resync(och, inode);
3138 ll_lease_close(och, inode, NULL);
3141 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3143 ll_lease_close(och, inode, NULL);
3149 mutex_lock(&lli->lli_och_mutex);
3150 if (fd->fd_lease_och == NULL) {
3151 fd->fd_lease_och = och;
3154 mutex_unlock(&lli->lli_och_mutex);
3156 /* impossible for now, since only exclusive leases are supported */
3157 ll_lease_close(och, inode, &lease_broken);
3164 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3166 struct inode *inode = file_inode(file);
3167 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3171 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3172 PFID(ll_inode2fid(inode)), inode, cmd);
3173 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3175 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3176 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3180 case LL_IOC_GETFLAGS:
3181 /* Get the current value of the file flags */
3182 return put_user(fd->fd_flags, (int __user *)arg);
3183 case LL_IOC_SETFLAGS:
3184 case LL_IOC_CLRFLAGS:
3185 /* Set or clear specific file flags */
3186 /* XXX This probably needs checks to ensure the flags are
3187 * not abused, and to handle any flag side effects.
3189 if (get_user(flags, (int __user *) arg))
3192 if (cmd == LL_IOC_SETFLAGS) {
3193 if ((flags & LL_FILE_IGNORE_LOCK) &&
3194 !(file->f_flags & O_DIRECT)) {
3195 CERROR("%s: unable to disable locking on "
3196 "non-O_DIRECT file\n", current->comm);
3200 fd->fd_flags |= flags;
3202 fd->fd_flags &= ~flags;
3205 case LL_IOC_LOV_SETSTRIPE:
3206 case LL_IOC_LOV_SETSTRIPE_NEW:
3207 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3208 case LL_IOC_LOV_SETEA:
3209 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3210 case LL_IOC_LOV_SWAP_LAYOUTS: {
3212 struct lustre_swap_layouts lsl;
3214 if (copy_from_user(&lsl, (char __user *)arg,
3215 sizeof(struct lustre_swap_layouts)))
3218 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3221 file2 = fget(lsl.sl_fd);
3225 /* O_WRONLY or O_RDWR */
3226 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3227 GOTO(out, rc = -EPERM);
3229 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3230 struct inode *inode2;
3231 struct ll_inode_info *lli;
3232 struct obd_client_handle *och = NULL;
3234 lli = ll_i2info(inode);
3235 mutex_lock(&lli->lli_och_mutex);
3236 if (fd->fd_lease_och != NULL) {
3237 och = fd->fd_lease_och;
3238 fd->fd_lease_och = NULL;
3240 mutex_unlock(&lli->lli_och_mutex);
3242 GOTO(out, rc = -ENOLCK);
3243 inode2 = file_inode(file2);
3244 rc = ll_swap_layouts_close(och, inode, inode2);
3246 rc = ll_swap_layouts(file, file2, &lsl);
3252 case LL_IOC_LOV_GETSTRIPE:
3253 case LL_IOC_LOV_GETSTRIPE_NEW:
3254 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3255 case FSFILT_IOC_GETFLAGS:
3256 case FSFILT_IOC_SETFLAGS:
3257 RETURN(ll_iocontrol(inode, file, cmd, arg));
3258 case FSFILT_IOC_GETVERSION_OLD:
3259 case FSFILT_IOC_GETVERSION:
3260 RETURN(put_user(inode->i_generation, (int __user *)arg));
3261 case LL_IOC_GROUP_LOCK:
3262 RETURN(ll_get_grouplock(inode, file, arg));
3263 case LL_IOC_GROUP_UNLOCK:
3264 RETURN(ll_put_grouplock(inode, file, arg));
3265 case IOC_OBD_STATFS:
3266 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3268 /* We need to special case any other ioctls we want to handle,
3269 * to send them to the MDS/OST as appropriate and to properly
3270 * network encode the arg field.
3271 case FSFILT_IOC_SETVERSION_OLD:
3272 case FSFILT_IOC_SETVERSION:
3274 case LL_IOC_FLUSHCTX:
3275 RETURN(ll_flush_ctx(inode));
3276 case LL_IOC_PATH2FID: {
3277 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3278 sizeof(struct lu_fid)))
3283 case LL_IOC_GETPARENT:
3284 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3286 case OBD_IOC_FID2PATH:
3287 RETURN(ll_fid2path(inode, (void __user *)arg));
3288 case LL_IOC_DATA_VERSION: {
3289 struct ioc_data_version idv;
3292 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3295 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3296 rc = ll_ioc_data_version(inode, &idv);
3299 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3305 case LL_IOC_GET_MDTIDX: {
3308 mdtidx = ll_get_mdt_idx(inode);
3312 if (put_user((int)mdtidx, (int __user *)arg))
3317 case OBD_IOC_GETDTNAME:
3318 case OBD_IOC_GETMDNAME:
3319 RETURN(ll_get_obd_name(inode, cmd, arg));
3320 case LL_IOC_HSM_STATE_GET: {
3321 struct md_op_data *op_data;
3322 struct hsm_user_state *hus;
3329 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3330 LUSTRE_OPC_ANY, hus);
3331 if (IS_ERR(op_data)) {
3333 RETURN(PTR_ERR(op_data));
3336 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3339 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3342 ll_finish_md_op_data(op_data);
3346 case LL_IOC_HSM_STATE_SET: {
3347 struct hsm_state_set *hss;
3354 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3359 rc = ll_hsm_state_set(inode, hss);
3364 case LL_IOC_HSM_ACTION: {
3365 struct md_op_data *op_data;
3366 struct hsm_current_action *hca;
3373 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3374 LUSTRE_OPC_ANY, hca);
3375 if (IS_ERR(op_data)) {
3377 RETURN(PTR_ERR(op_data));
3380 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3383 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3386 ll_finish_md_op_data(op_data);
3390 case LL_IOC_SET_LEASE_OLD: {
3391 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3393 RETURN(ll_file_set_lease(file, &ioc, 0));
3395 case LL_IOC_SET_LEASE: {
3396 struct ll_ioc_lease ioc;
3398 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3401 RETURN(ll_file_set_lease(file, &ioc, arg));
3403 case LL_IOC_GET_LEASE: {
3404 struct ll_inode_info *lli = ll_i2info(inode);
3405 struct ldlm_lock *lock = NULL;
3408 mutex_lock(&lli->lli_och_mutex);
3409 if (fd->fd_lease_och != NULL) {
3410 struct obd_client_handle *och = fd->fd_lease_och;
3412 lock = ldlm_handle2lock(&och->och_lease_handle);
3414 lock_res_and_lock(lock);
3415 if (!ldlm_is_cancel(lock))
3416 fmode = och->och_flags;
3418 unlock_res_and_lock(lock);
3419 LDLM_LOCK_PUT(lock);
3422 mutex_unlock(&lli->lli_och_mutex);
3424 RETURN(ll_lease_type_from_fmode(fmode));
3426 case LL_IOC_HSM_IMPORT: {
3427 struct hsm_user_import *hui;
3433 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3438 rc = ll_hsm_import(inode, file, hui);
3443 case LL_IOC_FUTIMES_3: {
3444 struct ll_futimes_3 lfu;
3446 if (copy_from_user(&lfu,
3447 (const struct ll_futimes_3 __user *)arg,
3451 RETURN(ll_file_futimes_3(file, &lfu));
3453 case LL_IOC_LADVISE: {
3454 struct llapi_ladvise_hdr *k_ladvise_hdr;
3455 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3458 int alloc_size = sizeof(*k_ladvise_hdr);
3461 u_ladvise_hdr = (void __user *)arg;
3462 OBD_ALLOC_PTR(k_ladvise_hdr);
3463 if (k_ladvise_hdr == NULL)
3466 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3467 GOTO(out_ladvise, rc = -EFAULT);
3469 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3470 k_ladvise_hdr->lah_count < 1)
3471 GOTO(out_ladvise, rc = -EINVAL);
3473 num_advise = k_ladvise_hdr->lah_count;
3474 if (num_advise >= LAH_COUNT_MAX)
3475 GOTO(out_ladvise, rc = -EFBIG);
3477 OBD_FREE_PTR(k_ladvise_hdr);
3478 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3479 lah_advise[num_advise]);
3480 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3481 if (k_ladvise_hdr == NULL)
3485 * TODO: submit multiple advices to one server in a single RPC
3487 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3488 GOTO(out_ladvise, rc = -EFAULT);
3490 for (i = 0; i < num_advise; i++) {
3491 struct llapi_lu_ladvise *k_ladvise =
3492 &k_ladvise_hdr->lah_advise[i];
3493 struct llapi_lu_ladvise __user *u_ladvise =
3494 &u_ladvise_hdr->lah_advise[i];
3496 rc = ll_ladvise_sanity(inode, k_ladvise);
3498 GOTO(out_ladvise, rc);
3500 switch (k_ladvise->lla_advice) {
3501 case LU_LADVISE_LOCKNOEXPAND:
3502 rc = ll_lock_noexpand(file,
3503 k_ladvise->lla_peradvice_flags);
3504 GOTO(out_ladvise, rc);
3505 case LU_LADVISE_LOCKAHEAD:
3507 rc = ll_file_lock_ahead(file, k_ladvise);
3510 GOTO(out_ladvise, rc);
3513 &u_ladvise->lla_lockahead_result))
3514 GOTO(out_ladvise, rc = -EFAULT);
3517 rc = ll_ladvise(inode, file,
3518 k_ladvise_hdr->lah_flags,
3521 GOTO(out_ladvise, rc);
3528 OBD_FREE(k_ladvise_hdr, alloc_size);
3531 case LL_IOC_FLR_SET_MIRROR: {
3532 /* mirror I/O must be direct to avoid polluting page cache
3534 if (!(file->f_flags & O_DIRECT))
3537 fd->fd_designated_mirror = (__u32)arg;
3540 case LL_IOC_FSGETXATTR:
3541 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3542 case LL_IOC_FSSETXATTR:
3543 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3545 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3547 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3548 (void __user *)arg));
3552 #ifndef HAVE_FILE_LLSEEK_SIZE
3553 static inline loff_t
3554 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3556 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3558 if (offset > maxsize)
3561 if (offset != file->f_pos) {
3562 file->f_pos = offset;
3563 file->f_version = 0;
3569 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3570 loff_t maxsize, loff_t eof)
3572 struct inode *inode = file_inode(file);
3580 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3581 * position-querying operation. Avoid rewriting the "same"
3582 * f_pos value back to the file because a concurrent read(),
3583 * write() or lseek() might have altered it
3588 * f_lock protects against read/modify/write race with other
3589 * SEEK_CURs. Note that parallel writes and reads behave
3593 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3594 inode_unlock(inode);
3598 * In the generic case the entire file is data, so as long as
3599 * offset isn't at the end of the file then the offset is data.
3606 * There is a virtual hole at the end of the file, so as long as
3607 * offset isn't i_size or larger, return i_size.
3615 return llseek_execute(file, offset, maxsize);
3619 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3621 struct inode *inode = file_inode(file);
3622 loff_t retval, eof = 0;
3625 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3626 (origin == SEEK_CUR) ? file->f_pos : 0);
3627 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3628 PFID(ll_inode2fid(inode)), inode, retval, retval,
3630 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3632 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3633 retval = ll_glimpse_size(inode);
3636 eof = i_size_read(inode);
3639 retval = ll_generic_file_llseek_size(file, offset, origin,
3640 ll_file_maxbytes(inode), eof);
3644 static int ll_flush(struct file *file, fl_owner_t id)
3646 struct inode *inode = file_inode(file);
3647 struct ll_inode_info *lli = ll_i2info(inode);
3648 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3651 LASSERT(!S_ISDIR(inode->i_mode));
3653 /* catch async errors that were recorded back when async writeback
3654 * failed for pages in this mapping. */
3655 rc = lli->lli_async_rc;
3656 lli->lli_async_rc = 0;
3657 if (lli->lli_clob != NULL) {
3658 err = lov_read_and_clear_async_rc(lli->lli_clob);
3663 /* The application has already been told about the write failure.
3664 * Do not report the failure again. */
3665 if (fd->fd_write_failed)
3667 return rc ? -EIO : 0;
3671 * Called to make sure a portion of the file has been written out.
3672 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3674 * Return how many pages have been written.
3676 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3677 enum cl_fsync_mode mode, int ignore_layout)
3681 struct cl_fsync_io *fio;
3686 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3687 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3690 env = cl_env_get(&refcheck);
3692 RETURN(PTR_ERR(env));
3694 io = vvp_env_thread_io(env);
3695 io->ci_obj = ll_i2info(inode)->lli_clob;
3696 io->ci_ignore_layout = ignore_layout;
3698 /* initialize parameters for sync */
3699 fio = &io->u.ci_fsync;
3700 fio->fi_start = start;
3702 fio->fi_fid = ll_inode2fid(inode);
3703 fio->fi_mode = mode;
3704 fio->fi_nr_written = 0;
3706 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3707 result = cl_io_loop(env, io);
3709 result = io->ci_result;
3711 result = fio->fi_nr_written;
3712 cl_io_fini(env, io);
3713 cl_env_put(env, &refcheck);
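/*
 * Illustrative sketch (editor's addition): flushing a whole file locally
 * (CL_FSYNC_LOCAL, so no OST_SYNC RPC) while ignoring layout changes.  A
 * non-negative return value is the number of pages written; "inode" is
 * assumed to come from the caller:
 *
 *	int nr = cl_sync_file_range(inode, 0, LLONG_MAX, CL_FSYNC_LOCAL, 1);
 *
 *	if (nr < 0)
 *		CERROR("sync failed: rc = %d\n", nr);
 */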
3719 * When dentry is provided (the 'else' case), file_dentry() may be
3720 * null and dentry must be used directly rather than pulled from
3721 * file_dentry() as is done otherwise.
3724 #ifdef HAVE_FILE_FSYNC_4ARGS
3725 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3727 struct dentry *dentry = file_dentry(file);
3729 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3730 int ll_fsync(struct file *file, int datasync)
3732 struct dentry *dentry = file_dentry(file);
3734 loff_t end = LLONG_MAX;
3736 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3739 loff_t end = LLONG_MAX;
3741 struct inode *inode = dentry->d_inode;
3742 struct ll_inode_info *lli = ll_i2info(inode);
3743 struct ptlrpc_request *req;
3747 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3748 PFID(ll_inode2fid(inode)), inode);
3749 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3751 #ifdef HAVE_FILE_FSYNC_4ARGS
3752 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3753 lock_inode = !lli->lli_inode_locked;
3757 /* fsync's caller has already called _fdata{sync,write}, we want
3758 * that IO to finish before calling the osc and mdc sync methods */
3759 rc = filemap_fdatawait(inode->i_mapping);
3762 /* catch async errors that were recorded back when async writeback
3763 * failed for pages in this mapping. */
3764 if (!S_ISDIR(inode->i_mode)) {
3765 err = lli->lli_async_rc;
3766 lli->lli_async_rc = 0;
3769 if (lli->lli_clob != NULL) {
3770 err = lov_read_and_clear_async_rc(lli->lli_clob);
3776 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3780 ptlrpc_req_finished(req);
3782 if (S_ISREG(inode->i_mode)) {
3783 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3785 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3786 if (rc == 0 && err < 0)
3789 fd->fd_write_failed = true;
3791 fd->fd_write_failed = false;
3794 #ifdef HAVE_FILE_FSYNC_4ARGS
3796 inode_unlock(inode);
3802 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3804 struct inode *inode = file_inode(file);
3805 struct ll_sb_info *sbi = ll_i2sbi(inode);
3806 struct ldlm_enqueue_info einfo = {
3807 .ei_type = LDLM_FLOCK,
3808 .ei_cb_cp = ldlm_flock_completion_ast,
3809 .ei_cbdata = file_lock,
3811 struct md_op_data *op_data;
3812 struct lustre_handle lockh = { 0 };
3813 union ldlm_policy_data flock = { { 0 } };
3814 int fl_type = file_lock->fl_type;
3820 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3821 PFID(ll_inode2fid(inode)), file_lock);
3823 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3825 if (file_lock->fl_flags & FL_FLOCK) {
3826 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3827 /* flocks are whole-file locks */
3828 flock.l_flock.end = OFFSET_MAX;
3829 /* For flocks the owner is determined by the local file descriptor */
3830 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3831 } else if (file_lock->fl_flags & FL_POSIX) {
3832 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3833 flock.l_flock.start = file_lock->fl_start;
3834 flock.l_flock.end = file_lock->fl_end;
3838 flock.l_flock.pid = file_lock->fl_pid;
3840 /* Somewhat ugly workaround for svc lockd.
3841 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3842 * that the fl_owner is the same (which it always is on the local node,
3843 * presumably between lockd processes) and then compares the pid.
3844 * As such we assign the pid to the owner field to make it all work;
3845 * conflict with normal locks is unlikely since the pid space and the
3846 * pointer space for current->files do not intersect. */
3847 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3848 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3852 einfo.ei_mode = LCK_PR;
3855 /* An unlock request may or may not have any relation to
3856 * existing locks so we may not be able to pass a lock handle
3857 * via a normal ldlm_lock_cancel() request. The request may even
3858 * unlock a byte range in the middle of an existing lock. In
3859 * order to process an unlock request we need all of the same
3860 * information that is given with a normal read or write record
3861 * lock request. To avoid creating another ldlm unlock (cancel)
3862 * message we'll treat a LCK_NL flock request as an unlock. */
3863 einfo.ei_mode = LCK_NL;
3866 einfo.ei_mode = LCK_PW;
3869 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3884 flags = LDLM_FL_BLOCK_NOWAIT;
3890 flags = LDLM_FL_TEST_LOCK;
3893 CERROR("unknown fcntl lock command: %d\n", cmd);
3897 /* Save the old mode so that if the mode in the lock changes we
3898 * can decrement the appropriate reader or writer refcount. */
3899 file_lock->fl_type = einfo.ei_mode;
3901 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3902 LUSTRE_OPC_ANY, NULL);
3903 if (IS_ERR(op_data))
3904 RETURN(PTR_ERR(op_data));
3906 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3907 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3908 flock.l_flock.pid, flags, einfo.ei_mode,
3909 flock.l_flock.start, flock.l_flock.end);
3911 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3914 /* Restore the file lock type if not TEST lock. */
3915 if (!(flags & LDLM_FL_TEST_LOCK))
3916 file_lock->fl_type = fl_type;
3918 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3919 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3920 !(flags & LDLM_FL_TEST_LOCK))
3921 rc2 = locks_lock_file_wait(file, file_lock);
3923 if ((file_lock->fl_flags & FL_FLOCK) &&
3924 (rc == 0 || file_lock->fl_type == F_UNLCK))
3925 rc2 = flock_lock_file_wait(file, file_lock);
3926 if ((file_lock->fl_flags & FL_POSIX) &&
3927 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3928 !(flags & LDLM_FL_TEST_LOCK))
3929 rc2 = posix_lock_file_wait(file, file_lock);
3930 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3932 if (rc2 && file_lock->fl_type != F_UNLCK) {
3933 einfo.ei_mode = LCK_NL;
3934 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3939 ll_finish_md_op_data(op_data);
3944 int ll_get_fid_by_name(struct inode *parent, const char *name,
3945 int namelen, struct lu_fid *fid,
3946 struct inode **inode)
3948 struct md_op_data *op_data = NULL;
3949 struct mdt_body *body;
3950 struct ptlrpc_request *req;
3954 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3955 LUSTRE_OPC_ANY, NULL);
3956 if (IS_ERR(op_data))
3957 RETURN(PTR_ERR(op_data));
3959 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3960 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3961 ll_finish_md_op_data(op_data);
3965 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3967 GOTO(out_req, rc = -EFAULT);
3969 *fid = body->mbo_fid1;
3972 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3974 ptlrpc_req_finished(req);
3978 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3979 const char *name, int namelen)
3981 struct dentry *dchild = NULL;
3982 struct inode *child_inode = NULL;
3983 struct md_op_data *op_data;
3984 struct ptlrpc_request *request = NULL;
3985 struct obd_client_handle *och = NULL;
3987 struct mdt_body *body;
3989 __u64 data_version = 0;
3992 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3993 name, PFID(ll_inode2fid(parent)), mdtidx);
3995 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3996 0, LUSTRE_OPC_ANY, NULL);
3997 if (IS_ERR(op_data))
3998 RETURN(PTR_ERR(op_data));
4000 /* Get child FID first */
4001 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4004 dchild = d_lookup(file_dentry(file), &qstr);
4005 if (dchild != NULL) {
4006 if (dchild->d_inode != NULL)
4007 child_inode = igrab(dchild->d_inode);
4011 if (child_inode == NULL) {
4012 rc = ll_get_fid_by_name(parent, name, namelen,
4013 &op_data->op_fid3, &child_inode);
4018 if (child_inode == NULL)
4019 GOTO(out_free, rc = -EINVAL);
4022 * lfs migrate command needs to be blocked on the client
4023 * by checking the migrate FID against the FID of the
4026 if (child_inode == parent->i_sb->s_root->d_inode)
4027 GOTO(out_iput, rc = -EINVAL);
4029 inode_lock(child_inode);
4030 op_data->op_fid3 = *ll_inode2fid(child_inode);
4031 if (!fid_is_sane(&op_data->op_fid3)) {
4032 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4033 ll_get_fsname(parent->i_sb, NULL, 0), name,
4034 PFID(&op_data->op_fid3));
4035 GOTO(out_unlock, rc = -EINVAL);
4038 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4040 GOTO(out_unlock, rc);
4043 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4044 PFID(&op_data->op_fid3), mdtidx);
4045 GOTO(out_unlock, rc = 0);
4048 if (S_ISREG(child_inode->i_mode)) {
4049 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4053 GOTO(out_unlock, rc);
4056 rc = ll_data_version(child_inode, &data_version,
4059 GOTO(out_close, rc);
4061 op_data->op_handle = och->och_fh;
4062 op_data->op_data = och->och_mod;
4063 op_data->op_data_version = data_version;
4064 op_data->op_lease_handle = och->och_lease_handle;
4065 op_data->op_bias |= MDS_RENAME_MIGRATE;
4068 op_data->op_mds = mdtidx;
4069 op_data->op_cli_flags = CLI_MIGRATE;
4070 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4071 namelen, name, namelen, &request);
4073 LASSERT(request != NULL);
4074 ll_update_times(request, parent);
4076 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4077 LASSERT(body != NULL);
4079 /* If the server does release the layout lock, then we clean up
4080 * the client och here; otherwise release it in out_close: */
4082 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4083 obd_mod_put(och->och_mod);
4084 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4086 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4092 if (request != NULL) {
4093 ptlrpc_req_finished(request);
4097 /* Try again if the file layout has changed. */
4098 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4102 if (och != NULL) /* close the file */
4103 ll_lease_close(och, child_inode, NULL);
4105 clear_nlink(child_inode);
4107 inode_unlock(child_inode);
4111 ll_finish_md_op_data(op_data);
4116 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4124 * Test if some locks matching bits and l_req_mode are acquired
4125 * - bits can be in different locks
4126 * - if found, clear the common lock bits in *bits
4127 * - the bits not found are kept in *bits
4129 * \param bits [IN] searched lock bits
4130 * \param l_req_mode [IN] searched lock mode
4131 * \retval boolean, true iff all bits are found
4133 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4135 struct lustre_handle lockh;
4136 union ldlm_policy_data policy;
4137 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4138 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4147 fid = &ll_i2info(inode)->lli_fid;
4148 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4149 ldlm_lockname[mode]);
4151 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4152 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4153 policy.l_inodebits.bits = *bits & (1 << i);
4154 if (policy.l_inodebits.bits == 0)
4157 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4158 &policy, mode, &lockh)) {
4159 struct ldlm_lock *lock;
4161 lock = ldlm_handle2lock(&lockh);
4164 ~(lock->l_policy_data.l_inodebits.bits);
4165 LDLM_LOCK_PUT(lock);
4167 *bits &= ~policy.l_inodebits.bits;
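/*
 * Illustrative sketch (editor's addition): checking whether both the LOOKUP
 * and UPDATE inodebits are already covered by granted MD locks of any mode.
 * On return, "bits" keeps only the bits that were NOT found.  The
 * MDS_INODELOCK_LOOKUP/UPDATE names are the usual inodebits spellings; treat
 * them as an assumption if this tree differs:
 *
 *	__u64 bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
 *
 *	if (ll_have_md_lock(inode, &bits, LCK_MINMODE))
 *		CDEBUG(D_INFO, "both bits already held\n");
 */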
4174 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4175 struct lustre_handle *lockh, __u64 flags,
4176 enum ldlm_mode mode)
4178 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4183 fid = &ll_i2info(inode)->lli_fid;
4184 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4186 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4187 fid, LDLM_IBITS, &policy, mode, lockh);
4192 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4194 /* Already unlinked. Just update nlink and return success */
4195 if (rc == -ENOENT) {
4197 /* If it is a striped directory and there is a bad stripe,
4198 * let's revalidate the dentry again instead of returning
4200 if (S_ISDIR(inode->i_mode) &&
4201 ll_i2info(inode)->lli_lsm_md != NULL)
4204 /* This path cannot be hit for regular files unless in
4205 * case of obscure races, so no need to validate
4207 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4209 } else if (rc != 0) {
4210 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4211 "%s: revalidate FID "DFID" error: rc = %d\n",
4212 ll_get_fsname(inode->i_sb, NULL, 0),
4213 PFID(ll_inode2fid(inode)), rc);
4219 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4221 struct inode *inode = dentry->d_inode;
4222 struct obd_export *exp = ll_i2mdexp(inode);
4223 struct lookup_intent oit = {
4226 struct ptlrpc_request *req = NULL;
4227 struct md_op_data *op_data;
4231 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4232 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4234 /* Call getattr by fid, so do not provide name at all. */
4235 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4236 LUSTRE_OPC_ANY, NULL);
4237 if (IS_ERR(op_data))
4238 RETURN(PTR_ERR(op_data));
4240 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4241 ll_finish_md_op_data(op_data);
4243 rc = ll_inode_revalidate_fini(inode, rc);
4247 rc = ll_revalidate_it_finish(req, &oit, dentry);
4249 ll_intent_release(&oit);
4253 /* Unlinked? Unhash dentry, so it is not picked up later by
4254 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4255 * here to preserve get_cwd functionality on 2.6.
4257 if (!dentry->d_inode->i_nlink) {
4258 ll_lock_dcache(inode);
4259 d_lustre_invalidate(dentry, 0);
4260 ll_unlock_dcache(inode);
4263 ll_lookup_finish_locks(&oit, dentry);
4265 ptlrpc_req_finished(req);
4270 static int ll_merge_md_attr(struct inode *inode)
4272 struct cl_attr attr = { 0 };
4275 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4276 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4277 &attr, ll_md_blocking_ast);
4281 set_nlink(inode, attr.cat_nlink);
4282 inode->i_blocks = attr.cat_blocks;
4283 i_size_write(inode, attr.cat_size);
4285 ll_i2info(inode)->lli_atime = attr.cat_atime;
4286 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4287 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4292 static inline dev_t ll_compat_encode_dev(dev_t dev)
4294 /* The compat_sys_*stat*() syscalls will fail unless the
4295 * device majors and minors are both less than 256. Note that
4296 * the value returned here will be passed through
4297 * old_encode_dev() in cp_compat_stat(). And so we are not
4298 * trying to return a valid compat (u16) device number, just
4299 * one that will pass the old_valid_dev() check. */
4301 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
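/*
 * Worked example (editor's addition): a device with major 0x12c and minor
 * 0x345 is folded to MKDEV(0x2c, 0x45) here, i.e. both numbers are truncated
 * to 8 bits purely so that old_valid_dev() accepts the value; the result is
 * not meant to be a faithful device number.
 */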
4304 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4305 int ll_getattr(const struct path *path, struct kstat *stat,
4306 u32 request_mask, unsigned int flags)
4308 struct dentry *de = path->dentry;
4310 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4313 struct inode *inode = de->d_inode;
4314 struct ll_sb_info *sbi = ll_i2sbi(inode);
4315 struct ll_inode_info *lli = ll_i2info(inode);
4318 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4320 rc = ll_inode_revalidate(de, IT_GETATTR);
4324 if (S_ISREG(inode->i_mode)) {
4325 /* In case of restore, the MDT has the right size and has
4326 * already sent it back without granting the layout lock;
4327 * the inode is up-to-date so a glimpse is useless.
4328 * Also to glimpse we need the layout, in case of a running
4329 * restore the MDT holds the layout lock so the glimpse will
4330 * block up to the end of restore (getattr will block)
4332 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4333 rc = ll_glimpse_size(inode);
4338 /* If the object isn't a regular file then don't validate its size. */
4339 if (S_ISDIR(inode->i_mode) &&
4340 lli->lli_lsm_md != NULL) {
4341 rc = ll_merge_md_attr(inode);
4346 LTIME_S(inode->i_atime) = lli->lli_atime;
4347 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4348 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4351 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4353 if (ll_need_32bit_api(sbi)) {
4354 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4355 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4356 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4358 stat->ino = inode->i_ino;
4359 stat->dev = inode->i_sb->s_dev;
4360 stat->rdev = inode->i_rdev;
4363 stat->mode = inode->i_mode;
4364 stat->uid = inode->i_uid;
4365 stat->gid = inode->i_gid;
4366 stat->atime = inode->i_atime;
4367 stat->mtime = inode->i_mtime;
4368 stat->ctime = inode->i_ctime;
4369 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4371 stat->nlink = inode->i_nlink;
4372 stat->size = i_size_read(inode);
4373 stat->blocks = inode->i_blocks;
4378 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4379 __u64 start, __u64 len)
4383 struct fiemap *fiemap;
4384 unsigned int extent_count = fieinfo->fi_extents_max;
4386 num_bytes = sizeof(*fiemap) + (extent_count *
4387 sizeof(struct fiemap_extent));
4388 OBD_ALLOC_LARGE(fiemap, num_bytes);
4393 fiemap->fm_flags = fieinfo->fi_flags;
4394 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4395 fiemap->fm_start = start;
4396 fiemap->fm_length = len;
4397 if (extent_count > 0 &&
4398 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4399 sizeof(struct fiemap_extent)) != 0)
4400 GOTO(out, rc = -EFAULT);
4402 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4404 fieinfo->fi_flags = fiemap->fm_flags;
4405 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4406 if (extent_count > 0 &&
4407 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4408 fiemap->fm_mapped_extents *
4409 sizeof(struct fiemap_extent)) != 0)
4410 GOTO(out, rc = -EFAULT);
4412 OBD_FREE_LARGE(fiemap, num_bytes);
4416 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4418 struct ll_inode_info *lli = ll_i2info(inode);
4419 struct posix_acl *acl = NULL;
4422 spin_lock(&lli->lli_lock);
4423 /* VFS' acl_permission_check->check_acl will release the refcount */
4424 acl = posix_acl_dup(lli->lli_posix_acl);
4425 spin_unlock(&lli->lli_lock);
4430 #ifdef HAVE_IOP_SET_ACL
4431 #ifdef CONFIG_FS_POSIX_ACL
4432 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4434 const char *name = NULL;
4441 case ACL_TYPE_ACCESS:
4443 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4447 name = XATTR_NAME_POSIX_ACL_ACCESS;
4449 case ACL_TYPE_DEFAULT:
4450 if (!S_ISDIR(inode->i_mode))
4451 GOTO(out, rc = acl ? -EACCES : 0);
4452 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4455 GOTO(out, rc = -EINVAL);
4459 size = posix_acl_xattr_size(acl->a_count);
4460 value = kmalloc(size, GFP_NOFS);
4462 GOTO(out, rc = -ENOMEM);
4464 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4469 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4470 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4475 set_cached_acl(inode, type, acl);
4477 forget_cached_acl(inode, type);
4480 #endif /* CONFIG_FS_POSIX_ACL */
4481 #endif /* HAVE_IOP_SET_ACL */
4483 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4485 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4486 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4488 ll_check_acl(struct inode *inode, int mask)
4491 # ifdef CONFIG_FS_POSIX_ACL
4492 struct posix_acl *acl;
4496 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4497 if (flags & IPERM_FLAG_RCU)
4500 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4505 rc = posix_acl_permission(inode, acl, mask);
4506 posix_acl_release(acl);
4509 # else /* !CONFIG_FS_POSIX_ACL */
4511 # endif /* CONFIG_FS_POSIX_ACL */
4513 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4515 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4516 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4518 # ifdef HAVE_INODE_PERMISION_2ARGS
4519 int ll_inode_permission(struct inode *inode, int mask)
4521 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4526 struct ll_sb_info *sbi;
4527 struct root_squash_info *squash;
4528 struct cred *cred = NULL;
4529 const struct cred *old_cred = NULL;
4531 bool squash_id = false;
4534 #ifdef MAY_NOT_BLOCK
4535 if (mask & MAY_NOT_BLOCK)
4537 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4538 if (flags & IPERM_FLAG_RCU)
4542 /* as the root inode is NOT validated in the lookup operation,
4543 * we need to do it before the permission check. */
4545 if (inode == inode->i_sb->s_root->d_inode) {
4546 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4551 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4552 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4554 /* squash fsuid/fsgid if needed */
4555 sbi = ll_i2sbi(inode);
4556 squash = &sbi->ll_squash;
4557 if (unlikely(squash->rsi_uid != 0 &&
4558 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4559 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4563 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4564 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4565 squash->rsi_uid, squash->rsi_gid);
4567 /* update current process's credentials
4568 * and FS capability */
4569 cred = prepare_creds();
4573 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4574 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4575 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4576 if ((1 << cap) & CFS_CAP_FS_MASK)
4577 cap_lower(cred->cap_effective, cap);
4579 old_cred = override_creds(cred);
4582 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4583 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4584 /* restore current process's credentials and FS capability */
4586 revert_creds(old_cred);
4593 /* -o localflock - only provides locally consistent flock locks */
4594 struct file_operations ll_file_operations = {
4595 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4596 # ifdef HAVE_SYNC_READ_WRITE
4597 .read = new_sync_read,
4598 .write = new_sync_write,
4600 .read_iter = ll_file_read_iter,
4601 .write_iter = ll_file_write_iter,
4602 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4603 .read = ll_file_read,
4604 .aio_read = ll_file_aio_read,
4605 .write = ll_file_write,
4606 .aio_write = ll_file_aio_write,
4607 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4608 .unlocked_ioctl = ll_file_ioctl,
4609 .open = ll_file_open,
4610 .release = ll_file_release,
4611 .mmap = ll_file_mmap,
4612 .llseek = ll_file_seek,
4613 .splice_read = ll_file_splice_read,
4618 struct file_operations ll_file_operations_flock = {
4619 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4620 # ifdef HAVE_SYNC_READ_WRITE
4621 .read = new_sync_read,
4622 .write = new_sync_write,
4623 # endif /* HAVE_SYNC_READ_WRITE */
4624 .read_iter = ll_file_read_iter,
4625 .write_iter = ll_file_write_iter,
4626 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4627 .read = ll_file_read,
4628 .aio_read = ll_file_aio_read,
4629 .write = ll_file_write,
4630 .aio_write = ll_file_aio_write,
4631 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4632 .unlocked_ioctl = ll_file_ioctl,
4633 .open = ll_file_open,
4634 .release = ll_file_release,
4635 .mmap = ll_file_mmap,
4636 .llseek = ll_file_seek,
4637 .splice_read = ll_file_splice_read,
4640 .flock = ll_file_flock,
4641 .lock = ll_file_flock
4644 /* These are for -o noflock - to return ENOSYS on flock calls */
4645 struct file_operations ll_file_operations_noflock = {
4646 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4647 # ifdef HAVE_SYNC_READ_WRITE
4648 .read = new_sync_read,
4649 .write = new_sync_write,
4650 # endif /* HAVE_SYNC_READ_WRITE */
4651 .read_iter = ll_file_read_iter,
4652 .write_iter = ll_file_write_iter,
4653 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4654 .read = ll_file_read,
4655 .aio_read = ll_file_aio_read,
4656 .write = ll_file_write,
4657 .aio_write = ll_file_aio_write,
4658 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4659 .unlocked_ioctl = ll_file_ioctl,
4660 .open = ll_file_open,
4661 .release = ll_file_release,
4662 .mmap = ll_file_mmap,
4663 .llseek = ll_file_seek,
4664 .splice_read = ll_file_splice_read,
4667 .flock = ll_file_noflock,
4668 .lock = ll_file_noflock
4671 struct inode_operations ll_file_inode_operations = {
4672 .setattr = ll_setattr,
4673 .getattr = ll_getattr,
4674 .permission = ll_inode_permission,
4675 #ifdef HAVE_IOP_XATTR
4676 .setxattr = ll_setxattr,
4677 .getxattr = ll_getxattr,
4678 .removexattr = ll_removexattr,
4680 .listxattr = ll_listxattr,
4681 .fiemap = ll_fiemap,
4682 #ifdef HAVE_IOP_GET_ACL
4683 .get_acl = ll_get_acl,
4685 #ifdef HAVE_IOP_SET_ACL
4686 .set_acl = ll_set_acl,
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* Matching on this lock can only be allowed after the layout
		 * has been applied to the inode, otherwise a false layout
		 * would be seen. Applying the layout should happen before
		 * dropping the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);

		CDEBUG(D_INODE, DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}

	cl_env_put(env, &refcheck);

	RETURN(rc);
}
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	struct mdt_body *body;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* If the layout lock was granted right away, the layout is returned
	 * in the DLM LVB of the reply; otherwise, if the lock was ever
	 * blocked and then granted via a completion AST, we have to fetch
	 * the layout here. Note that we can't use the LVB buffer from the
	 * completion AST because it is not large enough. */
	rc = ll_get_default_mdsize(sbi, &lmmsize);

	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
			 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->mbo_eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);

	if (lvbdata != NULL)
		OBD_FREE_LARGE(lvbdata, lmmsize);

out:
	ptlrpc_req_finished(req);

	return rc;
}
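
/*
 * Once fetched, the layout remains attached to the DLM lock as its LVB, so a
 * subsequent ll_layout_lock_set() on the same lock can apply it to the inode
 * without another RPC to the MDT.
 */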
/**
 * Apply the layout to the inode. The layout lock is held and will be
 * released before this function returns.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* In case this is a caching lock, reinstate it with the new inode. */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* Checking lvb_ready is racy, but that is okay: the worst case is
	 * that multiple processes configure the file at the same time. */

	rc = ll_layout_fetch(inode, lock);

	/* For a layout lock, the LOV EA (lmm) is stored in the lock's LVB;
	 * lvb_data is immutable while the lock is held, so it is safe to
	 * access it without taking the resource lock.
	 *
	 * Set the layout on the file. This is unlikely to fail, as the old
	 * layout has surely been eliminated. */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
	}

	RETURN(rc);
}
/**
 * Issue layout intent RPC to MDS.
 * \param inode [in]	file inode
 * \param intent [in]	layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set lock data in case this is a new lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);

	RETURN(rc);
}
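
/*
 * Layout lock call flow in this file: ll_layout_refresh() first tries to
 * match a cached MDS_INODELOCK_LAYOUT lock; on a miss it enqueues one with a
 * layout intent RPC via ll_layout_intent(). ll_layout_lock_set() then takes
 * the layout from the lock's LVB (fetching it with ll_layout_fetch() when
 * necessary) and installs it through ll_layout_conf().
 */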
/**
 * Check whether a LAYOUT lock exists on the client side, and enqueue one if
 * it is not already cached.
 *
 * This function does not hold the layout lock, so the lock may be revoked at
 * any time after it returns; any operation that depends on the layout should
 * then be redone.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version. The caller should save the version number and, after the
 * IO has finished, call this function again to verify that the layout has
 * not changed in the meantime.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* Usually the layout lock is cached on the local side, so try
		 * to match it before enqueuing a new one. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
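
/*
 * A minimal usage sketch (hypothetical caller, kept compiled out): save the
 * layout generation before starting an IO, then call ll_layout_refresh()
 * again afterwards to detect a concurrent layout change.
 */
#if 0
static int ll_layout_refresh_example(struct inode *inode)
{
	__u32 gen_before;
	__u32 gen_after;
	int rc;

	rc = ll_layout_refresh(inode, &gen_before);
	if (rc != 0)
		return rc;

	/* ... submit the IO against the layout tagged with gen_before ... */

	rc = ll_layout_refresh(inode, &gen_after);
	if (rc != 0)
		return rc;

	/* The layout changed while the IO was running; the caller redoes it. */
	if (gen_before != gen_after)
		return -EAGAIN;

	return 0;
}
#endif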
/**
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 *
 * \param[in] inode	file inode.
 * \param[in] ext	write range with the start offset of the file in bytes
 *			where an IO is about to write, and the exclusive end
 *			offset in bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};

	rc = ll_layout_intent(inode, &intent);

	RETURN(rc);
}
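
/*
 * Callers in the write and truncate paths are expected to issue this intent
 * for the byte range they are about to touch so that the MDT can instantiate
 * the matching layout components before the IO starts; the actual call sites
 * live outside this file.
 */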
/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,