4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2016, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
48 #include <lustre/ll_fiemap.h>
50 #include <uapi/linux/lustre_ioctl.h>
51 #include <lustre_swab.h>
53 #include "cl_object.h"
54 #include "llite_internal.h"
55 #include "vvp_internal.h"
58 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
60 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
63 static enum llioc_iter
64 ll_iocontrol_call(struct inode *inode, struct file *file,
65 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a struct ll_file_data from its dedicated slab cache (GFP_NOFS to
 * avoid filesystem re-entry during reclaim) and reset the write-failure flag.
 * NOTE(review): the allocation-failure check and return statement are not
 * visible in this excerpt — presumably NULL is returned on OOM; confirm
 * against the full source.
 */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/*
 * Release a struct ll_file_data back to its slab cache.
 * Counterpart of ll_file_data_get().
 */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
/* Initialize op_data against this inode; no name/namelen needed for close. */
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client-side inode attributes so the MDT sees the final
 * mode/timestamps/size as of close time. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark which attributes are valid; *_SET flags force the exact values
 * rather than letting the server choose its own timestamps. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle on the MDT is being closed. */
107 op_data->op_handle = och->och_fh;
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* NOTE(review): this excerpt is missing several lines of the original
 * function (the bias==0 branch, switch header, some GOTO/label lines);
 * the comments below describe only what is visible here. */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Bail out loudly if the MDC export has no backing OBD device. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
/* Pack final inode attributes and the open handle into op_data. */
150 ll_prepare_close(inode, op_data, och);
152 case MDS_CLOSE_LAYOUT_SWAP:
153 LASSERT(data != NULL);
/* data is the second inode; its FID goes in op_fid2 so the MDT can
 * swap layouts between the two files at close time. */
154 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
155 op_data->op_data_version = 0;
156 op_data->op_lease_handle = och->och_lease_handle;
157 op_data->op_fid2 = *ll_inode2fid(data);
160 case MDS_HSM_RELEASE:
161 LASSERT(data != NULL);
/* data is the expected data version; the MDT releases the file only
 * if it still matches, and size/blocks are made authoritative. */
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *(__u64 *)data;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
169 LASSERT(data == NULL);
/* Send the CLOSE rpc; -EINTR is expected (signal) and not logged. */
173 rc = md_close(md_exp, op_data, och->och_mod, &req);
174 if (rc != 0 && rc != -EINTR)
175 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
176 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* For biased closes, check whether the server actually executed the
 * close intent (release/swap); the reply body carries the flag. */
179 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
180 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
187 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 ptlrpc_req_finished(req); /* This is close request */
/*
 * Really close the last MDS open handle of the given open mode
 * (write/exec/read) for this inode, if no users remain.
 * NOTE(review): lines that swap/clear *och_p under the mutex are missing
 * from this excerpt; the race comment below refers to that hidden logic.
 */
199 int ll_md_real_close(struct inode *inode, fmode_t fmode)
201 struct ll_inode_info *lli = ll_i2info(inode);
202 struct obd_client_handle **och_p;
203 struct obd_client_handle *och;
/* Select the handle slot and use counter matching the open mode. */
208 if (fmode & FMODE_WRITE) {
209 och_p = &lli->lli_mds_write_och;
210 och_usecount = &lli->lli_open_fd_write_count;
211 } else if (fmode & FMODE_EXEC) {
212 och_p = &lli->lli_mds_exec_och;
213 och_usecount = &lli->lli_open_fd_exec_count;
215 LASSERT(fmode & FMODE_READ);
216 och_p = &lli->lli_mds_read_och;
217 och_usecount = &lli->lli_open_fd_read_count;
220 mutex_lock(&lli->lli_och_mutex);
221 if (*och_usecount > 0) {
222 /* There are still users of this handle, so skip
224 mutex_unlock(&lli->lli_och_mutex);
230 mutex_unlock(&lli->lli_och_mutex);
233 /* There might be a race and this handle may already
235 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Per-struct-file close: drop group lock / lease / private open handle,
 * decrement the open-mode use count, and close the MDS handle unless a
 * cached OPEN lock lets us skip the RPC. Frees the ll_file_data.
 */
241 static int ll_md_close(struct inode *inode, struct file *file)
/* Match only the OPEN inodebit; TEST_LOCK means "check, don't take a ref". */
243 union ldlm_policy_data policy = {
244 .l_inodebits = { MDS_INODELOCK_OPEN },
246 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
247 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
248 struct ll_inode_info *lli = ll_i2info(inode);
249 struct lustre_handle lockh;
250 enum ldlm_mode lockmode;
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
258 if (fd->fd_lease_och != NULL) {
261 /* Usually the lease is not released when the
262 * application crashed, we need to release here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
267 fd->fd_lease_och = NULL;
/* fd_och holds a private open handle taken over for a lease; close it. */
270 if (fd->fd_och != NULL) {
271 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 mutex_lock(&lli->lli_och_mutex);
279 if (fd->fd_omode & FMODE_WRITE) {
281 LASSERT(lli->lli_open_fd_write_count);
282 lli->lli_open_fd_write_count--;
283 } else if (fd->fd_omode & FMODE_EXEC) {
285 LASSERT(lli->lli_open_fd_exec_count);
286 lli->lli_open_fd_exec_count--;
289 LASSERT(lli->lli_open_fd_read_count);
290 lli->lli_open_fd_read_count--;
292 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock -> must send the real close to the MDS.
 * NOTE(review): the lines setting 'lockmode' are missing from this
 * excerpt; presumably chosen from fd_omode above — confirm in full source. */
294 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
295 LDLM_IBITS, &policy, lockmode, &lockh))
296 rc = ll_md_real_close(inode, fd->fd_omode);
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 /* While this returns an error code, fput() the caller does not, so we need
306 * to make every effort to clean up all of our state here. Also, applications
307 * rarely check close errors and even if an error is returned they will not
308 * re-try the close call.
310 int ll_file_release(struct inode *inode, struct file *file)
312 struct ll_file_data *fd;
313 struct ll_sb_info *sbi = ll_i2sbi(inode);
314 struct ll_inode_info *lli = ll_i2info(inode);
318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
319 PFID(ll_inode2fid(inode)), inode);
/* Don't count releases of the filesystem root in the stats. */
321 if (inode->i_sb->s_root != file_dentry(file))
322 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
323 fd = LUSTRE_FPRIVATE(file);
326 /* The last ref on @file, maybe not the the owner pid of statahead,
327 * because parent and child process can share the same file handle. */
328 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
329 ll_deauthorize_statahead(inode, fd);
/* The root dentry has no MDS open handle: just free fd and return. */
331 if (inode->i_sb->s_root == file_dentry(file)) {
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
/* Propagate any async write errors recorded on the cl_object so the
 * application sees them from close(). */
337 if (!S_ISDIR(inode->i_mode)) {
338 if (lli->lli_clob != NULL)
339 lov_read_and_clear_async_rc(lli->lli_clob);
340 lli->lli_async_rc = 0;
343 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on release. */
345 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
346 libcfs_debug_dumplog();
/*
 * Send an intent OPEN (by FID) to the MDS for dentry @de, optionally
 * packing the name (for servers without OBD_CONNECT_OPEN_BY_FID) and a
 * layout (@lmm/@lmmsize). On success, updates/fills the inode from the
 * reply and installs the returned lock data.
 */
351 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
352 struct lookup_intent *itp)
354 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
355 struct dentry *parent = de->d_parent;
356 const char *name = NULL;
358 struct md_op_data *op_data;
359 struct ptlrpc_request *req = NULL;
363 LASSERT(parent != NULL);
364 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
366 /* if server supports open-by-fid, or file name is invalid, don't pack
367 * name in open request */
368 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
369 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
370 name = de->d_name.name;
371 len = de->d_name.len;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
375 name, len, 0, LUSTRE_OPC_ANY, NULL);
377 RETURN(PTR_ERR(op_data));
/* Hand the (optional) striping layout to the open request. */
378 op_data->op_data = lmm;
379 op_data->op_data_size = lmmsize;
381 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
382 &ll_md_blocking_ast, 0);
383 ll_finish_md_op_data(op_data);
385 /* reason for keep own exit path - don`t flood log
386 * with messages with -ESTALE errors.
/* If the server granted an open we don't need, release the handle. */
388 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
389 it_open_error(DISP_OPEN_OPEN, itp))
391 ll_release_openhandle(de, itp);
395 if (it_disposition(itp, DISP_LOOKUP_NEG))
396 GOTO(out, rc = -ENOENT);
398 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
399 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
400 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the reply and remember the granted lock. */
404 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
405 if (!rc && itp->it_lock_mode)
406 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
409 ptlrpc_req_finished(req);
410 ll_intent_drop_lock(itp);
412 /* We did open by fid, but by the time we got to the server,
413 * the object disappeared. If this is a create, we cannot really
414 * tell the userspace that the file it was trying to create
415 * does not exist. Instead let's return -ESTALE, and the VFS will
416 * retry the create with LOOKUP_REVAL that we are going to catch
417 * in ll_revalidate_dentry() and use lookup then.
419 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/*
 * Fill an obd_client_handle from the MDT reply body carried in intent @it
 * (open handle, FID, lease lock cookie, open flags) and register it for
 * open replay in case of MDS recovery.
 */
425 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
426 struct obd_client_handle *och)
428 struct mdt_body *body;
430 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
431 och->och_fh = body->mbo_handle;
432 och->och_fid = body->mbo_fid1;
433 och->och_lease_handle.cookie = it->it_lock_handle;
434 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
435 och->och_flags = it->it_flags;
437 return md_set_open_replay_data(md_exp, och, it);
/*
 * Attach client-local open state to @file: fill @och (if given) from the
 * intent reply, install @fd as file->private_data, and initialize
 * readahead and cl_context bookkeeping. @och may be NULL when the open
 * reuses an existing MDS handle.
 */
440 static int ll_local_open(struct file *file, struct lookup_intent *it,
441 struct ll_file_data *fd, struct obd_client_handle *och)
443 struct inode *inode = file_inode(file);
446 LASSERT(!LUSTRE_FPRIVATE(file));
453 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
458 LUSTRE_FPRIVATE(file) = fd;
459 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the close path. */
460 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
462 /* ll_cl_context initialize */
463 rwlock_init(&fd->fd_lock);
464 INIT_LIST_HEAD(&fd->fd_lccs);
469 /* Open a file, and (for the very first open) create objects on the OSTs at
470 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
471 * creation or open until ll_lov_setstripe() ioctl is called.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
481 *
481 * NOTE(review): several control-flow lines (restart label, error paths,
481 * some braces) are missing from this excerpt; comments below annotate
481 * only the visible logic.
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct obd_client_handle **och_p = NULL;
488 __u64 *och_usecount = NULL;
489 struct ll_file_data *fd;
493 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
494 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
496 it = file->private_data; /* XXX: compat macro */
497 file->private_data = NULL; /* prevent ll_local_open assertion */
499 fd = ll_file_data_get();
501 GOTO(out_openerr, rc = -ENOMEM);
504 if (S_ISDIR(inode->i_mode))
505 ll_authorize_statahead(inode, fd);
/* Opening the fs root needs no MDS open handle. */
507 if (inode->i_sb->s_root == file_dentry(file)) {
508 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own (oit) from f_flags. */
512 if (!it || !it->it_disposition) {
513 /* Convert f_flags into access mode. We cannot use file->f_mode,
514 * because everything but O_ACCMODE mask was stripped from
/* O_RDONLY=0,O_WRONLY=1,O_RDWR=2 -> +1 maps onto FMODE_READ/WRITE bits. */
516 if ((oit.it_flags + 1) & O_ACCMODE)
518 if (file->f_flags & O_TRUNC)
519 oit.it_flags |= FMODE_WRITE;
521 /* kernel only call f_op->open in dentry_open. filp_open calls
522 * dentry_open after call to open_namei that checks permissions.
523 * Only nfsd_open call dentry_open directly without checking
524 * permissions and because of that this code below is safe. */
525 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
526 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
528 /* We do not want O_EXCL here, presumably we opened the file
529 * already? XXX - NFS implications? */
530 oit.it_flags &= ~O_EXCL;
532 /* bug20584, if "it_flags" contains O_CREAT, the file will be
533 * created if necessary, then "IT_CREAT" should be set to keep
534 * consistent with it */
535 if (oit.it_flags & O_CREAT)
536 oit.it_op |= IT_CREAT;
542 /* Let's see if we have file open on MDS already. */
543 if (it->it_flags & FMODE_WRITE) {
544 och_p = &lli->lli_mds_write_och;
545 och_usecount = &lli->lli_open_fd_write_count;
546 } else if (it->it_flags & FMODE_EXEC) {
547 och_p = &lli->lli_mds_exec_och;
548 och_usecount = &lli->lli_open_fd_exec_count;
550 och_p = &lli->lli_mds_read_och;
551 och_usecount = &lli->lli_open_fd_read_count;
554 mutex_lock(&lli->lli_och_mutex);
555 if (*och_p) { /* Open handle is present */
556 if (it_disposition(it, DISP_OPEN_OPEN)) {
557 /* Well, there's extra open request that we do not need,
558 let's close it somehow. This will decref request. */
559 rc = it_open_error(DISP_OPEN_OPEN, it);
561 mutex_unlock(&lli->lli_och_mutex);
562 GOTO(out_openerr, rc);
565 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached MDS handle: NULL och means "share existing". */
569 rc = ll_local_open(file, it, fd, NULL);
572 mutex_unlock(&lli->lli_och_mutex);
573 GOTO(out_openerr, rc);
576 LASSERT(*och_usecount == 0);
577 if (!it->it_disposition) {
578 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
579 /* We cannot just request lock handle now, new ELC code
580 means that one of other OPEN locks for this file
581 could be cancelled, and since blocking ast handler
582 would attempt to grab och_mutex as well, that would
583 result in a deadlock */
584 mutex_unlock(&lli->lli_och_mutex);
586 * Normally called under two situations:
588 * 2. A race/condition on MDS resulting in no open
589 * handle to be returned from LOOKUP|OPEN request,
590 * for example if the target entry was a symlink.
592 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
593 * marked by a bit set in ll_iget_for_nfs. Clear the
594 * bit so that it's not confusing later callers.
596 * NB; when ldd is NULL, it must have come via normal
597 * lookup path only, since ll_iget_for_nfs always calls
600 if (ldd && ldd->lld_nfs_dentry) {
601 ldd->lld_nfs_dentry = 0;
602 it->it_flags |= MDS_OPEN_LOCK;
606 * Always specify MDS_OPEN_BY_FID because we don't want
607 * to get file with different fid.
609 it->it_flags |= MDS_OPEN_BY_FID;
610 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
613 GOTO(out_openerr, rc);
617 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
619 GOTO(out_och_free, rc = -ENOMEM);
623 /* md_intent_lock() didn't get a request ref if there was an
624 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
627 * just open error? */
628 rc = it_open_error(DISP_OPEN_OPEN, it);
630 GOTO(out_och_free, rc);
632 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
633 "inode %p: disposition %x, status %d\n", inode,
634 it_disposition(it, ~0), it->it_status);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
649 cl_lov_delay_create_clear(&file->f_flags);
650 GOTO(out_och_free, rc);
/* Error/cleanup paths: free a half-installed och, undo statahead
 * authorization, release fd, and drop the intent's open-request ref. */
654 if (och_p && *och_p) {
655 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
656 *och_p = NULL; /* OBD_FREE writes some magic there */
659 mutex_unlock(&lli->lli_och_mutex);
662 if (lli->lli_opendir_key == fd)
663 ll_deauthorize_statahead(inode, fd);
665 ll_file_data_put(fd);
667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
670 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
671 ptlrpc_req_finished(it->it_request);
672 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, asynchronously
 * cancel the lease lock (the lease is considered broken). The CANCELING
 * branch body is not visible in this excerpt.
 */
678 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
679 struct ldlm_lock_desc *desc, void *data, int flag)
682 struct lustre_handle lockh;
686 case LDLM_CB_BLOCKING:
687 ldlm_lock2handle(lock, &lockh);
688 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
690 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
694 case LDLM_CB_CANCELING:
702 * When setting a lease on a file, we take ownership of the lli_mds_*_och
703 * and save it as fd->fd_och so as to force client to reopen the file even
704 * if it has an open lock in cache already.
706 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
707 struct lustre_handle *old_handle)
709 struct ll_inode_info *lli = ll_i2info(inode);
710 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
711 struct obd_client_handle **och_p;
716 /* Get the openhandle of the file */
717 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per struct file is allowed. */
718 if (fd->fd_lease_och != NULL)
719 GOTO(out_unlock, rc = -EBUSY);
721 if (fd->fd_och == NULL) {
/* Pick the shared write or read handle matching the file's mode.
 * NOTE(review): the lines moving *och_p into fd->fd_och are missing
 * from this excerpt. */
722 if (file->f_mode & FMODE_WRITE) {
723 LASSERT(lli->lli_mds_write_och != NULL);
724 och_p = &lli->lli_mds_write_och;
725 och_usecount = &lli->lli_open_fd_write_count;
727 LASSERT(lli->lli_mds_read_och != NULL);
728 och_p = &lli->lli_mds_read_och;
729 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take exclusive ownership if other fds share this handle. */
732 if (*och_usecount > 1)
733 GOTO(out_unlock, rc = -EBUSY);
/* Report the existing open handle so the MDT can match owners. */
740 *old_handle = fd->fd_och->och_fh;
744 mutex_unlock(&lli->lli_och_mutex);
749 * Release ownership on lli_mds_*_och when putting back a file lease.
751 static int ll_lease_och_release(struct inode *inode, struct file *file)
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
756 struct obd_client_handle *old_och = NULL;
761 mutex_lock(&lli->lli_och_mutex);
/* Select the slot matching this file's open mode. */
762 if (file->f_mode & FMODE_WRITE) {
763 och_p = &lli->lli_mds_write_och;
764 och_usecount = &lli->lli_open_fd_write_count;
766 och_p = &lli->lli_mds_read_och;
767 och_usecount = &lli->lli_open_fd_read_count;
770 /* The file may have been open by another process (broken lease) so
771 * *och_p is not NULL. In this case we should simply increase usecount
774 if (*och_p != NULL) {
775 old_och = fd->fd_och;
782 mutex_unlock(&lli->lli_och_mutex);
/* Close the now-redundant private handle outside the mutex. */
785 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
791 * Acquire a lease and open the file.
793 static struct obd_client_handle *
794 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
797 struct lookup_intent it = { .it_op = IT_OPEN };
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
799 struct md_op_data *op_data;
800 struct ptlrpc_request *req = NULL;
801 struct lustre_handle old_handle = { 0 };
802 struct obd_client_handle *och = NULL;
/* A lease is exclusively read OR write — never a combination. */
807 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
808 RETURN(ERR_PTR(-EINVAL));
/* The file must already be open with at least the requested mode,
 * and exec opens cannot take a lease. */
811 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
812 RETURN(ERR_PTR(-EPERM));
814 rc = ll_lease_och_acquire(inode, file, &old_handle);
821 RETURN(ERR_PTR(-ENOMEM));
823 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
824 LUSTRE_OPC_ANY, NULL);
826 GOTO(out, rc = PTR_ERR(op_data));
828 /* To tell the MDT this openhandle is from the same owner */
829 op_data->op_handle = old_handle;
831 it.it_flags = fmode | open_flags;
832 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
833 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
834 &ll_md_blocking_lease_ast,
835 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
836 * it can be cancelled which may mislead applications that the lease is
838 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
839 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
840 * doesn't deal with openhandle, so normal openhandle will be leaked. */
841 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
842 ll_finish_md_op_data(op_data);
843 ptlrpc_req_finished(req);
845 GOTO(out_release_it, rc);
847 if (it_disposition(&it, DISP_LOOKUP_NEG))
848 GOTO(out_release_it, rc = -ENOENT);
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
852 GOTO(out_release_it, rc);
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
/* A server that doesn't understand leases returns a plain open. */
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
858 GOTO(out_close, rc = -EOPNOTSUPP);
860 /* already get lease, handle lease lock */
861 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
862 if (it.it_lock_mode == 0 ||
863 it.it_lock_bits != MDS_INODELOCK_OPEN) {
864 /* open lock must return for lease */
865 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
866 PFID(ll_inode2fid(inode)), it.it_lock_mode,
868 GOTO(out_close, rc = -EPROTO);
871 ll_intent_release(&it);
875 /* Cancel open lock */
876 if (it.it_lock_mode != 0) {
877 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
880 och->och_lease_handle.cookie = 0ULL;
882 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
884 CERROR("%s: error closing file "DFID": %d\n",
885 ll_get_fsname(inode->i_sb, NULL, 0),
886 PFID(&ll_i2info(inode)->lli_fid), rc2);
887 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
889 ll_intent_release(&it);
897 * Check whether a layout swap can be done between two inodes.
899 * \param[in] inode1 First inode to check
900 * \param[in] inode2 Second inode to check
902 * \retval 0 on success, layout swap can be performed between both inodes
903 * \retval negative error code if requirements are not met
905 static int ll_check_swap_layouts_validity(struct inode *inode1,
906 struct inode *inode2)
/* Both must be regular files. */
908 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* Caller needs write permission on both files. */
911 if (inode_permission(inode1, MAY_WRITE) ||
912 inode_permission(inode2, MAY_WRITE))
/* Both files must live on the same filesystem. */
915 if (inode1->i_sb != inode2->i_sb)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so that the MDT swaps
 * the layouts of @inode and @inode2 atomically at close time. Validates
 * both inodes first and rejects swapping a file with itself.
 */
921 static int ll_swap_layouts_close(struct obd_client_handle *och,
922 struct inode *inode, struct inode *inode2)
924 const struct lu_fid *fid1 = ll_inode2fid(inode);
925 const struct lu_fid *fid2;
929 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
930 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
932 rc = ll_check_swap_layouts_validity(inode, inode2);
934 GOTO(out_free_och, rc);
936 /* We now know that inode2 is a lustre inode */
937 fid2 = ll_inode2fid(inode2);
/* Equal FIDs means swapping with itself — reject. */
939 rc = lu_fid_cmp(fid1, fid2);
941 GOTO(out_free_och, rc = -EINVAL);
943 /* Close the file and swap layouts between inode & inode2.
944 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
945 * because we still need it to pack l_remote_handle to MDT. */
946 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
949 och = NULL; /* freed in ll_close_inode_openhandle() */
959 * Release lease and close the file.
960 * It will check if the lease has ever broken.
962 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
965 struct ldlm_lock *lock;
966 bool cancelled = true;
/* Look up the lease lock; if it still exists, see whether it was
 * already cancelled (i.e. the lease was broken by a conflicting open). */
970 lock = ldlm_handle2lock(&och->och_lease_handle);
972 lock_res_and_lock(lock);
973 cancelled = ldlm_is_cancel(lock);
974 unlock_res_and_lock(lock);
978 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
979 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* NOTE(review): the condition guarding this cancel is not visible in
 * this excerpt — presumably only when the lease was NOT yet broken. */
982 ldlm_cli_cancel(&och->och_lease_handle, 0);
984 if (lease_broken != NULL)
985 *lease_broken = cancelled;
987 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/*
 * Merge MDS-provided inode attributes with the aggregated OST attributes
 * (size, blocks, timestamps) under the inode size lock, writing the most
 * recent value of each back into the VFS inode.
 */
991 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
993 struct ll_inode_info *lli = ll_i2info(inode);
994 struct cl_object *obj = lli->lli_clob;
995 struct cl_attr *attr = vvp_env_thread_attr(env);
1003 ll_inode_size_lock(inode);
1005 /* Merge timestamps the most recently obtained from MDS with
1006 * timestamps obtained from OSTs.
1008 * Do not overwrite atime of inode because it may be refreshed
1009 * by file_accessed() function. If the read was served by cache
1010 * data, there is no RPC to be sent so that atime may not be
1011 * transferred to OSTs at all. MDT only updates atime at close time
1012 * if it's at least 'mdd.*.atime_diff' older.
1013 * All in all, the atime in Lustre does not strictly comply with
1014 * POSIX. Solving this problem needs to send an RPC to MDT for each
1015 * read, this will hurt performance. */
1016 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1017 LTIME_S(inode->i_atime) = lli->lli_atime;
1018 lli->lli_update_atime = 0;
1020 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1021 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Start from the inode's (MDS-derived) timestamps... */
1023 atime = LTIME_S(inode->i_atime);
1024 mtime = LTIME_S(inode->i_mtime);
1025 ctime = LTIME_S(inode->i_ctime);
/* ...then fetch the aggregated OST attributes for this object. */
1027 cl_object_attr_lock(obj);
1028 rc = cl_object_attr_get(env, obj, attr);
1029 cl_object_attr_unlock(obj);
1032 GOTO(out_size_unlock, rc);
/* Keep whichever timestamp is newer, OST or MDS. */
1034 if (atime < attr->cat_atime)
1035 atime = attr->cat_atime;
1037 if (ctime < attr->cat_ctime)
1038 ctime = attr->cat_ctime;
1040 if (mtime < attr->cat_mtime)
1041 mtime = attr->cat_mtime;
1043 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1044 PFID(&lli->lli_fid), attr->cat_size);
/* Size and blocks are authoritative from the OSTs. */
1046 i_size_write(inode, attr->cat_size);
1047 inode->i_blocks = attr->cat_blocks;
1049 LTIME_S(inode->i_atime) = atime;
1050 LTIME_S(inode->i_mtime) = mtime;
1051 LTIME_S(inode->i_ctime) = ctime;
1054 ll_inode_size_unlock(inode);
/*
 * Return true if access-time updates should be suppressed for this file,
 * considering the O_NOATIME open flag, inode flags, mount options and the
 * per-superblock nodiratime setting (for directories).
 */
1059 static bool file_is_noatime(const struct file *file)
1061 const struct vfsmount *mnt = file->f_path.mnt;
1062 const struct inode *inode = file_inode((struct file *)file);
1064 /* Adapted from file_accessed() and touch_atime().*/
1065 if (file->f_flags & O_NOATIME)
1068 if (inode->i_flags & S_NOATIME)
1071 if (IS_NOATIME(inode))
1074 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1077 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1080 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1086 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: set up the embedded
 * kiocb, nonblock/append/sync flags from f_flags, locking policy
 * (never for nolock files, mandatory for O_APPEND), noatime handling
 * and parallel-IO (pio) eligibility.
 */
1088 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1090 struct inode *inode = file_inode(file);
1092 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1093 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1094 io->u.ci_rw.rw_file = file;
1095 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1096 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1097 if (iot == CIT_WRITE) {
1098 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
/* NOTE(review): the tail of this condition (third operand) is not
 * visible in this excerpt. */
1099 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1100 file->f_flags & O_DIRECT ||
1103 io->ci_obj = ll_i2info(inode)->lli_clob;
1104 io->ci_lockreq = CILR_MAYBE;
1105 if (ll_file_nolock(file)) {
1106 io->ci_lockreq = CILR_NEVER;
1107 io->ci_no_srvlock = 1;
1108 } else if (file->f_flags & O_APPEND) {
1109 io->ci_lockreq = CILR_MANDATORY;
1111 io->ci_noatime = file_is_noatime(file);
/* Parallel IO only when the mount enables it and not for appends. */
1112 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1113 io->ci_pio = !io->u.ci_rw.rw_append;
/*
 * Body of one parallel-IO task: re-create a cl_io for the sub-range
 * described by @ptask's cl_io_pt, run the IO loop, accumulate the byte
 * count into cip_result, and restart on short/interrupted IO when the
 * cl_io layer asks for it. Returns 0 if any bytes were transferred.
 */
1118 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1120 struct cl_io_pt *pt = ptask->pt_cbdata;
1121 struct file *file = pt->cip_file;
1124 loff_t pos = pt->cip_pos;
1129 env = cl_env_get(&refcheck);
1131 RETURN(PTR_ERR(env));
1133 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1134 file_dentry(file)->d_name.name,
1135 pt->cip_iot == CIT_READ ? "read" : "write",
1136 pos, pos + pt->cip_count);
1139 io = vvp_env_thread_io(env);
1140 ll_io_init(io, file, pt->cip_iot);
1141 io->u.ci_rw.rw_iter = pt->cip_iter;
1142 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1143 io->ci_pio = 0; /* It's already in parallel task */
1145 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1146 pt->cip_count - pt->cip_result);
1148 struct vvp_io *vio = vvp_env_io(env);
1150 vio->vui_io_subtype = IO_NORMAL;
1151 vio->vui_fd = LUSTRE_FPRIVATE(file);
/* Register the cl context for page IO, run the loop, unregister. */
1153 ll_cl_add(file, env, io, LCC_RW);
1154 rc = cl_io_loop(env, io);
1155 ll_cl_remove(file, env);
1157 /* cl_io_rw_init() handled IO */
/* Fault-injection point for testing ptask error handling. */
1161 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account transferred bytes and advance the iterator/position. */
1167 if (io->ci_nob > 0) {
1168 pt->cip_result += io->ci_nob;
1169 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1171 pt->cip_iocb.ki_pos = pos;
1172 #ifdef HAVE_KIOCB_KI_LEFT
1173 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1174 #elif defined(HAVE_KI_NBYTES)
1175 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1179 cl_io_fini(env, io);
/* Restart incomplete IO when the cl layer requests it (e.g. layout
 * change); -ENODATA is treated as restartable like success. */
1181 if ((rc == 0 || rc == -ENODATA) &&
1182 pt->cip_result < pt->cip_count &&
1183 io->ci_need_restart) {
1185 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1186 file_dentry(file)->d_name.name,
1187 pt->cip_iot == CIT_READ ? "read" : "write",
1188 pos, pos + pt->cip_count - pt->cip_result,
1189 pt->cip_result, rc);
1193 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1194 file_dentry(file)->d_name.name,
1195 pt->cip_iot == CIT_READ ? "read" : "write",
1196 pt->cip_result, rc);
1198 cl_env_put(env, &refcheck);
/* Partial success wins over the error code. */
1199 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for llite reads, writes and splice: build a cl_io for the
 * requested range, take the per-file range lock where needed, run the
 * client IO loop, and account the result.
 * NOTE(review): this excerpt is elided (original line numbers embedded);
 * several declarations, braces and the restart loop head are not visible.
 */
1203 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1204 struct file *file, enum cl_io_type iot,
1205 loff_t *ppos, size_t count)
1207 struct range_lock range;
1208 struct vvp_io *vio = vvp_env_io(env);
1209 struct inode *inode = file_inode(file);
1210 struct ll_inode_info *lli = ll_i2info(inode);
1211 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1219 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1220 file_dentry(file)->d_name.name,
1221 iot == CIT_READ ? "read" : "write", pos, pos + count);
1224 io = vvp_env_thread_io(env);
1225 ll_io_init(io, file, iot);
1226 if (args->via_io_subtype == IO_NORMAL) {
/* normal (non-splice) IO carries the caller's iterator and iocb by value */
1227 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1228 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1233 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1234 bool range_locked = false;
/* O_APPEND writes are not bounded by pos/count, so lock to EOF */
1236 if (file->f_flags & O_APPEND)
1237 range_lock_init(&range, 0, LUSTRE_EOF);
1239 range_lock_init(&range, pos, pos + count - 1);
1241 vio->vui_fd = LUSTRE_FPRIVATE(file);
1242 vio->vui_io_subtype = args->via_io_subtype;
1244 switch (vio->vui_io_subtype) {
1246 /* Direct IO reads must also take range lock,
1247 * or multiple reads will try to work on the same pages
1248 * See LU-6227 for details. */
1249 if (((iot == CIT_WRITE) ||
1250 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1251 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1252 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1254 rc = range_lock(&lli->lli_write_tree, &range);
1258 range_locked = true;
1262 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1263 vio->u.splice.vui_flags = args->u.splice.via_flags;
1266 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1270 ll_cl_add(file, env, io, LCC_RW);
/* the matching inode_lock() is on an elided line; released right after
 * cl_io_loop() below */
1271 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1272 !lli->lli_inode_locked) {
1274 lli->lli_inode_locked = 1;
1276 rc = cl_io_loop(env, io);
1277 if (lli->lli_inode_locked) {
1278 lli->lli_inode_locked = 0;
1279 inode_unlock(inode);
1281 ll_cl_remove(file, env);
1284 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1286 range_unlock(&lli->lli_write_tree, &range);
1289 /* cl_io_rw_init() handled IO */
/* account partial progress and advance the caller's iterator/iocb so a
 * restarted IO resumes where this pass stopped */
1293 if (io->ci_nob > 0) {
1294 result += io->ci_nob;
1295 count -= io->ci_nob;
1297 if (args->via_io_subtype == IO_NORMAL) {
1298 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1300 args->u.normal.via_iocb->ki_pos = pos;
1301 #ifdef HAVE_KIOCB_KI_LEFT
1302 args->u.normal.via_iocb->ki_left = count;
1303 #elif defined(HAVE_KI_NBYTES)
1304 args->u.normal.via_iocb->ki_nbytes = count;
1308 pos = io->u.ci_rw.rw_range.cir_pos;
1312 cl_io_fini(env, io);
/* lower layers may request a restart (e.g. layout change) — loop again
 * while data remains and no hard error occurred */
1314 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1316 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1317 file_dentry(file)->d_name.name,
1318 iot == CIT_READ ? "read" : "write",
1319 pos, pos + count, result, rc);
/* per-mount byte counters and the sticky fd_write_failed flag */
1323 if (iot == CIT_READ) {
1325 ll_stats_ops_tally(ll_i2sbi(inode),
1326 LPROC_LL_READ_BYTES, result);
1327 } else if (iot == CIT_WRITE) {
1329 ll_stats_ops_tally(ll_i2sbi(inode),
1330 LPROC_LL_WRITE_BYTES, result);
1331 fd->fd_write_failed = false;
1332 } else if (result == 0 && rc == 0) {
1335 fd->fd_write_failed = true;
1337 fd->fd_write_failed = false;
1338 } else if (rc != -ERESTARTSYS) {
1339 fd->fd_write_failed = true;
1343 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1344 file_dentry(file)->d_name.name,
1345 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1349 RETURN(result > 0 ? result : rc);
1353 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1354 * especially for small I/O.
1356 * To serve a read request, CLIO has to create and initialize a cl_io and
1357 * then request DLM lock. This has turned out to have significant overhead
1358 * and affects the performance of small I/O dramatically.
1360 * It's not necessary to create a cl_io for each I/O. Under the help of read
1361 * ahead, most of the pages being read are already in memory cache and we can
1362 * read those pages directly because if the pages exist, the corresponding DLM
1363 * lock must exist so that page content must be valid.
1365 * In fast read implementation, the llite speculatively finds and reads pages
1366 * in memory cache. There are three scenarios for fast read:
1367 * - If the page exists and is uptodate, kernel VM will provide the data and
1368 * CLIO won't be intervened;
1369 * - If the page was brought into memory by read ahead, it will be exported
1370 * and read ahead parameters will be updated;
1371 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1372 * it will go back and invoke normal read, i.e., a cl_io will be created
1373 * and DLM lock will be requested.
1375 * POSIX compliance: posix standard states that read is intended to be atomic.
1376 * Lustre read implementation is in line with Linux kernel read implementation
1377 * and neither of them complies with POSIX standard in this matter. Fast read
1378 * doesn't make the situation worse on single node but it may interleave write
1379 * results from multiple nodes due to short read handling in ll_file_aio_read().
1381 * \param env - lu_env
1382 * \param iocb - kiocb from kernel
1383 * \param iter - user space buffers where the data will be copied
1385 * \retval - number of bytes have been read, or error code if error occurred.
1388 ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
1389 struct iov_iter *iter)
/* fast read must be enabled for this mount ("fast_read" tunable) */
1393 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1396 /* NB: we can't do direct IO for fast read because it will need a lock
1397 * to make IO engine happy. */
1398 if (iocb->ki_filp->f_flags & O_DIRECT)
1401 ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
1402 result = generic_file_read_iter(iocb, iter);
1403 ll_cl_remove(iocb->ki_filp, env);
1405 /* If the first page is not in cache, the generic read path returns
1406 * -ENODATA so the caller can fall back to a normal cl_io read.
1407 * See corresponding code in ll_readpage(). */
1408 if (result == -ENODATA)
1412 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1413 LPROC_LL_READ_BYTES, result);
1419 * Read from a file (through the page cache).
/*
 * .read_iter handler: try the lockless fast-read path first; if data
 * remains (or fast read was not applicable) fall back to the full
 * cl_io path via ll_file_io_generic().
 * NOTE(review): excerpt is elided — error/return plumbing between the
 * two results is not fully visible here.
 */
1421 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1424 struct vvp_io_args *args;
1429 env = cl_env_get(&refcheck);
1431 return PTR_ERR(env);
1433 result = ll_do_fast_read(env, iocb, to);
/* fast read consumed everything or failed hard: nothing left to do */
1434 if (result < 0 || iov_iter_count(to) == 0)
1437 args = ll_env_args(env, IO_NORMAL);
1438 args->u.normal.via_iter = to;
1439 args->u.normal.via_iocb = iocb;
1441 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1442 &iocb->ki_pos, iov_iter_count(to));
1445 else if (result == 0)
1449 cl_env_put(env, &refcheck);
1454 * Write to a file (through the page cache).
/* .write_iter handler: all writes go through the full cl_io path. */
1456 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1458 struct vvp_io_args *args;
1463 env = cl_env_get(&refcheck);
1465 return PTR_ERR(env);
1467 args = ll_env_args(env, IO_NORMAL);
1468 args->u.normal.via_iter = from;
1469 args->u.normal.via_iocb = iocb;
1471 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1472 &iocb->ki_pos, iov_iter_count(from));
1473 cl_env_put(env, &refcheck);
1477 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1479 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, truncating
 * at the first inaccessible segment (pre-iov_iter kernels only).
 * NOTE(review): excerpt is elided — the success-path `continue` and the
 * *nr_segs/*count updates are on lines not shown here.
 */
1481 static int ll_file_get_iov_count(const struct iovec *iov,
1482 unsigned long *nr_segs, size_t *count)
1487 for (seg = 0; seg < *nr_segs; seg++) {
1488 const struct iovec *iv = &iov[seg];
1491 * If any segment has a negative length, or the cumulative
1492 * length ever wraps negative then return -EINVAL.
1495 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1497 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1502 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy .aio_read entry (kernels without read_iter): wrap the iovec
 * array in an iov_iter and delegate to ll_file_read_iter().
 */
1509 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1510 unsigned long nr_segs, loff_t pos)
1517 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* iov_iter_init() signature changed when the direction argument moved */
1521 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1522 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1523 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1524 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1525 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1527 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous .read entry: build a sync kiocb + single iovec and
 * delegate to ll_file_aio_read(), then propagate the updated position.
 */
1532 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1536 struct iovec iov = { .iov_base = buf, .iov_len = count };
1537 struct kiocb *kiocb;
1542 env = cl_env_get(&refcheck);
1544 RETURN(PTR_ERR(env));
/* reuse the per-env kiocb to avoid a stack/heap allocation per call */
1546 kiocb = &ll_env_info(env)->lti_kiocb;
1547 init_sync_kiocb(kiocb, file);
1548 kiocb->ki_pos = *ppos;
1549 #ifdef HAVE_KIOCB_KI_LEFT
1550 kiocb->ki_left = count;
1551 #elif defined(HAVE_KI_NBYTES)
1552 kiocb->ki_nbytes = count;
1555 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1556 *ppos = kiocb->ki_pos;
1558 cl_env_put(env, &refcheck);
1563 * Write to a file (through the page cache).
/*
 * Legacy .aio_write entry (kernels without write_iter): wrap the iovec
 * array in an iov_iter and delegate to ll_file_write_iter().
 */
1566 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1567 unsigned long nr_segs, loff_t pos)
1569 struct iov_iter from;
1574 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1578 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1579 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1580 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1581 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1582 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1584 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous .write entry: build a sync kiocb + single iovec and
 * delegate to ll_file_aio_write(), then propagate the updated position.
 */
1589 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1590 size_t count, loff_t *ppos)
1593 struct iovec iov = { .iov_base = (void __user *)buf,
1595 struct kiocb *kiocb;
1600 env = cl_env_get(&refcheck);
1602 RETURN(PTR_ERR(env));
1604 kiocb = &ll_env_info(env)->lti_kiocb;
1605 init_sync_kiocb(kiocb, file);
1606 kiocb->ki_pos = *ppos;
1607 #ifdef HAVE_KIOCB_KI_LEFT
1608 kiocb->ki_left = count;
1609 #elif defined(HAVE_KI_NBYTES)
1610 kiocb->ki_nbytes = count;
1613 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1614 *ppos = kiocb->ki_pos;
1616 cl_env_put(env, &refcheck);
1619 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1622 * Send file content (through pagecache) somewhere with helper
/* .splice_read handler: run a CIT_READ cl_io with the IO_SPLICE subtype,
 * feeding pages directly into the destination pipe. */
1624 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1625 struct pipe_inode_info *pipe, size_t count,
1629 struct vvp_io_args *args;
1634 env = cl_env_get(&refcheck);
1636 RETURN(PTR_ERR(env));
1638 args = ll_env_args(env, IO_SPLICE);
1639 args->u.splice.via_pipe = pipe;
1640 args->u.splice.via_flags = flags;
1642 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1643 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it by FID with the
 * given layout, then immediately releasing the open handle.
 * Holds the inode size lock across the intent open.
 */
1647 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1648 __u64 flags, struct lov_user_md *lum, int lum_size)
1650 struct lookup_intent oit = {
1652 .it_flags = flags | MDS_OPEN_BY_FID,
1657 ll_inode_size_lock(inode);
1658 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1660 GOTO(out_unlock, rc);
/* the open was only a vehicle for the layout; close it right away */
1662 ll_release_openhandle(dentry, &oit);
1665 ll_inode_size_unlock(inode);
1666 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping information) of @filename from the MDS.
 * On success *lmmp points into the reply buffer of *request, which the
 * caller must eventually release with ptlrpc_req_finished().
 * NOTE(review): excerpt is elided — some locals and GOTO targets are on
 * lines not shown here.
 */
1671 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1672 struct lov_mds_md **lmmp, int *lmm_size,
1673 struct ptlrpc_request **request)
1675 struct ll_sb_info *sbi = ll_i2sbi(inode);
1676 struct mdt_body *body;
1677 struct lov_mds_md *lmm = NULL;
1678 struct ptlrpc_request *req = NULL;
1679 struct md_op_data *op_data;
1682 rc = ll_get_default_mdsize(sbi, &lmmsize);
1686 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1687 strlen(filename), lmmsize,
1688 LUSTRE_OPC_ANY, NULL);
1689 if (IS_ERR(op_data))
1690 RETURN(PTR_ERR(op_data));
1692 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1693 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1694 ll_finish_md_op_data(op_data);
1696 CDEBUG(D_INFO, "md_getattr_name failed "
1697 "on %s: rc %d\n", filename, rc);
1701 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1702 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1704 lmmsize = body->mbo_eadatasize;
/* no striping EA on this object (e.g. never striped) */
1706 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1708 GOTO(out, rc = -ENODATA);
1711 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1712 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite (PFL) layouts are understood here */
1714 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1715 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1716 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1717 GOTO(out, rc = -EPROTO);
1720 * This is coming from the MDS, so is probably in
1721 * little endian. We convert it to host endian before
1722 * passing it to userspace.
/* only swab on big-endian hosts, where LOV_MAGIC != its LE form */
1724 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1727 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1728 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1729 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1730 if (le32_to_cpu(lmm->lmm_pattern) &
1731 LOV_PATTERN_F_RELEASED)
1735 /* if called for a directory we must avoid swabbing
1736 * non-existent lsm objects */
1737 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1738 lustre_swab_lov_user_md_v1(
1739 (struct lov_user_md_v1 *)lmm);
1740 if (S_ISREG(body->mbo_mode))
1741 lustre_swab_lov_user_md_objects(
1742 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1744 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1745 lustre_swab_lov_user_md_v3(
1746 (struct lov_user_md_v3 *)lmm);
1747 if (S_ISREG(body->mbo_mode))
1748 lustre_swab_lov_user_md_objects(
1749 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1751 } else if (lmm->lmm_magic ==
1752 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1753 lustre_swab_lov_comp_md_v1(
1754 (struct lov_comp_md_v1 *)lmm);
1760 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: set a raw LOV EA supplied by a privileged user
 * (requires CAP_SYS_ADMIN). Copies the EA from userspace and applies it
 * via ll_lov_setstripe_ea_info().
 */
1765 static int ll_lov_setea(struct inode *inode, struct file *file,
1768 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1769 struct lov_user_md *lump;
/* room for the header plus one OST object entry */
1770 int lum_size = sizeof(struct lov_user_md) +
1771 sizeof(struct lov_user_ost_data);
1775 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1778 OBD_ALLOC_LARGE(lump, lum_size);
1782 if (copy_from_user(lump, arg, lum_size))
1783 GOTO(out_lump, rc = -EFAULT);
1785 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1787 cl_lov_delay_create_clear(&file->f_flags);
1790 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum of
 * @size bytes via the cl_object layer.
 */
1794 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1801 env = cl_env_get(&refcheck);
1803 RETURN(PTR_ERR(env));
1805 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1806 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE: copy the user's layout request in, apply it,
 * refresh the layout generation and hand the resulting stripe info back
 * to userspace.
 * NOTE(review): excerpt is elided — error branches between the steps are
 * on lines not shown here.
 */
1810 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1813 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1814 struct lov_user_md *klum;
1816 __u64 flags = FMODE_WRITE;
1819 rc = ll_copy_user_md(lum, &klum);
1824 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero the user's stripe_count so the getstripe below fills it fresh */
1829 rc = put_user(0, &lum->lmm_stripe_count);
1833 rc = ll_layout_refresh(inode, &gen);
1837 rc = ll_file_getstripe(inode, arg, lum_size);
1839 cl_lov_delay_create_clear(&file->f_flags);
1842 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with group id @arg
 * on behalf of this file descriptor. Only one group lock per fd; the
 * lli_lock spinlock guards the fd_flags/fd_grouplock pair against racing
 * lockers.
 */
1847 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1849 struct ll_inode_info *lli = ll_i2info(inode);
1850 struct cl_object *obj = lli->lli_clob;
1851 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1852 struct ll_grouplock grouplock;
1857 CWARN("group id for group lock must not be 0\n");
1861 if (ll_file_nolock(file))
1862 RETURN(-EOPNOTSUPP);
1864 spin_lock(&lli->lli_lock);
1865 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1866 CWARN("group lock already existed with gid %lu\n",
1867 fd->fd_grouplock.lg_gid);
1868 spin_unlock(&lli->lli_lock);
1871 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1872 spin_unlock(&lli->lli_lock);
1875 * XXX: group lock needs to protect all OST objects while PFL
1876 * can add new OST objects during the IO, so we'd instantiate
1877 * all OST objects before getting its group lock.
1882 struct cl_layout cl = {
1883 .cl_is_composite = false,
1886 env = cl_env_get(&refcheck);
1888 RETURN(PTR_ERR(env));
1890 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: instantiate all components up front */
1891 if (!rc && cl.cl_is_composite)
1892 rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
1894 cl_env_put(env, &refcheck);
/* the lock is acquired outside lli_lock, so re-check for a racer below */
1899 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1900 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1904 spin_lock(&lli->lli_lock);
1905 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1906 spin_unlock(&lli->lli_lock);
1907 CERROR("another thread just won the race\n");
1908 cl_put_grouplock(&grouplock);
1912 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1913 fd->fd_grouplock = grouplock;
1914 spin_unlock(&lli->lli_lock);
1916 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: drop the group lock with id @arg held by this fd.
 * Fails if no group lock is held or the id does not match. The lock is
 * detached from the fd under lli_lock, then released outside it.
 */
1920 static int ll_put_grouplock(struct inode *inode, struct file *file,
1923 struct ll_inode_info *lli = ll_i2info(inode);
1924 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1925 struct ll_grouplock grouplock;
1928 spin_lock(&lli->lli_lock);
1929 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1930 spin_unlock(&lli->lli_lock);
1931 CWARN("no group lock held\n");
1935 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1937 if (fd->fd_grouplock.lg_gid != arg) {
1938 CWARN("group lock %lu doesn't match current id %lu\n",
1939 arg, fd->fd_grouplock.lg_gid);
1940 spin_unlock(&lli->lli_lock);
/* take a local copy so cl_put_grouplock() runs without the spinlock */
1944 grouplock = fd->fd_grouplock;
1945 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1946 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1947 spin_unlock(&lli->lli_lock);
1949 cl_put_grouplock(&grouplock);
1950 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1955 * Close inode open handle
1957 * \param dentry [in] dentry which contains the inode
1958 * \param it [in,out] intent which contains open info and result
1961 * \retval <0 failure
/* Close the MDS open handle left over from an intent open (e.g. the
 * open performed only to set/get a layout). */
1963 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1965 struct inode *inode = dentry->d_inode;
1966 struct obd_client_handle *och;
1972 /* Root ? Do nothing. */
1973 if (dentry->d_inode->i_sb->s_root == dentry)
1976 /* No open handle to close? Move away */
1977 if (!it_disposition(it, DISP_OPEN_OPEN))
1980 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1982 OBD_ALLOC(och, sizeof(*och));
1984 GOTO(out, rc = -ENOMEM);
1986 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1988 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
1990 /* this one is in place of ll_file_open */
1991 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1992 ptlrpc_req_finished(it->it_request);
1993 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1999 * Get size for inode for which FIEMAP mapping is requested.
2000 * Make the FIEMAP get_info call and return the result.
2001 * \param fiemap kernel buffer to hold extents
2002 * \param num_bytes kernel buffer size
2004 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2010 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2013 /* Checks for fiemap flags */
2014 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report which flags we do support back to the caller */
2015 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2019 /* Check for FIEMAP_FLAG_SYNC */
2020 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2021 rc = filemap_fdatawrite(inode->i_mapping);
2026 env = cl_env_get(&refcheck);
2028 RETURN(PTR_ERR(env));
/* a zero cached size may just be stale — glimpse the OSTs first */
2030 if (i_size_read(inode) == 0) {
2031 rc = ll_glimpse_size(inode);
2036 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2037 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2038 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2040 /* If filesize is 0, then there would be no objects for mapping */
2041 if (fmkey.lfik_oa.o_size == 0) {
2042 fiemap->fm_mapped_extents = 0;
2046 fmkey.lfik_fiemap = *fiemap;
2048 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2049 &fmkey, fiemap, &num_bytes);
2051 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC. The user
 * supplies a getinfo_fid2path header plus a path buffer of gf_pathlen
 * bytes; the whole (header + path) buffer is copied back on success.
 */
2055 int ll_fid2path(struct inode *inode, void __user *arg)
2057 struct obd_export *exp = ll_i2mdexp(inode);
2058 const struct getinfo_fid2path __user *gfin = arg;
2060 struct getinfo_fid2path *gfout;
/* gated: either privileged, or the mount allows user fid2path */
2066 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2067 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2070 /* Only need to get the buflen */
2071 if (get_user(pathlen, &gfin->gf_pathlen))
2074 if (pathlen > PATH_MAX)
2077 outsize = sizeof(*gfout) + pathlen;
2078 OBD_ALLOC(gfout, outsize);
2082 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2083 GOTO(gf_free, rc = -EFAULT);
2084 /* append root FID after gfout to let MDT know the root FID so that it
2085 * can lookup the correct path, this is mainly for fileset.
2086 * old server without fileset mount support will ignore this. */
2087 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2089 /* Call mdc_iocontrol */
2090 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2094 if (copy_to_user(arg, gfout, outsize))
2098 OBD_FREE(gfout, outsize);
2103 * Read the data_version for inode.
2105 * This value is computed using stripe object version on OST.
2106 * Version is computed using server side locking.
2108 * @param flags if do sync on the OST side;
2110 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2111 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2113 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2115 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2123 /* If no file object initialized, we consider its version is 0. */
2129 env = cl_env_get(&refcheck);
2131 RETURN(PTR_ERR(env));
/* run a dedicated CIT_DATA_VERSION cl_io to collect the version */
2133 io = vvp_env_thread_io(env);
2135 io->u.ci_data_version.dv_data_version = 0;
2136 io->u.ci_data_version.dv_flags = flags;
2139 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2140 result = cl_io_loop(env, io);
2142 result = io->ci_result;
2144 *data_version = io->u.ci_data_version.dv_data_version;
2146 cl_io_fini(env, io);
/* layout changed mid-IO: retry the whole operation */
2148 if (unlikely(io->ci_need_restart))
2151 cl_env_put(env, &refcheck);
2157 * Trigger a HSM release request for the provided inode.
/* Takes a write lease, flushes data and grabs the final data version,
 * then asks the MDT to release the file's OST objects. */
2159 int ll_hsm_release(struct inode *inode)
2162 struct obd_client_handle *och = NULL;
2163 __u64 data_version = 0;
2168 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2169 ll_get_fsname(inode->i_sb, NULL, 0),
2170 PFID(&ll_i2info(inode)->lli_fid));
2172 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2174 GOTO(out, rc = PTR_ERR(och));
2176 /* Grab latest data_version and [am]time values */
2177 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2181 env = cl_env_get(&refcheck);
2183 GOTO(out, rc = PTR_ERR(env));
2185 ll_merge_attr(env, inode);
2186 cl_env_put(env, &refcheck);
2188 /* Release the file.
2189 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2190 * we still need it to pack l_remote_handle to MDT. */
2191 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2197 if (och != NULL && !IS_ERR(och)) /* close the file */
2198 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes being swapped
 * (plus, on elided lines, their data versions and check flags). */
2203 struct ll_swap_stack {
2206 struct inode *inode1;
2207 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of two files
 * on the MDT. Orders the pair by FID to avoid lock inversion, optionally
 * flushes caches under a group lock, and optionally verifies the data
 * versions have not changed before committing the swap.
 */
2212 static int ll_swap_layouts(struct file *file1, struct file *file2,
2213 struct lustre_swap_layouts *lsl)
2215 struct mdc_swap_layouts msl;
2216 struct md_op_data *op_data;
2219 struct ll_swap_stack *llss = NULL;
2222 OBD_ALLOC_PTR(llss);
2226 llss->inode1 = file_inode(file1);
2227 llss->inode2 = file_inode(file2);
2229 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2233 /* we use 2 bool because it is easier to swap than 2 bits */
2234 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2235 llss->check_dv1 = true;
2237 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2238 llss->check_dv2 = true;
2240 /* we cannot use lsl->sl_dvX directly because we may swap them */
2241 llss->dv1 = lsl->sl_dv1;
2242 llss->dv2 = lsl->sl_dv2;
2244 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2245 if (rc == 0) /* same file, done! */
2248 if (rc < 0) { /* sequentialize it */
2249 swap(llss->inode1, llss->inode2);
2251 swap(llss->dv1, llss->dv2);
2252 swap(llss->check_dv1, llss->check_dv2);
2256 if (gid != 0) { /* application asks to flush dirty cache */
2257 rc = ll_get_grouplock(llss->inode1, file1, gid);
2261 rc = ll_get_grouplock(llss->inode2, file2, gid);
2263 ll_put_grouplock(llss->inode1, file1, gid);
2268 /* ultimate check: before swapping the layouts, verify that the
2269 * data version has not changed (if requested) */
2270 if (llss->check_dv1) {
2271 rc = ll_data_version(llss->inode1, &dv, 0);
2274 if (dv != llss->dv1)
2275 GOTO(putgl, rc = -EAGAIN);
2278 if (llss->check_dv2) {
2279 rc = ll_data_version(llss->inode2, &dv, 0);
2282 if (dv != llss->dv2)
2283 GOTO(putgl, rc = -EAGAIN);
2286 /* struct md_op_data is used to send the swap args to the mdt
2287 * only flags is missing, so we use struct mdc_swap_layouts
2288 * through the md_op_data->op_data */
2289 /* flags from user space have to be converted before they are sent to
2290 * the server; no flag is sent today, they are only used on the client */
2293 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2294 0, LUSTRE_OPC_ANY, &msl);
2295 if (IS_ERR(op_data))
2296 GOTO(free, rc = PTR_ERR(op_data));
2298 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2299 sizeof(*op_data), op_data, NULL);
2300 ll_finish_md_op_data(op_data);
/* release in reverse acquisition order */
2307 ll_put_grouplock(llss->inode2, file2, gid);
2308 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * LL_IOC_HSM_STATE_SET: validate the requested HSM flag changes and
 * forward them to the MDT. Only CAP_SYS_ADMIN may touch flags outside
 * HSM_USER_MASK.
 */
2318 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2320 struct md_op_data *op_data;
2324 /* Detect out-of range masks */
2325 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2328 /* Non-root users are forbidden to set or clear flags which are
2329 * NOT defined in HSM_USER_MASK. */
2330 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2331 !cfs_capable(CFS_CAP_SYS_ADMIN))
2334 /* Detect out-of range archive id */
2335 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2336 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2339 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2340 LUSTRE_OPC_ANY, hss);
2341 if (IS_ERR(op_data))
2342 RETURN(PTR_ERR(op_data));
2344 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2345 sizeof(*op_data), op_data, NULL);
2347 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: register a regular file as already existing in the
 * HSM archive — mark it ARCHIVED|EXISTS|RELEASED, then restore the
 * attributes (mode, owner, size, times) recorded in @hui.
 */
2352 static int ll_hsm_import(struct inode *inode, struct file *file,
2353 struct hsm_user_import *hui)
2355 struct hsm_state_set *hss = NULL;
2356 struct iattr *attr = NULL;
2360 if (!S_ISREG(inode->i_mode))
2366 GOTO(out, rc = -ENOMEM);
2368 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2369 hss->hss_archive_id = hui->hui_archive_id;
2370 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2371 rc = ll_hsm_state_set(inode, hss);
2375 OBD_ALLOC_PTR(attr);
2377 GOTO(out, rc = -ENOMEM);
/* only permission bits from userspace; force the regular-file type */
2379 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2380 attr->ia_mode |= S_IFREG;
2381 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2382 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2383 attr->ia_size = hui->hui_size;
2384 attr->ia_mtime.tv_sec = hui->hui_mtime;
2385 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2386 attr->ia_atime.tv_sec = hui->hui_atime;
2387 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2389 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2390 ATTR_UID | ATTR_GID |
2391 ATTR_MTIME | ATTR_MTIME_SET |
2392 ATTR_ATIME | ATTR_ATIME_SET;
/* the matching inode_lock() is on an elided line above */
2396 rc = ll_setattr_raw(file_dentry(file), attr, true);
2400 inode_unlock(inode);
/* Map an open fmode to the corresponding lease-type bits
 * (FMODE_READ -> LL_LEASE_RDLCK, FMODE_WRITE -> LL_LEASE_WRLCK). */
2412 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2414 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2415 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * LL_IOC_FUTIMES_3: set atime, mtime AND ctime of a regular file to the
 * exact values in @lfu. Setting ctime requires CAP_SYS_ADMIN; used by
 * HSM restore tools to reproduce original timestamps.
 */
2418 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2420 struct inode *inode = file_inode(file);
2422 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2423 ATTR_MTIME | ATTR_MTIME_SET |
2424 ATTR_CTIME | ATTR_CTIME_SET,
2426 .tv_sec = lfu->lfu_atime_sec,
2427 .tv_nsec = lfu->lfu_atime_nsec,
2430 .tv_sec = lfu->lfu_mtime_sec,
2431 .tv_nsec = lfu->lfu_mtime_nsec,
2434 .tv_sec = lfu->lfu_ctime_sec,
2435 .tv_nsec = lfu->lfu_ctime_nsec,
2441 if (!capable(CAP_SYS_ADMIN))
2444 if (!S_ISREG(inode->i_mode))
/* the matching inode_lock() is on an elided line above */
2448 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2449 inode_unlock(inode);
2455 * Give file access advices
2457 * The ladvise interface is similar to Linux fadvise() system call, except it
2458 * forwards the advices directly from Lustre client to server. The server side
2459 * codes will apply appropriate read-ahead and caching techniques for the
2460 * corresponding files.
2462 * A typical workload for ladvise is e.g. a bunch of different clients are
2463 * doing small random reads of a file, so prefetching pages into OSS cache
2464 * with big linear reads before the random IO is a net benefit. Fetching
2465 * all that data into each client cache with fadvise() may not be, due to
2466 * much more data being sent to the client.
2468 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2469 struct llapi_lu_ladvise *ladvise)
2473 struct cl_ladvise_io *lio;
2478 env = cl_env_get(&refcheck);
2480 RETURN(PTR_ERR(env));
/* run a CIT_LADVISE cl_io carrying the advice down to the OSTs */
2482 io = vvp_env_thread_io(env);
2483 io->ci_obj = ll_i2info(inode)->lli_clob;
2485 /* initialize parameters for ladvise */
2486 lio = &io->u.ci_ladvise;
2487 lio->li_start = ladvise->lla_start;
2488 lio->li_end = ladvise->lla_end;
2489 lio->li_fid = ll_inode2fid(inode);
2490 lio->li_advice = ladvise->lla_advice;
2491 lio->li_flags = flags;
2493 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2494 rc = cl_io_loop(env, io);
2498 cl_io_fini(env, io);
2499 cl_env_put(env, &refcheck);
/*
 * FS_IOC_FSGETXATTR: return the file's fsxattr, filling in the cached
 * project id from the llite inode info.
 */
2503 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2506 struct fsxattr fsxattr;
2508 if (copy_from_user(&fsxattr,
2509 (const struct fsxattr __user *)arg,
2513 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2514 if (copy_to_user((struct fsxattr __user *)arg,
2515 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR: change the file's project id via an MDS setattr.
 * Restricted to CAP_SYS_ADMIN.
 */
2521 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2525 struct md_op_data *op_data;
2526 struct ptlrpc_request *req = NULL;
2528 struct fsxattr fsxattr;
2530 /* only root could change project ID */
2531 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2534 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2535 LUSTRE_OPC_ANY, NULL);
2536 if (IS_ERR(op_data))
2537 RETURN(PTR_ERR(op_data));
2539 if (copy_from_user(&fsxattr,
2540 (const struct fsxattr __user *)arg,
2542 GOTO(out_fsxattr1, rc = -EFAULT);
2544 op_data->op_projid = fsxattr.fsx_projid;
2545 op_data->op_attr.ia_valid |= MDS_ATTR_PROJID;
2546 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2548 ptlrpc_req_finished(req);
2551 ll_finish_md_op_data(op_data);
2558 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2560 struct inode *inode = file_inode(file);
2561 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2565 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2566 PFID(ll_inode2fid(inode)), inode, cmd);
2567 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2569 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2570 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2574 case LL_IOC_GETFLAGS:
2575 /* Get the current value of the file flags */
2576 return put_user(fd->fd_flags, (int __user *)arg);
2577 case LL_IOC_SETFLAGS:
2578 case LL_IOC_CLRFLAGS:
2579 /* Set or clear specific file flags */
2580 /* XXX This probably needs checks to ensure the flags are
2581 * not abused, and to handle any flag side effects.
2583 if (get_user(flags, (int __user *) arg))
2586 if (cmd == LL_IOC_SETFLAGS) {
2587 if ((flags & LL_FILE_IGNORE_LOCK) &&
2588 !(file->f_flags & O_DIRECT)) {
2589 CERROR("%s: unable to disable locking on "
2590 "non-O_DIRECT file\n", current->comm);
2594 fd->fd_flags |= flags;
2596 fd->fd_flags &= ~flags;
2599 case LL_IOC_LOV_SETSTRIPE:
2600 case LL_IOC_LOV_SETSTRIPE_NEW:
2601 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
2602 case LL_IOC_LOV_SETEA:
2603 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
2604 case LL_IOC_LOV_SWAP_LAYOUTS: {
2606 struct lustre_swap_layouts lsl;
2608 if (copy_from_user(&lsl, (char __user *)arg,
2609 sizeof(struct lustre_swap_layouts)))
2612 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2615 file2 = fget(lsl.sl_fd);
2619 /* O_WRONLY or O_RDWR */
2620 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2621 GOTO(out, rc = -EPERM);
2623 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2624 struct inode *inode2;
2625 struct ll_inode_info *lli;
2626 struct obd_client_handle *och = NULL;
2628 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2629 GOTO(out, rc = -EINVAL);
2631 lli = ll_i2info(inode);
2632 mutex_lock(&lli->lli_och_mutex);
2633 if (fd->fd_lease_och != NULL) {
2634 och = fd->fd_lease_och;
2635 fd->fd_lease_och = NULL;
2637 mutex_unlock(&lli->lli_och_mutex);
2639 GOTO(out, rc = -ENOLCK);
2640 inode2 = file_inode(file2);
2641 rc = ll_swap_layouts_close(och, inode, inode2);
2643 rc = ll_swap_layouts(file, file2, &lsl);
2649 case LL_IOC_LOV_GETSTRIPE:
2650 case LL_IOC_LOV_GETSTRIPE_NEW:
2651 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
2652 case FSFILT_IOC_GETFLAGS:
2653 case FSFILT_IOC_SETFLAGS:
2654 RETURN(ll_iocontrol(inode, file, cmd, arg));
2655 case FSFILT_IOC_GETVERSION_OLD:
2656 case FSFILT_IOC_GETVERSION:
2657 RETURN(put_user(inode->i_generation, (int __user *)arg));
2658 case LL_IOC_GROUP_LOCK:
2659 RETURN(ll_get_grouplock(inode, file, arg));
2660 case LL_IOC_GROUP_UNLOCK:
2661 RETURN(ll_put_grouplock(inode, file, arg));
2662 case IOC_OBD_STATFS:
2663 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2665 /* We need to special case any other ioctls we want to handle,
2666 * to send them to the MDS/OST as appropriate and to properly
2667 * network encode the arg field.
2668 case FSFILT_IOC_SETVERSION_OLD:
2669 case FSFILT_IOC_SETVERSION:
2671 case LL_IOC_FLUSHCTX:
2672 RETURN(ll_flush_ctx(inode));
2673 case LL_IOC_PATH2FID: {
2674 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2675 sizeof(struct lu_fid)))
2680 case LL_IOC_GETPARENT:
2681 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2683 case OBD_IOC_FID2PATH:
2684 RETURN(ll_fid2path(inode, (void __user *)arg));
2685 case LL_IOC_DATA_VERSION: {
2686 struct ioc_data_version idv;
2689 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2692 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2693 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2696 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2702 case LL_IOC_GET_MDTIDX: {
2705 mdtidx = ll_get_mdt_idx(inode);
2709 if (put_user((int)mdtidx, (int __user *)arg))
2714 case OBD_IOC_GETDTNAME:
2715 case OBD_IOC_GETMDNAME:
2716 RETURN(ll_get_obd_name(inode, cmd, arg));
2717 case LL_IOC_HSM_STATE_GET: {
2718 struct md_op_data *op_data;
2719 struct hsm_user_state *hus;
2726 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2727 LUSTRE_OPC_ANY, hus);
2728 if (IS_ERR(op_data)) {
2730 RETURN(PTR_ERR(op_data));
2733 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2736 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2739 ll_finish_md_op_data(op_data);
2743 case LL_IOC_HSM_STATE_SET: {
2744 struct hsm_state_set *hss;
2751 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2756 rc = ll_hsm_state_set(inode, hss);
2761 case LL_IOC_HSM_ACTION: {
2762 struct md_op_data *op_data;
2763 struct hsm_current_action *hca;
2770 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2771 LUSTRE_OPC_ANY, hca);
2772 if (IS_ERR(op_data)) {
2774 RETURN(PTR_ERR(op_data));
2777 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2780 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2783 ll_finish_md_op_data(op_data);
2787 case LL_IOC_SET_LEASE: {
2788 struct ll_inode_info *lli = ll_i2info(inode);
2789 struct obd_client_handle *och = NULL;
2794 case LL_LEASE_WRLCK:
2795 if (!(file->f_mode & FMODE_WRITE))
2797 fmode = FMODE_WRITE;
2799 case LL_LEASE_RDLCK:
2800 if (!(file->f_mode & FMODE_READ))
2804 case LL_LEASE_UNLCK:
2805 mutex_lock(&lli->lli_och_mutex);
2806 if (fd->fd_lease_och != NULL) {
2807 och = fd->fd_lease_och;
2808 fd->fd_lease_och = NULL;
2810 mutex_unlock(&lli->lli_och_mutex);
2815 fmode = och->och_flags;
2816 rc = ll_lease_close(och, inode, &lease_broken);
2820 rc = ll_lease_och_release(inode, file);
2827 RETURN(ll_lease_type_from_fmode(fmode));
2832 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2834 /* apply for lease */
2835 och = ll_lease_open(inode, file, fmode, 0);
2837 RETURN(PTR_ERR(och));
2840 mutex_lock(&lli->lli_och_mutex);
2841 if (fd->fd_lease_och == NULL) {
2842 fd->fd_lease_och = och;
2845 mutex_unlock(&lli->lli_och_mutex);
2847 /* impossible now that only excl is supported for now */
2848 ll_lease_close(och, inode, &lease_broken);
2853 case LL_IOC_GET_LEASE: {
2854 struct ll_inode_info *lli = ll_i2info(inode);
2855 struct ldlm_lock *lock = NULL;
2858 mutex_lock(&lli->lli_och_mutex);
2859 if (fd->fd_lease_och != NULL) {
2860 struct obd_client_handle *och = fd->fd_lease_och;
2862 lock = ldlm_handle2lock(&och->och_lease_handle);
2864 lock_res_and_lock(lock);
2865 if (!ldlm_is_cancel(lock))
2866 fmode = och->och_flags;
2868 unlock_res_and_lock(lock);
2869 LDLM_LOCK_PUT(lock);
2872 mutex_unlock(&lli->lli_och_mutex);
2874 RETURN(ll_lease_type_from_fmode(fmode));
2876 case LL_IOC_HSM_IMPORT: {
2877 struct hsm_user_import *hui;
2883 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2888 rc = ll_hsm_import(inode, file, hui);
2893 case LL_IOC_FUTIMES_3: {
2894 struct ll_futimes_3 lfu;
2896 if (copy_from_user(&lfu,
2897 (const struct ll_futimes_3 __user *)arg,
2901 RETURN(ll_file_futimes_3(file, &lfu));
2903 case LL_IOC_LADVISE: {
2904 struct llapi_ladvise_hdr *ladvise_hdr;
2907 int alloc_size = sizeof(*ladvise_hdr);
2910 OBD_ALLOC_PTR(ladvise_hdr);
2911 if (ladvise_hdr == NULL)
2914 if (copy_from_user(ladvise_hdr,
2915 (const struct llapi_ladvise_hdr __user *)arg,
2917 GOTO(out_ladvise, rc = -EFAULT);
2919 if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
2920 ladvise_hdr->lah_count < 1)
2921 GOTO(out_ladvise, rc = -EINVAL);
2923 num_advise = ladvise_hdr->lah_count;
2924 if (num_advise >= LAH_COUNT_MAX)
2925 GOTO(out_ladvise, rc = -EFBIG);
2927 OBD_FREE_PTR(ladvise_hdr);
2928 alloc_size = offsetof(typeof(*ladvise_hdr),
2929 lah_advise[num_advise]);
2930 OBD_ALLOC(ladvise_hdr, alloc_size);
2931 if (ladvise_hdr == NULL)
2935 * TODO: submit multiple advices to one server in a single RPC
2937 if (copy_from_user(ladvise_hdr,
2938 (const struct llapi_ladvise_hdr __user *)arg,
2940 GOTO(out_ladvise, rc = -EFAULT);
2942 for (i = 0; i < num_advise; i++) {
2943 rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
2944 &ladvise_hdr->lah_advise[i]);
2950 OBD_FREE(ladvise_hdr, alloc_size);
2953 case LL_IOC_FSGETXATTR:
2954 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
2955 case LL_IOC_FSSETXATTR:
2956 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
2961 ll_iocontrol_call(inode, file, cmd, arg, &err))
2964 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2965 (void __user *)arg));
/*
 * llseek_execute() - fallback helper, compiled only when the kernel lacks
 * generic_file_llseek_size(). Validates the requested offset against the
 * sign/size limits and commits it to f_pos, clearing f_version on a move.
 * NOTE(review): this listing elides interior lines (error returns, braces);
 * visible tokens are preserved verbatim.
 */
2970 #ifndef HAVE_FILE_LLSEEK_SIZE
2971 static inline loff_t
2972 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* Negative offsets are only legal for files opened with unsigned offsets. */
2974 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2976 if (offset > maxsize)
/* Only write f_pos when it actually changes; reset f_version on a move. */
2979 if (offset != file->f_pos) {
2980 file->f_pos = offset;
2981 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels that do not export
 * it. Handles SEEK_CUR without touching f_pos for the zero-offset query,
 * and (per the comments below) treats SEEK_DATA/SEEK_HOLE generically:
 * everything before EOF is data, with a virtual hole at EOF.
 */
2987 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2988 loff_t maxsize, loff_t eof)
2990 struct inode *inode = file_inode(file);
2998 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2999 * position-querying operation. Avoid rewriting the "same"
3000 * f_pos value back to the file because a concurrent read(),
3001 * write() or lseek() might have altered it
3006 * f_lock protects against read/modify/write race with other
3007 * SEEK_CURs. Note that parallel writes and reads behave
3011 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3012 inode_unlock(inode);
3016 * In the generic case the entire file is data, so as long as
3017 * offset isn't at the end of the file then the offset is data.
3024 * There is a virtual hole at the end of the file, so as long as
3025 * offset isn't i_size or larger, return i_size.
3033 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek() - .llseek handler. For SEEK_END/SEEK_HOLE/SEEK_DATA the
 * cluster-wide size must be current, so the inode size is refreshed with
 * ll_glimpse_size() before delegating to ll_generic_file_llseek_size().
 */
3037 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3039 struct inode *inode = file_inode(file);
3040 loff_t retval, eof = 0;
/* retval here is only the provisional target, used for the trace message. */
3043 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3044 (origin == SEEK_CUR) ? file->f_pos : 0);
3045 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3046 PFID(ll_inode2fid(inode)), inode, retval, retval,
3048 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on EOF, which only a glimpse RPC can make current. */
3050 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3051 retval = ll_glimpse_size(inode);
3054 eof = i_size_read(inode);
3057 retval = ll_generic_file_llseek_size(file, offset, origin,
3058 ll_file_maxbytes(inode), eof);
/*
 * ll_flush() - .flush handler, called on every close of a file descriptor.
 * Reports (once) any asynchronous writeback error recorded in lli_async_rc
 * or in the cl_object layers; returns -EIO if such an error is pending and
 * has not already been reported through fd_write_failed.
 */
3062 static int ll_flush(struct file *file, fl_owner_t id)
3064 struct inode *inode = file_inode(file);
3065 struct ll_inode_info *lli = ll_i2info(inode);
3066 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3069 LASSERT(!S_ISDIR(inode->i_mode));
3071 /* catch async errors that were recorded back when async writeback
3072 * failed for pages in this mapping. */
3073 rc = lli->lli_async_rc;
/* Reading clears the sticky error so it is reported exactly once. */
3074 lli->lli_async_rc = 0;
3075 if (lli->lli_clob != NULL) {
3076 err = lov_read_and_clear_async_rc(lli->lli_clob);
3081 /* The application has been told write failure already.
3082 * Do not report failure again. */
3083 if (fd->fd_write_failed)
3085 return rc ? -EIO : 0;
/**
 * Called to make sure a portion of file has been written out.
 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * Return how many pages have been written (fio->fi_nr_written) on success,
 * or a negative errno.
 */
3094 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3095 enum cl_fsync_mode mode, int ignore_layout)
3099 struct cl_fsync_io *fio;
/* Reject any mode outside the four defined cl_fsync_mode values. */
3104 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3105 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3108 env = cl_env_get(&refcheck);
3110 RETURN(PTR_ERR(env));
/* Build a CIT_FSYNC cl_io against this inode's cl_object. */
3112 io = vvp_env_thread_io(env);
3113 io->ci_obj = ll_i2info(inode)->lli_clob;
3114 io->ci_ignore_layout = ignore_layout;
3116 /* initialize parameters for sync */
3117 fio = &io->u.ci_fsync;
3118 fio->fi_start = start;
3120 fio->fi_fid = ll_inode2fid(inode);
3121 fio->fi_mode = mode;
3122 fio->fi_nr_written = 0;
3124 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3125 result = cl_io_loop(env, io);
3127 result = io->ci_result;
/* On success the page count written by the io becomes the return value. */
3129 result = fio->fi_nr_written;
3130 cl_io_fini(env, io);
3131 cl_env_put(env, &refcheck);
/*
 * ll_fsync() - .fsync handler; the three prototypes below match the three
 * kernel API generations selected by the HAVE_FILE_FSYNC_* configure checks.
 *
 * When dentry is provided (the 'else' case), file_dentry() may be
 * null and dentry must be used directly rather than pulled from
 * file_dentry() as is done otherwise.
 */
3142 #ifdef HAVE_FILE_FSYNC_4ARGS
3143 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3145 struct dentry *dentry = file_dentry(file);
3147 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3148 int ll_fsync(struct file *file, int datasync)
3150 struct dentry *dentry = file_dentry(file);
3152 loff_t end = LLONG_MAX;
3154 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3157 loff_t end = LLONG_MAX;
3159 struct inode *inode = dentry->d_inode;
3160 struct ll_inode_info *lli = ll_i2info(inode);
3161 struct ptlrpc_request *req;
3165 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3166 PFID(ll_inode2fid(inode)), inode);
3167 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3169 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg kernels don't hold i_mutex for us: flush the range ourselves and
 * only take the inode lock if the caller hasn't already (lli_inode_locked). */
3170 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3171 lock_inode = !lli->lli_inode_locked;
3175 /* fsync's caller has already called _fdata{sync,write}, we want
3176 * that IO to finish before calling the osc and mdc sync methods */
3177 rc = filemap_fdatawait(inode->i_mapping);
3180 /* catch async errors that were recorded back when async writeback
3181 * failed for pages in this mapping. */
3182 if (!S_ISDIR(inode->i_mode)) {
3183 err = lli->lli_async_rc;
3184 lli->lli_async_rc = 0;
3187 if (lli->lli_clob != NULL) {
3188 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata on the MDS first, then the data on the OSTs below. */
3194 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3198 ptlrpc_req_finished(req);
3200 if (S_ISREG(inode->i_mode)) {
3201 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3203 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3204 if (rc == 0 && err < 0)
/* Track per-fd write failure so ll_flush() does not report it twice. */
3207 fd->fd_write_failed = true;
3209 fd->fd_write_failed = false;
3212 #ifdef HAVE_FILE_FSYNC_4ARGS
3214 inode_unlock(inode);
/*
 * ll_file_flock() - .flock/.lock handler. Translates a VFS file_lock
 * (BSD flock or POSIX fcntl lock) into an LDLM_FLOCK enqueue on the MDS,
 * then mirrors the server's decision into the local VFS lock lists so the
 * kernel's deadlock detection and /proc reporting stay accurate.
 */
3220 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3222 struct inode *inode = file_inode(file);
3223 struct ll_sb_info *sbi = ll_i2sbi(inode);
3224 struct ldlm_enqueue_info einfo = {
3225 .ei_type = LDLM_FLOCK,
3226 .ei_cb_cp = ldlm_flock_completion_ast,
3227 .ei_cbdata = file_lock,
3229 struct md_op_data *op_data;
3230 struct lustre_handle lockh = { 0 };
3231 union ldlm_policy_data flock = { { 0 } };
/* Remember the caller's lock type: it is temporarily overwritten below. */
3232 int fl_type = file_lock->fl_type;
3238 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3239 PFID(ll_inode2fid(inode)), file_lock);
3241 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3243 if (file_lock->fl_flags & FL_FLOCK) {
3244 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3245 /* flocks are whole-file locks */
3246 flock.l_flock.end = OFFSET_MAX;
3247 /* For flocks owner is determined by the local file descriptor */
3248 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3249 } else if (file_lock->fl_flags & FL_POSIX) {
3250 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3251 flock.l_flock.start = file_lock->fl_start;
3252 flock.l_flock.end = file_lock->fl_end;
3256 flock.l_flock.pid = file_lock->fl_pid;
3258 /* Somewhat ugly workaround for svc lockd.
3259 * lockd installs custom fl_lmops->lm_compare_owner that checks
3260 * for the fl_owner to be the same (which it always is on local node
3261 * I guess between lockd processes) and then compares pid.
3262 * As such we assign pid to the owner field to make it all work,
3263 * conflict with normal locks is unlikely since pid space and
3264 * pointer space for current->files are not intersecting */
3265 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3266 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map POSIX lock types onto LDLM modes: read -> PR, write -> PW. */
3270 einfo.ei_mode = LCK_PR;
3273 /* An unlock request may or may not have any relation to
3274 * existing locks so we may not be able to pass a lock handle
3275 * via a normal ldlm_lock_cancel() request. The request may even
3276 * unlock a byte range in the middle of an existing lock. In
3277 * order to process an unlock request we need all of the same
3278 * information that is given with a normal read or write record
3279 * lock request. To avoid creating another ldlm unlock (cancel)
3280 * message we'll treat a LCK_NL flock request as an unlock. */
3281 einfo.ei_mode = LCK_NL;
3284 einfo.ei_mode = LCK_PW;
3287 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-blocking requests: don't wait for a conflicting lock. */
3302 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK and friends: test only, don't actually take the lock. */
3308 flags = LDLM_FL_TEST_LOCK;
3311 CERROR("unknown fcntl lock command: %d\n", cmd);
3315 /* Save the old mode so that if the mode in the lock changes we
3316 * can decrement the appropriate reader or writer refcount. */
3317 file_lock->fl_type = einfo.ei_mode;
3319 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3320 LUSTRE_OPC_ANY, NULL);
3321 if (IS_ERR(op_data))
3322 RETURN(PTR_ERR(op_data));
3324 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3325 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3326 flock.l_flock.pid, flags, einfo.ei_mode,
3327 flock.l_flock.start, flock.l_flock.end);
3329 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3332 /* Restore the file lock type if not TEST lock. */
3333 if (!(flags & LDLM_FL_TEST_LOCK))
3334 file_lock->fl_type = fl_type;
/* Mirror the granted/released lock into the local VFS bookkeeping. */
3336 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3337 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3338 !(flags & LDLM_FL_TEST_LOCK))
3339 rc2 = locks_lock_file_wait(file, file_lock);
3341 if ((file_lock->fl_flags & FL_FLOCK) &&
3342 (rc == 0 || file_lock->fl_type == F_UNLCK))
3343 rc2 = flock_lock_file_wait(file, file_lock);
3344 if ((file_lock->fl_flags & FL_POSIX) &&
3345 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3346 !(flags & LDLM_FL_TEST_LOCK))
3347 rc2 = posix_lock_file_wait(file, file_lock);
3348 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed after a server grant: undo the server lock
 * by enqueueing the same range with LCK_NL (treated as unlock above). */
3350 if (rc2 && file_lock->fl_type != F_UNLCK) {
3351 einfo.ei_mode = LCK_NL;
3352 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3357 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - look up the FID of @name under @parent via a
 * getattr-by-name RPC to the MDS. On success *fid is filled in, and when
 * @inode is non-NULL the inode is instantiated from the reply as well.
 */
3362 int ll_get_fid_by_name(struct inode *parent, const char *name,
3363 int namelen, struct lu_fid *fid,
3364 struct inode **inode)
3366 struct md_op_data *op_data = NULL;
3367 struct mdt_body *body;
3368 struct ptlrpc_request *req;
3372 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3373 LUSTRE_OPC_ANY, NULL);
3374 if (IS_ERR(op_data))
3375 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are needed from the server. */
3377 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3378 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3379 ll_finish_md_op_data(op_data);
3383 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3385 GOTO(out_req, rc = -EFAULT);
3387 *fid = body->mbo_fid1;
3390 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3392 ptlrpc_req_finished(req);
/*
 * ll_migrate() - move @name (a child of @parent) to MDT @mdtidx via a
 * CLI_MIGRATE rename RPC. For regular files a write lease is taken first
 * and the data version is recorded so the server can detect concurrent
 * modification; -EAGAIN from the server triggers a retry (layout changed).
 */
3396 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3397 const char *name, int namelen)
3399 struct dentry *dchild = NULL;
3400 struct inode *child_inode = NULL;
3401 struct md_op_data *op_data;
3402 struct ptlrpc_request *request = NULL;
3403 struct obd_client_handle *och = NULL;
3405 struct mdt_body *body;
3407 __u64 data_version = 0;
3410 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3411 name, PFID(ll_inode2fid(parent)), mdtidx);
3413 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3414 0, LUSTRE_OPC_ANY, NULL);
3415 if (IS_ERR(op_data))
3416 RETURN(PTR_ERR(op_data));
3418 /* Get child FID first */
/* Prefer the dcache; fall back to an MDS lookup if the child is not
 * cached locally. */
3419 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3422 dchild = d_lookup(file_dentry(file), &qstr);
3423 if (dchild != NULL) {
3424 if (dchild->d_inode != NULL)
3425 child_inode = igrab(dchild->d_inode);
3429 if (child_inode == NULL) {
3430 rc = ll_get_fid_by_name(parent, name, namelen,
3431 &op_data->op_fid3, &child_inode);
3436 if (child_inode == NULL)
3437 GOTO(out_free, rc = -EINVAL);
/*
 * lfs migrate command needs to be blocked on the client
 * by checking the migrate FID against the FID of the
 * filesystem root (migrating the root is not allowed).
 */
3444 if (child_inode == parent->i_sb->s_root->d_inode)
3445 GOTO(out_iput, rc = -EINVAL);
3447 inode_lock(child_inode);
3448 op_data->op_fid3 = *ll_inode2fid(child_inode);
3449 if (!fid_is_sane(&op_data->op_fid3)) {
3450 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3451 ll_get_fsname(parent->i_sb, NULL, 0), name,
3452 PFID(&op_data->op_fid3));
3453 GOTO(out_unlock, rc = -EINVAL);
/* Nothing to do if the child already lives on the target MDT. */
3456 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3458 GOTO(out_unlock, rc);
3461 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3462 PFID(&op_data->op_fid3), mdtidx);
3463 GOTO(out_unlock, rc = 0);
/* Regular file: take a write lease and snapshot the data version so
 * the server can fail the migration if the data changes underneath. */
3466 if (S_ISREG(child_inode->i_mode)) {
3467 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3471 GOTO(out_unlock, rc);
3474 rc = ll_data_version(child_inode, &data_version,
3477 GOTO(out_close, rc);
3479 op_data->op_handle = och->och_fh;
3480 op_data->op_data = och->och_mod;
3481 op_data->op_data_version = data_version;
3482 op_data->op_lease_handle = och->och_lease_handle;
3483 op_data->op_bias |= MDS_RENAME_MIGRATE;
3486 op_data->op_mds = mdtidx;
3487 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is expressed as a rename of the entry onto itself. */
3488 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3489 namelen, name, namelen, &request);
3491 LASSERT(request != NULL);
3492 ll_update_times(request, parent);
3494 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
3495 LASSERT(body != NULL);
3497 /* If the server does release layout lock, then we cleanup
3498 * the client och here, otherwise release it in out_close: */
3500 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
3501 obd_mod_put(och->och_mod);
3502 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
3504 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
3510 if (request != NULL) {
3511 ptlrpc_req_finished(request);
3515 /* Try again if the file layout has changed. */
3516 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
3520 if (och != NULL) /* close the file */
3521 ll_lease_close(och, child_inode, NULL);
3523 clear_nlink(child_inode);
3525 inode_unlock(child_inode);
3529 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock() - .flock/.lock stub for -o noflock mounts; body (elided
 * in this listing) rejects all lock requests rather than enqueueing them.
 */
3534 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * \param bits [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
 */
3551 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
3553 struct lustre_handle lockh;
3554 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": search the union of all normal modes. */
3555 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
3556 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
3565 fid = &ll_i2info(inode)->lli_fid;
3566 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3567 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching granted lock, take no reference. */
3569 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3570 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* NOTE(review): `1 << i` is an int shift against a __u64 mask; safe only
 * while MDS_INODELOCK_MAXSHIFT < 31 -- confirm if the shift range grows. */
3571 policy.l_inodebits.bits = *bits & (1 << i);
3572 if (policy.l_inodebits.bits == 0)
3575 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3576 &policy, mode, &lockh)) {
3577 struct ldlm_lock *lock;
3579 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
3582 ~(lock->l_policy_data.l_inodebits.bits);
3583 LDLM_LOCK_PUT(lock);
3585 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - match (and reference, unlike ll_have_md_lock) a
 * granted IBITS lock on this inode covering @bits in @mode; the matched
 * handle is returned through @lockh.
 */
3592 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
3593 struct lustre_handle *lockh, __u64 flags,
3594 enum ldlm_mode mode)
3596 union ldlm_policy_data policy = { .l_inodebits = { bits } };
3601 fid = &ll_i2info(inode)->lli_fid;
3602 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3604 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3605 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - common post-processing of a revalidation
 * RPC result: translate -ENOENT for already-unlinked objects, and log
 * other failures (quietly for expected -EACCES/-EIDRM).
 */
3610 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3612 /* Already unlinked. Just update nlink and return success */
3613 if (rc == -ENOENT) {
3615 /* If it is striped directory, and there is bad stripe
3616 * Let's revalidate the dentry again, instead of returning
3618 if (S_ISDIR(inode->i_mode) &&
3619 ll_i2info(inode)->lli_lsm_md != NULL)
3622 /* This path cannot be hit for regular files unless in
3623 * case of obscure races, so no need to validate
3625 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3627 } else if (rc != 0) {
3628 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3629 "%s: revalidate FID "DFID" error: rc = %d\n",
3630 ll_get_fsname(inode->i_sb, NULL, 0),
3631 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh this dentry's metadata from the MDS.
 * Two strategies: an intent-lock getattr-by-FID when the server supports
 * OBD_CONNECT_ATTRFID, otherwise a plain md_getattr() issued only when no
 * covering IBITS lock is already held locally.
 */
3637 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3639 struct inode *inode = dentry->d_inode;
3640 struct ptlrpc_request *req = NULL;
3641 struct obd_export *exp;
3645 LASSERT(inode != NULL);
3647 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3648 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3650 exp = ll_i2mdexp(inode);
3652 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3653 * But under CMD case, it caused some lock issues, should be fixed
3654 * with new CMD ibits lock. See bug 12718 */
3655 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3656 struct lookup_intent oit = { .it_op = IT_GETATTR };
3657 struct md_op_data *op_data;
3659 if (ibits == MDS_INODELOCK_LOOKUP)
3660 oit.it_op = IT_LOOKUP;
3662 /* Call getattr by fid, so do not provide name at all. */
3663 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3664 dentry->d_inode, NULL, 0, 0,
3665 LUSTRE_OPC_ANY, NULL);
3666 if (IS_ERR(op_data))
3667 RETURN(PTR_ERR(op_data));
3669 rc = md_intent_lock(exp, op_data, &oit, &req,
3670 &ll_md_blocking_ast, 0);
3671 ll_finish_md_op_data(op_data);
3673 rc = ll_inode_revalidate_fini(inode, rc);
3677 rc = ll_revalidate_it_finish(req, &oit, dentry);
3679 ll_intent_release(&oit);
3683 /* Unlinked? Unhash dentry, so it is not picked up later by
3684 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3685 here to preserve get_cwd functionality on 2.6.
3687 if (!dentry->d_inode->i_nlink) {
3688 ll_lock_dcache(inode);
3689 d_lustre_invalidate(dentry, 0);
3690 ll_unlock_dcache(inode);
3693 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: skip the RPC entirely if a local IBITS lock
 * already covers every requested bit. */
3694 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3695 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3696 u64 valid = OBD_MD_FLGETATTR;
3697 struct md_op_data *op_data;
/* Regular files also need striping (EA) data in the reply. */
3700 if (S_ISREG(inode->i_mode)) {
3701 rc = ll_get_default_mdsize(sbi, &ealen);
3704 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3707 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3708 0, ealen, LUSTRE_OPC_ANY,
3710 if (IS_ERR(op_data))
3711 RETURN(PTR_ERR(op_data));
3713 op_data->op_valid = valid;
3714 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3715 ll_finish_md_op_data(op_data);
3717 rc = ll_inode_revalidate_fini(inode, rc);
3721 rc = ll_prep_inode(&inode, req, NULL, NULL);
3724 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - for a striped directory, merge the attributes of
 * all stripes (via md_merge_attr) into the master inode: nlink, blocks,
 * size and the cached a/m/ctime in ll_inode_info.
 */
3728 static int ll_merge_md_attr(struct inode *inode)
3730 struct cl_attr attr = { 0 };
/* Only meaningful for striped directories (lli_lsm_md set). */
3733 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3734 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3735 &attr, ll_md_blocking_ast);
3739 set_nlink(inode, attr.cat_nlink);
3740 inode->i_blocks = attr.cat_blocks;
3741 i_size_write(inode, attr.cat_size);
3743 ll_i2info(inode)->lli_atime = attr.cat_atime;
3744 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3745 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - full revalidation: metadata via
 * __ll_inode_revalidate(), then size/times — merged stripe attributes for
 * striped directories, or a glimpse RPC for regular files (skipped while
 * an HSM restore is running, see comment below).
 */
3751 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3753 struct inode *inode = dentry->d_inode;
3757 rc = __ll_inode_revalidate(dentry, ibits);
3761 /* if object isn't regular file, don't validate size */
3762 if (!S_ISREG(inode->i_mode)) {
3763 if (S_ISDIR(inode->i_mode) &&
3764 ll_i2info(inode)->lli_lsm_md != NULL) {
3765 rc = ll_merge_md_attr(inode);
/* Propagate the cached lli_* times into the VFS inode. */
3770 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3771 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3772 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3774 /* In case of restore, the MDT has the right size and has
3775 * already sent it back without granting the layout lock,
3776 * inode is up-to-date so glimpse is useless.
3777 * Also to glimpse we need the layout, in case of a running
3778 * restore the MDT holds the layout lock so the glimpse will
3779 * block up to the end of restore (getattr will block)
3781 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3782 rc = ll_glimpse_size(inode);
/*
 * ll_compat_encode_dev() - squeeze a device number into the 8-bit
 * major/minor range so 32-bit compat stat syscalls accept it.
 */
3787 static inline dev_t ll_compat_encode_dev(dev_t dev)
3789 /* The compat_sys_*stat*() syscalls will fail unless the
3790 * device majors and minors are both less than 256. Note that
3791 * the value returned here will be passed through
3792 * old_encode_dev() in cp_compat_stat(). And so we are not
3793 * trying to return a valid compat (u16) device number, just
3794 * one that will pass the old_valid_dev() check. */
3796 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ll_getattr() - .getattr handler: revalidate UPDATE|LOOKUP metadata,
 * then fill *stat from the (now fresh) inode. 32-bit API callers get a
 * hashed 32-bit inode number and compat-encoded device numbers.
 */
3799 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3801 struct inode *inode = de->d_inode;
3802 struct ll_sb_info *sbi = ll_i2sbi(inode);
3803 struct ll_inode_info *lli = ll_i2info(inode);
3806 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3807 MDS_INODELOCK_LOOKUP);
3808 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
/* Fault-injection hook for testing delayed getattr. */
3813 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
3815 if (ll_need_32bit_api(sbi)) {
3816 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3817 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
3818 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
3820 stat->ino = inode->i_ino;
3821 stat->dev = inode->i_sb->s_dev;
3822 stat->rdev = inode->i_rdev;
3825 stat->mode = inode->i_mode;
3826 stat->uid = inode->i_uid;
3827 stat->gid = inode->i_gid;
3828 stat->atime = inode->i_atime;
3829 stat->mtime = inode->i_mtime;
3830 stat->ctime = inode->i_ctime;
/* Prefer the tunable stat blocksize if the admin configured one. */
3831 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
3833 stat->nlink = inode->i_nlink;
3834 stat->size = i_size_read(inode);
3835 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap() - .fiemap handler: marshal the kernel's fiemap_extent_info
 * into a struct fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer.
 */
3840 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3841 __u64 start, __u64 len)
3845 struct fiemap *fiemap;
3846 unsigned int extent_count = fieinfo->fi_extents_max;
/* NOTE(review): extent_count * sizeof(struct fiemap_extent) has no
 * explicit overflow check here -- confirm fi_extents_max is bounded by
 * the caller (the VFS fiemap path limits it) before relying on this. */
3848 num_bytes = sizeof(*fiemap) + (extent_count *
3849 sizeof(struct fiemap_extent));
3850 OBD_ALLOC_LARGE(fiemap, num_bytes);
3855 fiemap->fm_flags = fieinfo->fi_flags;
3856 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3857 fiemap->fm_start = start;
3858 fiemap->fm_length = len;
/* Seed only the first extent from userspace (continuation support). */
3859 if (extent_count > 0 &&
3860 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3861 sizeof(struct fiemap_extent)) != 0)
3862 GOTO(out, rc = -EFAULT);
3864 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3866 fieinfo->fi_flags = fiemap->fm_flags;
3867 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3868 if (extent_count > 0 &&
3869 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3870 fiemap->fm_mapped_extents *
3871 sizeof(struct fiemap_extent)) != 0)
3872 GOTO(out, rc = -EFAULT);
3874 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - return a referenced copy of the cached POSIX ACL,
 * duplicated under lli_lock so the cached pointer cannot change mid-copy.
 */
3878 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3880 struct ll_inode_info *lli = ll_i2info(inode);
3881 struct posix_acl *acl = NULL;
3884 spin_lock(&lli->lli_lock);
3885 /* VFS' acl_permission_check->check_acl will release the refcount */
3886 acl = posix_acl_dup(lli->lli_posix_acl);
3887 spin_unlock(&lli->lli_lock);
/*
 * ll_set_acl() - .set_acl handler (kernels with inode_operations->set_acl).
 * Serializes the ACL to its xattr representation and stores it via
 * __vfs_setxattr(), then refreshes the VFS ACL cache.
 */
3892 #ifdef HAVE_IOP_SET_ACL
3893 #ifdef CONFIG_FS_POSIX_ACL
3894 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
3896 const char *name = NULL;
3903 case ACL_TYPE_ACCESS:
/* An access ACL may fold into i_mode; posix_acl_update_mode may also
 * drop the ACL entirely (sets acl to NULL). */
3905 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
3909 name = XATTR_NAME_POSIX_ACL_ACCESS;
3911 case ACL_TYPE_DEFAULT:
/* Default ACLs only make sense on directories. */
3912 if (!S_ISDIR(inode->i_mode))
3913 GOTO(out, rc = acl ? -EACCES : 0);
3914 name = XATTR_NAME_POSIX_ACL_DEFAULT;
3917 GOTO(out, rc = -EINVAL);
3921 size = posix_acl_xattr_size(acl->a_count);
3922 value = kmalloc(size, GFP_NOFS);
3924 GOTO(out, rc = -ENOMEM);
3926 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
3931 /* dentry is only used for *.lov attributes so it's safe to be NULL */
3932 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
3937 set_cached_acl(inode, type, acl);
3939 forget_cached_acl(inode, type);
3942 #endif /* CONFIG_FS_POSIX_ACL */
3943 #endif /* HAVE_IOP_SET_ACL */
/*
 * ll_check_acl() - ACL callback passed to generic_permission() on kernels
 * whose generic_permission() takes a check_acl function pointer. Checks the
 * access ACL via posix_acl_permission(); RCU-walk callers are bounced so
 * the ACL lookup can be done in ref-walk mode.
 */
3945 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3947 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3948 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3950 ll_check_acl(struct inode *inode, int mask)
3953 # ifdef CONFIG_FS_POSIX_ACL
3954 struct posix_acl *acl;
3958 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3959 if (flags & IPERM_FLAG_RCU)
3962 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3967 rc = posix_acl_permission(inode, acl, mask);
/* ll_get_acl() returned a reference; drop it after the check. */
3968 posix_acl_release(acl);
3971 # else /* !CONFIG_FS_POSIX_ACL */
3973 # endif /* CONFIG_FS_POSIX_ACL */
3975 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - .permission handler (three prototypes for three
 * kernel API generations). Revalidates the root inode on first touch,
 * applies root-squash by temporarily overriding the task credentials, then
 * defers to generic_permission() with the ACL callback.
 */
3977 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3978 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3980 # ifdef HAVE_INODE_PERMISION_2ARGS
3981 int ll_inode_permission(struct inode *inode, int mask)
3983 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3988 struct ll_sb_info *sbi;
3989 struct root_squash_info *squash;
3990 struct cred *cred = NULL;
3991 const struct cred *old_cred = NULL;
3993 bool squash_id = false;
/* RCU-walk cannot sleep; bounce to ref-walk before doing RPCs. */
3996 #ifdef MAY_NOT_BLOCK
3997 if (mask & MAY_NOT_BLOCK)
3999 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4000 if (flags & IPERM_FLAG_RCU)
4004 /* as root inode are NOT getting validated in lookup operation,
4005 * need to do it before permission check. */
4007 if (inode == inode->i_sb->s_root->d_inode) {
4008 rc = __ll_inode_revalidate(inode->i_sb->s_root,
4009 MDS_INODELOCK_LOOKUP);
4014 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4015 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4017 /* squash fsuid/fsgid if needed */
4018 sbi = ll_i2sbi(inode);
4019 squash = &sbi->ll_squash;
/* Squash only when configured (rsi_uid != 0), the caller is root, and
 * this client is not on the nosquash list. */
4020 if (unlikely(squash->rsi_uid != 0 &&
4021 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4022 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4026 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4027 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4028 squash->rsi_uid, squash->rsi_gid);
4030 /* update current process's credentials
4031 * and FS capability */
4032 cred = prepare_creds();
4036 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4037 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
4038 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4039 if ((1 << cap) & CFS_CAP_FS_MASK)
4040 cap_lower(cred->cap_effective, cap);
4042 old_cred = override_creds(cred);
4045 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4046 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4047 /* restore current process's credentials and FS capability */
4049 revert_creds(old_cred);
4056 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file_operations table (mounts without -o flock/noflock):
 * no .flock/.lock methods, so the kernel falls back to local-only
 * POSIX/flock semantics.  Read/write entries are chosen at configure
 * time: iter-based interfaces when the kernel has them, aio-based
 * otherwise.
 */
4057 struct file_operations ll_file_operations = {
4058 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4059 # ifdef HAVE_SYNC_READ_WRITE
/* kernel-provided sync wrappers around the *_iter methods */
4060 .read = new_sync_read,
4061 .write = new_sync_write,
4063 .read_iter = ll_file_read_iter,
4064 .write_iter = ll_file_write_iter,
4065 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4066 .read = ll_file_read,
4067 .aio_read = ll_file_aio_read,
4068 .write = ll_file_write,
4069 .aio_write = ll_file_aio_write,
4070 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4071 .unlocked_ioctl = ll_file_ioctl,
4072 .open = ll_file_open,
4073 .release = ll_file_release,
4074 .mmap = ll_file_mmap,
4075 .llseek = ll_file_seek,
4076 .splice_read = ll_file_splice_read,
/*
 * file_operations for -o flock mounts: identical to ll_file_operations
 * except .flock/.lock are wired to ll_file_flock for cluster-coherent
 * advisory locking via the DLM.
 */
4081 struct file_operations ll_file_operations_flock = {
4082 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4083 # ifdef HAVE_SYNC_READ_WRITE
/* kernel-provided sync wrappers around the *_iter methods */
4084 .read = new_sync_read,
4085 .write = new_sync_write,
4086 # endif /* HAVE_SYNC_READ_WRITE */
4087 .read_iter = ll_file_read_iter,
4088 .write_iter = ll_file_write_iter,
4089 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4090 .read = ll_file_read,
4091 .aio_read = ll_file_aio_read,
4092 .write = ll_file_write,
4093 .aio_write = ll_file_aio_write,
4094 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4095 .unlocked_ioctl = ll_file_ioctl,
4096 .open = ll_file_open,
4097 .release = ll_file_release,
4098 .mmap = ll_file_mmap,
4099 .llseek = ll_file_seek,
4100 .splice_read = ll_file_splice_read,
/* both flock() and POSIX fcntl locks go through the DLM */
4103 .flock = ll_file_flock,
4104 .lock = ll_file_flock
4107 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * file_operations for -o noflock mounts: same as the default table but
 * .flock/.lock point at ll_file_noflock so lock requests fail
 * explicitly instead of silently being local-only.
 */
4108 struct file_operations ll_file_operations_noflock = {
4109 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4110 # ifdef HAVE_SYNC_READ_WRITE
/* kernel-provided sync wrappers around the *_iter methods */
4111 .read = new_sync_read,
4112 .write = new_sync_write,
4113 # endif /* HAVE_SYNC_READ_WRITE */
4114 .read_iter = ll_file_read_iter,
4115 .write_iter = ll_file_write_iter,
4116 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4117 .read = ll_file_read,
4118 .aio_read = ll_file_aio_read,
4119 .write = ll_file_write,
4120 .aio_write = ll_file_aio_write,
4121 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4122 .unlocked_ioctl = ll_file_ioctl,
4123 .open = ll_file_open,
4124 .release = ll_file_release,
4125 .mmap = ll_file_mmap,
4126 .llseek = ll_file_seek,
4127 .splice_read = ll_file_splice_read,
4130 .flock = ll_file_noflock,
4131 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files.  xattr and ACL entries are
 * conditional on the kernel providing the corresponding iop hooks.
 */
4134 struct inode_operations ll_file_inode_operations = {
4135 .setattr = ll_setattr,
4136 .getattr = ll_getattr,
4137 .permission = ll_inode_permission,
4138 #ifdef HAVE_IOP_XATTR
4139 .setxattr = ll_setxattr,
4140 .getxattr = ll_getxattr,
4141 .removexattr = ll_removexattr,
4143 .listxattr = ll_listxattr,
4144 .fiemap = ll_fiemap,
4145 #ifdef HAVE_IOP_GET_ACL
4146 .get_acl = ll_get_acl,
4148 #ifdef HAVE_IOP_SET_ACL
4149 .set_acl = ll_set_acl,
4153 /* dynamic ioctl number support routins */
/*
 * Registry of dynamically registered ioctl handlers.  A single
 * file-scope instance ("llioc") holds the list of llioc_data entries,
 * protected by an rw_semaphore: readers dispatch ioctls, writers
 * register/unregister handler blocks.
 */
4154 static struct llioc_ctl_data {
4155 struct rw_semaphore ioc_sem;
4156 struct list_head ioc_head;
4158 __RWSEM_INITIALIZER(llioc.ioc_sem),
4159 LIST_HEAD_INIT(llioc.ioc_head)
/*
 * One registered handler block: a callback plus the array of ioctl
 * command numbers it services (flexible trailing array iocd_cmd).
 */
4164 struct list_head iocd_list;
4165 unsigned int iocd_size;
4166 llioc_callback_t iocd_cb;
4167 unsigned int iocd_count;
4168 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler.
 *
 * \param cb     callback invoked for matching commands
 * \param count  number of entries in \a cmd (bounded by LLIOC_MAX_CMD)
 * \param cmd    array of ioctl command numbers serviced by \a cb
 *
 * Allocates an llioc_data record sized for \a count commands, copies the
 * command list in, and appends it to the global llioc registry under the
 * write semaphore.  The returned record pointer doubles as the "magic"
 * cookie passed back to ll_iocontrol_unregister().
 *
 * NOTE(review): the failure-return lines are elided in this extract;
 * the visible code shows only the success path.
 */
4171 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
4174 struct llioc_data *in_data = NULL;
/* reject a NULL callback/command list or an out-of-range count */
4177 if (cb == NULL || cmd == NULL ||
4178 count > LLIOC_MAX_CMD || count < 0)
4181 size = sizeof(*in_data) + count * sizeof(unsigned int);
4182 OBD_ALLOC(in_data, size);
4183 if (in_data == NULL)
4186 memset(in_data, 0, sizeof(*in_data));
4187 in_data->iocd_size = size;
4188 in_data->iocd_cb = cb;
4189 in_data->iocd_count = count;
4190 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the new handler block under the registry write lock */
4192 down_write(&llioc.ioc_sem);
4193 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
4194 up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  Walks the registry under the write
 * semaphore; on a match the entry is unlinked, the lock dropped, and
 * the record freed.  If no entry matches, the lock is dropped and a
 * warning is logged.
 *
 * NOTE(review): the comparison against \a magic and the early returns
 * are elided in this extract; only unlink/free/warn lines are visible.
 */
4199 void ll_iocontrol_unregister(void *magic)
4201 struct llioc_data *tmp;
4206 down_write(&llioc.ioc_sem);
4207 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
4209 unsigned int size = tmp->iocd_size;
/* unlink first, then drop the lock before freeing */
4211 list_del(&tmp->iocd_list);
4212 up_write(&llioc.ioc_sem);
4214 OBD_FREE(tmp, size);
4218 up_write(&llioc.ioc_sem);
4220 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
4223 EXPORT_SYMBOL(ll_iocontrol_register);
4224 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the dynamically registered handlers.
 *
 * Under the registry read lock, scan each handler's command array; when
 * \a cmd matches, invoke the handler's callback, storing its status via
 * \a rcp.  Iteration stops once a callback returns LLIOC_STOP;
 * otherwise the default result is LLIOC_CONT (command not consumed)
 * with rc defaulting to -EINVAL.
 */
4226 static enum llioc_iter
4227 ll_iocontrol_call(struct inode *inode, struct file *file,
4228 unsigned int cmd, unsigned long arg, int *rcp)
4230 enum llioc_iter ret = LLIOC_CONT;
4231 struct llioc_data *data;
4232 int rc = -EINVAL, i;
4234 down_read(&llioc.ioc_sem);
4235 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
4236 for (i = 0; i < data->iocd_count; i++) {
4237 if (cmd != data->iocd_cmd[i])
4240 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc)
4244 if (ret == LLIOC_STOP)
4247 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for \a inode.
 *
 * Calls cl_conf_set() with \a conf; for OBJECT_CONF_SET operations the
 * associated layout DLM lock is then allowed to match (must happen only
 * after the layout is applied, or other threads could observe a stale
 * layout), and the inode's cached layout generation is refreshed from
 * the object.
 *
 * NOTE(review): early-exit checks and error paths between the visible
 * lines are elided in this extract.
 */
4254 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4256 struct ll_inode_info *lli = ll_i2info(inode);
4257 struct cl_object *obj = lli->lli_clob;
4266 env = cl_env_get(&refcheck);
4268 RETURN(PTR_ERR(env));
4270 rc = cl_conf_set(env, lli->lli_clob, conf);
4274 if (conf->coc_opc == OBJECT_CONF_SET) {
4275 struct ldlm_lock *lock = conf->coc_lock;
4276 struct cl_layout cl = {
4280 LASSERT(lock != NULL);
4281 LASSERT(ldlm_has_layout(lock));
4283 /* it can only be allowed to match after layout is
4284 * applied to inode otherwise false layout would be
4285 * seen. Applying layout shoud happen before dropping
4286 * the intent lock. */
4287 ldlm_lock_allow_match(lock);
4289 rc = cl_object_layout_get(env, obj, &cl);
4294 DFID": layout version change: %u -> %u\n",
4295 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4297 ll_layout_version_set(lli, cl.cl_layout_gen);
4301 cl_env_put(env, &refcheck);
4306 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Ensure \a lock carries the file's LOV layout in its LVB.
 *
 * If l_lvb_data is already populated there is nothing to do.  Otherwise
 * fetch the layout from the MDT via an OBD_MD_FLXATTR getxattr on
 * XATTR_NAME_LOV, copy it into a freshly allocated buffer, and install
 * it as the lock's LVB under the resource lock.  If another thread won
 * the race and installed an LVB first, the local buffer is freed.
 */
4307 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4310 struct ll_sb_info *sbi = ll_i2sbi(inode);
4311 struct ptlrpc_request *req;
4312 struct mdt_body *body;
4319 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4320 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4321 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached to the lock; nothing to fetch */
4323 if (lock->l_lvb_data != NULL)
4326 /* if layout lock was granted right away, the layout is returned
4327 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4328 * blocked and then granted via completion ast, we have to fetch
4329 * layout here. Please note that we can't use the LVB buffer in
4330 * completion AST because it doesn't have a large enough buffer */
4331 rc = ll_get_default_mdsize(sbi, &lmmsize);
4333 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4334 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4339 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4341 GOTO(out, rc = -EPROTO);
4343 lmmsize = body->mbo_eadatasize;
4344 if (lmmsize == 0) /* empty layout */
4347 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4349 GOTO(out, rc = -EFAULT);
4351 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4352 if (lvbdata == NULL)
4353 GOTO(out, rc = -ENOMEM);
/* copy out of the rpc reply, then attach to the lock if still unset */
4355 memcpy(lvbdata, lmm, lmmsize);
4356 lock_res_and_lock(lock);
4357 if (unlikely(lock->l_lvb_data == NULL)) {
4358 lock->l_lvb_type = LVB_T_LAYOUT;
4359 lock->l_lvb_data = lvbdata;
4360 lock->l_lvb_len = lmmsize;
4363 unlock_res_and_lock(lock);
/* lost the install race (or install skipped): free our copy */
4366 OBD_FREE_LARGE(lvbdata, lmmsize);
4371 ptlrpc_req_finished(req);
4376 * Apply the layout to the inode. Layout lock is held and will be released
4379 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4380 struct inode *inode)
/*
 * Given a granted layout lock handle (\a lockh, held in \a mode), fetch
 * the layout into the lock's LVB if needed, apply it to the inode via an
 * OBJECT_CONF_SET, then drop the lock reference.  If applying the layout
 * failed with -EBUSY (ongoing IO still using the old layout), issue an
 * OBJECT_CONF_WAIT so the caller can retry once IO drains.
 *
 * NOTE(review): several intermediate lines (early exits, the lvb_ready
 * fast path, the final return) are elided in this extract.
 */
4382 struct ll_inode_info *lli = ll_i2info(inode);
4383 struct ll_sb_info *sbi = ll_i2sbi(inode);
4384 struct ldlm_lock *lock;
4385 struct cl_object_conf conf;
4388 bool wait_layout = false;
4391 LASSERT(lustre_handle_is_used(lockh));
4393 lock = ldlm_handle2lock(lockh);
4394 LASSERT(lock != NULL);
4395 LASSERT(ldlm_has_layout(lock));
4397 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4398 PFID(&lli->lli_fid), inode);
4400 /* in case this is a caching lock and reinstate with new inode */
4401 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4403 lock_res_and_lock(lock);
4404 lvb_ready = ldlm_is_lvb_ready(lock);
4405 unlock_res_and_lock(lock);
4407 /* checking lvb_ready is racy but this is okay. The worst case is
4408 * that multi processes may configure the file on the same time. */
4412 rc = ll_layout_fetch(inode, lock);
4416 /* for layout lock, lmm is stored in lock's lvb.
4417 * lvb_data is immutable if the lock is held so it's safe to access it
4420 * set layout to file. Unlikely this will fail as old layout was
4421 * surely eliminated */
4422 memset(&conf, 0, sizeof conf);
4423 conf.coc_opc = OBJECT_CONF_SET;
4424 conf.coc_inode = inode;
4425 conf.coc_lock = lock;
4426 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4427 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4428 rc = ll_layout_conf(inode, &conf);
4430 /* refresh layout failed, need to wait */
4431 wait_layout = rc == -EBUSY;
/* done with the lock: drop our reference and the caller's mode ref */
4434 LDLM_LOCK_PUT(lock);
4435 ldlm_lock_decref(lockh, mode);
4437 /* wait for IO to complete if it's still being used. */
4439 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4440 ll_get_fsname(inode->i_sb, NULL, 0),
4441 PFID(&lli->lli_fid), inode);
4443 memset(&conf, 0, sizeof conf);
4444 conf.coc_opc = OBJECT_CONF_WAIT;
4445 conf.coc_inode = inode;
4446 rc = ll_layout_conf(inode, &conf);
4450 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4451 ll_get_fsname(inode->i_sb, NULL, 0),
4452 PFID(&lli->lli_fid), rc);
4458 * Issue layout intent RPC to MDS.
4459 * \param inode [in] file inode
4460 * \param intent [in] layout intent
4462 * \retval 0 on success
4463 * \retval < 0 error code
/*
 * Sends an IT_LAYOUT intent lock request carrying \a intent as opaque
 * op_data.  Write/truncate intents request FMODE_WRITE so the MDS can
 * instantiate layout components for writing.  On success the returned
 * lock data is attached to the inode; the intent lock reference is
 * dropped before returning.
 */
4465 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4467 struct ll_inode_info *lli = ll_i2info(inode);
4468 struct ll_sb_info *sbi = ll_i2sbi(inode);
4469 struct md_op_data *op_data;
4470 struct lookup_intent it;
4471 struct ptlrpc_request *req;
4475 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4476 0, 0, LUSTRE_OPC_ANY, NULL);
4477 if (IS_ERR(op_data))
4478 RETURN(PTR_ERR(op_data));
/* the layout intent travels to the MDS as opaque request data */
4480 op_data->op_data = intent;
4481 op_data->op_data_size = sizeof(*intent);
4483 memset(&it, 0, sizeof(it));
4484 it.it_op = IT_LAYOUT;
4485 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4486 intent->li_opc == LAYOUT_INTENT_TRUNC)
4487 it.it_flags = FMODE_WRITE;
4489 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4490 ll_get_fsname(inode->i_sb, NULL, 0),
4491 PFID(&lli->lli_fid), inode);
4493 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4494 &ll_md_blocking_ast, 0);
/* the reply request is no longer needed once the intent completes */
4495 if (it.it_request != NULL)
4496 ptlrpc_req_finished(it.it_request);
4497 it.it_request = NULL;
4499 ll_finish_md_op_data(op_data);
4501 /* set lock data in case this is a new lock */
4503 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4505 ll_intent_drop_lock(&it);
4511 * This function checks if there exists a LAYOUT lock on the client side,
4512 * or enqueues it if it doesn't have one in cache.
4514 * This function will not hold layout lock so it may be revoked any time after
4515 * this function returns. Any operations depend on layout should be redone
4518 * This function should be called before lov_io_init() to get an uptodate
4519 * layout version, the caller should save the version number and after IO
4520 * is finished, this function should be called again to verify that layout
4521 * is not changed during IO time.
/*
 * On return *gen holds the current layout generation.  Fast path: if
 * layout locking is disabled on this mount or a generation is already
 * cached, return immediately.  Otherwise, under lli_layout_mutex, first
 * try to match a cached layout lock in any mode; on a miss, issue a
 * LAYOUT_INTENT_ACCESS intent to the MDS (ll_layout_intent), then apply
 * the resulting lock.  NOTE(review): the retry loop between intent and
 * re-match is partly elided in this extract.
 */
4523 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4525 struct ll_inode_info *lli = ll_i2info(inode);
4526 struct ll_sb_info *sbi = ll_i2sbi(inode);
4527 struct lustre_handle lockh;
4528 struct layout_intent intent = {
4529 .li_opc = LAYOUT_INTENT_ACCESS,
4531 enum ldlm_mode mode;
4535 *gen = ll_layout_version_get(lli);
4536 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4540 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4541 LASSERT(S_ISREG(inode->i_mode));
4543 /* take layout lock mutex to enqueue layout lock exclusively. */
4544 mutex_lock(&lli->lli_layout_mutex);
4547 /* mostly layout lock is caching on the local side, so try to
4548 * match it before grabbing layout lock mutex. */
4549 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4550 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4551 if (mode != 0) { /* hit cached lock */
4552 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: ask the MDS for one via a layout intent */
4558 rc = ll_layout_intent(inode, &intent);
4564 *gen = ll_layout_version_get(lli);
4565 mutex_unlock(&lli->lli_layout_mutex);
4571 * Issue layout intent RPC indicating where in a file an IO is about to write.
4573 * \param[in] inode file inode.
4574 * \param[in] start start offset of fille in bytes where an IO is about to
4576 * \param[in] end exclusive end offset in bytes of the write range.
4578 * \retval 0 on success
4579 * \retval < 0 error code
/*
 * Thin wrapper around ll_layout_intent(): builds a LAYOUT_INTENT_WRITE
 * intent (extent fields elided in this extract) so the MDS can
 * instantiate layout components covering [start, end) before the write.
 */
4581 int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
4583 struct layout_intent intent = {
4584 .li_opc = LAYOUT_INTENT_WRITE,
4591 rc = ll_layout_intent(inode, &intent);
4597 * This function send a restore request to the MDT
4599 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4601 struct hsm_user_request *hur;
4605 len = sizeof(struct hsm_user_request) +
4606 sizeof(struct hsm_user_item);
4607 OBD_ALLOC(hur, len);
4611 hur->hur_request.hr_action = HUA_RESTORE;
4612 hur->hur_request.hr_archive_id = 0;
4613 hur->hur_request.hr_flags = 0;
4614 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4615 sizeof(hur->hur_user_item[0].hui_fid));
4616 hur->hur_user_item[0].hui_extent.offset = offset;
4617 hur->hur_user_item[0].hui_extent.length = length;
4618 hur->hur_request.hr_itemcount = 1;
4619 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,