4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its dedicated slab (GFP_NOFS to
 * avoid filesystem re-entry during reclaim).
 * NOTE(review): interior lines (braces, NULL check, return) are elided in
 * this excerpt -- confirm against the full source before editing.
 */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
78 fd->fd_write_failed = false;
/* Return a ll_file_data to the slab it was allocated from. */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags)
 * and the open handle @fh into @op_data for an MDS request; request the
 * MDS_DATA_MODIFIED bias when the inode carries LLIF_DATA_MODIFIED.
 * NOTE(review): some lines are elided in this excerpt.
 */
89 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
90 struct lustre_handle *fh)
92 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
98 op_data->op_attr_blocks = inode->i_blocks;
99 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
101 op_data->op_handle = *fh;
/* Tell the MDS data was modified under this open so it can act on close. */
103 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
104 op_data->op_bias |= MDS_DATA_MODIFIED;
108 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Fill @op_data for the CLOSE RPC: mark mode/time attributes valid, add
 * size/blocks only for write opens (read opens must not shrink the file),
 * then pack the inode attributes and open handle via ll_pack_inode2opdata().
 * NOTE(review): the braces/early-return around the FMODE_WRITE test are
 * elided in this excerpt.
 */
110 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
111 struct obd_client_handle *och)
115 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
116 ATTR_MTIME | ATTR_MTIME_SET |
117 ATTR_CTIME | ATTR_CTIME_SET;
119 if (!(och->och_flags & FMODE_WRITE))
122 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
125 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
126 ll_prep_md_op_data(op_data, inode, NULL, NULL,
127 0, 0, LUSTRE_OPC_ANY, NULL);
132 * Perform a close, possibly with a bias.
133 * The meaning of "data" depends on the value of "bias".
135 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
136 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/*
 * Send the MDS close for open handle @och, optionally biased
 * (MDS_HSM_RELEASE: @data is the data version; MDS_CLOSE_LAYOUT_SWAP:
 * @data is the peer inode).  Clears replay data, poisons the handle
 * cookie, and releases the close request.
 * NOTE(review): interior lines (switch header, error branches, frees,
 * RETURN) are elided in this excerpt -- do not infer control flow from
 * what is visible here alone.
 */
139 static int ll_close_inode_openhandle(struct obd_export *md_exp,
140 struct obd_client_handle *och,
142 enum mds_op_bias bias,
145 struct obd_export *exp = ll_i2mdexp(inode);
146 struct md_op_data *op_data;
147 struct ptlrpc_request *req = NULL;
148 struct obd_device *obd = class_exp2obd(exp);
154 * XXX: in case of LMV, is this correct to access
157 CERROR("Invalid MDC connection handle "LPX64"\n",
158 ll_i2mdexp(inode)->exp_handle.h_cookie);
162 OBD_ALLOC_PTR(op_data);
164 /* XXX We leak openhandle and request here. */
165 GOTO(out, rc = -ENOMEM);
167 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: layout swap carries the peer fid ... */
169 case MDS_CLOSE_LAYOUT_SWAP:
170 LASSERT(data != NULL);
171 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
174 op_data->op_fid2 = *ll_inode2fid(data);
/* ... while HSM release carries the data version for the MDT to check. */
177 case MDS_HSM_RELEASE:
178 LASSERT(data != NULL);
179 op_data->op_bias |= MDS_HSM_RELEASE;
180 op_data->op_data_version = *(__u64 *)data;
181 op_data->op_lease_handle = och->och_lease_handle;
182 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
186 LASSERT(data == NULL);
190 rc = md_close(md_exp, op_data, och->och_mod, &req);
192 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
193 ll_i2mdexp(inode)->exp_obd->obd_name,
194 PFID(ll_inode2fid(inode)), rc);
197 /* DATA_MODIFIED flag was successfully sent on close, cancel data
198 * modification flag. */
199 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
200 struct ll_inode_info *lli = ll_i2info(inode);
202 spin_lock(&lli->lli_lock);
203 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
204 spin_unlock(&lli->lli_lock);
/* For biased closes, check the server actually executed the intent. */
208 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
209 struct mdt_body *body;
211 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
212 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
216 ll_finish_md_op_data(op_data);
220 md_clear_open_replay_data(md_exp, och);
221 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
224 if (req) /* This is close request */
225 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the given mode (write/exec/read) for
 * @inode, but only when no other local users of that handle remain
 * (och_usecount > 0 skips the close).
 * NOTE(review): the handle-swap under the mutex and the RETURNs are
 * elided in this excerpt.
 */
229 int ll_md_real_close(struct inode *inode, fmode_t fmode)
231 struct ll_inode_info *lli = ll_i2info(inode);
232 struct obd_client_handle **och_p;
233 struct obd_client_handle *och;
/* Select the per-mode handle slot and its use count. */
238 if (fmode & FMODE_WRITE) {
239 och_p = &lli->lli_mds_write_och;
240 och_usecount = &lli->lli_open_fd_write_count;
241 } else if (fmode & FMODE_EXEC) {
242 och_p = &lli->lli_mds_exec_och;
243 och_usecount = &lli->lli_open_fd_exec_count;
245 LASSERT(fmode & FMODE_READ);
246 och_p = &lli->lli_mds_read_och;
247 och_usecount = &lli->lli_open_fd_read_count;
250 mutex_lock(&lli->lli_och_mutex);
251 if (*och_usecount > 0) {
252 /* There are still users of this handle, so skip
254 mutex_unlock(&lli->lli_och_mutex);
260 mutex_unlock(&lli->lli_och_mutex);
263 /* There might be a race and this handle may already
265 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
266 och, inode, 0, NULL);
/*
 * Per-file-descriptor close: drop group lock and any leftover lease,
 * close fd-private open handle if present, decrement the per-mode open
 * count, and only talk to the MDS (ll_md_real_close) when no cached
 * OPEN ibits lock covers the file.  Finally detach and free the
 * ll_file_data.
 * NOTE(review): several lines (lockmode setup, braces, RETURN) are
 * elided in this excerpt.
 */
272 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
275 ldlm_policy_data_t policy = {
276 .l_inodebits = { MDS_INODELOCK_OPEN },
278 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
279 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
280 struct ll_inode_info *lli = ll_i2info(inode);
281 struct lustre_handle lockh;
286 /* clear group lock, if present */
287 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
288 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
290 if (fd->fd_lease_och != NULL) {
293 /* Usually the lease is not released when the
294 * application crashed, we need to release here. */
295 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
296 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
297 PFID(&lli->lli_fid), rc, lease_broken);
299 fd->fd_lease_och = NULL;
302 if (fd->fd_och != NULL) {
303 rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
309 /* Let's see if we have good enough OPEN lock on the file and if
310 we can skip talking to MDS */
311 mutex_lock(&lli->lli_och_mutex);
312 if (fd->fd_omode & FMODE_WRITE) {
314 LASSERT(lli->lli_open_fd_write_count);
315 lli->lli_open_fd_write_count--;
316 } else if (fd->fd_omode & FMODE_EXEC) {
318 LASSERT(lli->lli_open_fd_exec_count);
319 lli->lli_open_fd_exec_count--;
322 LASSERT(lli->lli_open_fd_read_count);
323 lli->lli_open_fd_read_count--;
325 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock -> must do the real MDS close now. */
327 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
328 LDLM_IBITS, &policy, lockmode, &lockh))
329 rc = ll_md_real_close(inode, fd->fd_omode);
332 LUSTRE_FPRIVATE(file) = NULL;
333 ll_file_data_put(fd);
338 /* While this returns an error code, fput() the caller does not, so we need
339 * to make every effort to clean up all of our state here. Also, applications
340 * rarely check close errors and even if an error is returned they will not
341 * re-try the close call.
/*
 * VFS ->release() for Lustre files: tear down remote-ACL state for the
 * root inode, deauthorize statahead when this fd owned it, short-circuit
 * for the root dentry, flush async write errors for regular files, then
 * perform the MDS close via ll_md_close().
 * NOTE(review): braces, RETURN paths and some statements are elided in
 * this excerpt.
 */
343 int ll_file_release(struct inode *inode, struct file *file)
345 struct ll_file_data *fd;
346 struct ll_sb_info *sbi = ll_i2sbi(inode);
347 struct ll_inode_info *lli = ll_i2info(inode);
351 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
352 PFID(ll_inode2fid(inode)), inode);
354 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
355 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
356 inode == inode->i_sb->s_root->d_inode) {
357 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
360 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
361 fd->fd_flags &= ~LL_FILE_RMTACL;
362 rct_del(&sbi->ll_rct, current_pid());
363 et_search_free(&sbi->ll_et, current_pid());
368 if (inode->i_sb->s_root != file->f_path.dentry)
369 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
370 fd = LUSTRE_FPRIVATE(file);
373 /* The last ref on @file, maybe not the the owner pid of statahead,
374 * because parent and child process can share the same file handle. */
375 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
376 ll_deauthorize_statahead(inode, fd);
/* Root dentry: no MDS close needed, just drop the fd data. */
378 if (inode->i_sb->s_root == file->f_path.dentry) {
379 LUSTRE_FPRIVATE(file) = NULL;
380 ll_file_data_put(fd);
384 if (!S_ISDIR(inode->i_mode)) {
385 if (lli->lli_clob != NULL)
386 lov_read_and_clear_async_rc(lli->lli_clob);
387 lli->lli_async_rc = 0;
390 rc = ll_md_close(sbi->ll_md_exp, inode, file);
392 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
393 libcfs_debug_dumplog();
/*
 * Issue the IT_OPEN intent lock RPC to the MDS for @file.  Packs the
 * dentry name only when the server lacks OBD_CONNECT_OPEN_BY_FID, then
 * instantiates the inode from the reply and attaches lock data.
 * Keeps its own -ESTALE exit path to avoid log flooding.
 * NOTE(review): error-handling branches and RETURNs are elided in this
 * excerpt.
 */
398 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
399 struct lookup_intent *itp)
401 struct dentry *de = file->f_path.dentry;
402 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
403 struct dentry *parent = de->d_parent;
404 const char *name = NULL;
406 struct md_op_data *op_data;
407 struct ptlrpc_request *req = NULL;
411 LASSERT(parent != NULL);
412 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
414 /* if server supports open-by-fid, or file name is invalid, don't pack
415 * name in open request */
416 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
417 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
418 name = de->d_name.name;
419 len = de->d_name.len;
422 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
423 name, len, 0, LUSTRE_OPC_ANY, NULL);
425 RETURN(PTR_ERR(op_data));
426 op_data->op_data = lmm;
427 op_data->op_data_size = lmmsize;
429 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
430 &ll_md_blocking_ast, 0);
431 ll_finish_md_op_data(op_data);
433 /* reason for keep own exit path - don`t flood log
434 * with messages with -ESTALE errors.
/* Server granted an open we will not use: release the handle. */
436 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
437 it_open_error(DISP_OPEN_OPEN, itp))
439 ll_release_openhandle(de, itp);
443 if (it_disposition(itp, DISP_LOOKUP_NEG))
444 GOTO(out, rc = -ENOENT);
446 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
447 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
448 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
452 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
453 if (!rc && itp->d.lustre.it_lock_mode)
454 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
457 ptlrpc_req_finished(req);
458 ll_intent_drop_lock(itp);
/*
 * Populate @och from the MDT reply carried in intent @it (open handle,
 * fid, lease lock cookie, flags) and register it for open replay.
 * Returns the md_set_open_replay_data() result.
 */
463 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
464 struct obd_client_handle *och)
466 struct ptlrpc_request *req = it->d.lustre.it_data;
467 struct mdt_body *body;
469 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
470 och->och_fh = body->mbo_handle;
471 och->och_fid = body->mbo_fid1;
472 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
473 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
474 och->och_flags = it->it_flags;
476 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish a local open: optionally fill @och from the intent reply, then
 * attach @fd to the file (private data), initialize readahead and the
 * ll_cl_context bookkeeping, and record the open mode.
 * NOTE(review): the och != NULL guard and RETURNs are elided in this
 * excerpt.
 */
479 static int ll_local_open(struct file *file, struct lookup_intent *it,
480 struct ll_file_data *fd, struct obd_client_handle *och)
482 struct inode *inode = file->f_path.dentry->d_inode;
485 LASSERT(!LUSTRE_FPRIVATE(file));
492 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
497 LUSTRE_FPRIVATE(file) = fd;
498 ll_readahead_init(inode, &fd->fd_ras);
499 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
501 /* ll_cl_context initialize */
502 rwlock_init(&fd->fd_lock);
503 INIT_LIST_HEAD(&fd->fd_lccs);
508 /* Open a file, and (for the very first open) create objects on the OSTs at
509 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
510 * creation or open until ll_lov_setstripe() ioctl is called.
512 * If we already have the stripe MD locally then we don't request it in
513 * md_open(), by passing a lmm_size = 0.
515 * It is up to the application to ensure no other processes open this file
516 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
517 * used. We might be able to avoid races of that sort by getting lli_open_sem
518 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
519 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() for Lustre: reuse an already-cached MDS open handle of
 * the matching mode when possible; otherwise build an IT_OPEN intent
 * (from f_flags or from the lookup-time intent), send it via
 * ll_intent_file_open(), and complete with ll_local_open().
 * Error paths free the handle slot and fd, and deauthorize statahead.
 * NOTE(review): many interior lines (braces, RETURNs, och bookkeeping)
 * are elided in this excerpt -- control flow here is not complete.
 */
521 int ll_file_open(struct inode *inode, struct file *file)
523 struct ll_inode_info *lli = ll_i2info(inode);
524 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
525 .it_flags = file->f_flags };
526 struct obd_client_handle **och_p = NULL;
527 __u64 *och_usecount = NULL;
528 struct ll_file_data *fd;
532 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
533 PFID(ll_inode2fid(inode)), inode, file->f_flags);
535 it = file->private_data; /* XXX: compat macro */
536 file->private_data = NULL; /* prevent ll_local_open assertion */
538 fd = ll_file_data_get();
540 GOTO(out_openerr, rc = -ENOMEM);
543 if (S_ISDIR(inode->i_mode))
544 ll_authorize_statahead(inode, fd);
546 if (inode->i_sb->s_root == file->f_path.dentry) {
547 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: synthesize one from f_flags. */
551 if (!it || !it->d.lustre.it_disposition) {
552 /* Convert f_flags into access mode. We cannot use file->f_mode,
553 * because everything but O_ACCMODE mask was stripped from
555 if ((oit.it_flags + 1) & O_ACCMODE)
557 if (file->f_flags & O_TRUNC)
558 oit.it_flags |= FMODE_WRITE;
560 /* kernel only call f_op->open in dentry_open. filp_open calls
561 * dentry_open after call to open_namei that checks permissions.
562 * Only nfsd_open call dentry_open directly without checking
563 * permissions and because of that this code below is safe. */
564 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
565 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
567 /* We do not want O_EXCL here, presumably we opened the file
568 * already? XXX - NFS implications? */
569 oit.it_flags &= ~O_EXCL;
571 /* bug20584, if "it_flags" contains O_CREAT, the file will be
572 * created if necessary, then "IT_CREAT" should be set to keep
573 * consistent with it */
574 if (oit.it_flags & O_CREAT)
575 oit.it_op |= IT_CREAT;
581 /* Let's see if we have file open on MDS already. */
582 if (it->it_flags & FMODE_WRITE) {
583 och_p = &lli->lli_mds_write_och;
584 och_usecount = &lli->lli_open_fd_write_count;
585 } else if (it->it_flags & FMODE_EXEC) {
586 och_p = &lli->lli_mds_exec_och;
587 och_usecount = &lli->lli_open_fd_exec_count;
589 och_p = &lli->lli_mds_read_och;
590 och_usecount = &lli->lli_open_fd_read_count;
593 mutex_lock(&lli->lli_och_mutex);
594 if (*och_p) { /* Open handle is present */
595 if (it_disposition(it, DISP_OPEN_OPEN)) {
596 /* Well, there's extra open request that we do not need,
597 let's close it somehow. This will decref request. */
598 rc = it_open_error(DISP_OPEN_OPEN, it);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
604 ll_release_openhandle(file->f_path.dentry, it);
608 rc = ll_local_open(file, it, fd, NULL);
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
615 LASSERT(*och_usecount == 0);
616 if (!it->d.lustre.it_disposition) {
617 /* We cannot just request lock handle now, new ELC code
618 means that one of other OPEN locks for this file
619 could be cancelled, and since blocking ast handler
620 would attempt to grab och_mutex as well, that would
621 result in a deadlock */
622 mutex_unlock(&lli->lli_och_mutex);
624 * Normally called under two situations:
626 * 2. A race/condition on MDS resulting in no open
627 * handle to be returned from LOOKUP|OPEN request,
628 * for example if the target entry was a symlink.
630 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
632 * Always specify MDS_OPEN_BY_FID because we don't want
633 * to get file with different fid.
635 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
636 rc = ll_intent_file_open(file, NULL, 0, it);
638 GOTO(out_openerr, rc);
642 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
644 GOTO(out_och_free, rc = -ENOMEM);
648 /* md_intent_lock() didn't get a request ref if there was an
649 * open error, so don't do cleanup on the request here
651 /* XXX (green): Should not we bail out on any error here, not
652 * just open error? */
653 rc = it_open_error(DISP_OPEN_OPEN, it);
655 GOTO(out_och_free, rc);
657 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
658 "inode %p: disposition %x, status %d\n", inode,
659 it_disposition(it, ~0), it->d.lustre.it_status);
661 rc = ll_local_open(file, it, fd, *och_p);
663 GOTO(out_och_free, rc);
665 mutex_unlock(&lli->lli_och_mutex);
668 /* Must do this outside lli_och_mutex lock to prevent deadlock where
669 different kind of OPEN lock for this same inode gets cancelled
670 by ldlm_cancel_lru */
671 if (!S_ISREG(inode->i_mode))
672 GOTO(out_och_free, rc);
674 cl_lov_delay_create_clear(&file->f_flags);
675 GOTO(out_och_free, rc);
/* Error/cleanup path: free the handle slot and undo fd setup. */
679 if (och_p && *och_p) {
680 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
681 *och_p = NULL; /* OBD_FREE writes some magic there */
684 mutex_unlock(&lli->lli_och_mutex);
687 if (lli->lli_opendir_key == fd)
688 ll_deauthorize_statahead(inode, fd);
690 ll_file_data_put(fd);
692 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
695 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
696 ptlrpc_req_finished(it->d.lustre.it_data);
697 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lock
 * asynchronously; the LDLM_CB_CANCELING arm is truncated in this
 * excerpt.
 */
703 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
704 struct ldlm_lock_desc *desc, void *data, int flag)
707 struct lustre_handle lockh;
711 case LDLM_CB_BLOCKING:
712 ldlm_lock2handle(lock, &lockh);
713 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
715 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
719 case LDLM_CB_CANCELING:
727 * Acquire a lease and open the file.
/*
 * Acquire an open lease (FMODE_READ or FMODE_WRITE only) on @inode.
 * When @file is given, reuse its existing openhandle (sole opener only)
 * so the MDT can match the owner.  Sends IT_OPEN with MDS_OPEN_LEASE
 * and LDLM_FL_NO_LRU|LDLM_FL_EXCL, validates that an OPEN ibits lock
 * came back, and returns the filled och (or ERR_PTR on failure).
 * NOTE(review): allocation of och, several braces and RETURNs are
 * elided in this excerpt.
 */
729 static struct obd_client_handle *
730 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
733 struct lookup_intent it = { .it_op = IT_OPEN };
734 struct ll_sb_info *sbi = ll_i2sbi(inode);
735 struct md_op_data *op_data;
736 struct ptlrpc_request *req = NULL;
737 struct lustre_handle old_handle = { 0 };
738 struct obd_client_handle *och = NULL;
743 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
744 RETURN(ERR_PTR(-EINVAL));
747 struct ll_inode_info *lli = ll_i2info(inode);
748 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
749 struct obd_client_handle **och_p;
752 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
753 RETURN(ERR_PTR(-EPERM));
755 /* Get the openhandle of the file */
757 mutex_lock(&lli->lli_och_mutex);
/* A lease is already held on this fd -- cannot take another. */
758 if (fd->fd_lease_och != NULL) {
759 mutex_unlock(&lli->lli_och_mutex);
763 if (fd->fd_och == NULL) {
764 if (file->f_mode & FMODE_WRITE) {
765 LASSERT(lli->lli_mds_write_och != NULL);
766 och_p = &lli->lli_mds_write_och;
767 och_usecount = &lli->lli_open_fd_write_count;
769 LASSERT(lli->lli_mds_read_och != NULL);
770 och_p = &lli->lli_mds_read_och;
771 och_usecount = &lli->lli_open_fd_read_count;
773 if (*och_usecount == 1) {
780 mutex_unlock(&lli->lli_och_mutex);
781 if (rc < 0) /* more than 1 opener */
784 LASSERT(fd->fd_och != NULL);
785 old_handle = fd->fd_och->och_fh;
790 RETURN(ERR_PTR(-ENOMEM));
792 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
793 LUSTRE_OPC_ANY, NULL);
795 GOTO(out, rc = PTR_ERR(op_data));
797 /* To tell the MDT this openhandle is from the same owner */
798 op_data->op_handle = old_handle;
800 it.it_flags = fmode | open_flags;
801 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
802 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
803 &ll_md_blocking_lease_ast,
804 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
805 * it can be cancelled which may mislead applications that the lease is
807 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
808 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
809 * doesn't deal with openhandle, so normal openhandle will be leaked. */
810 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
811 ll_finish_md_op_data(op_data);
812 ptlrpc_req_finished(req);
814 GOTO(out_release_it, rc);
816 if (it_disposition(&it, DISP_LOOKUP_NEG))
817 GOTO(out_release_it, rc = -ENOENT);
819 rc = it_open_error(DISP_OPEN_OPEN, &it);
821 GOTO(out_release_it, rc);
823 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
824 ll_och_fill(sbi->ll_md_exp, &it, och);
826 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
827 GOTO(out_close, rc = -EOPNOTSUPP);
829 /* already get lease, handle lease lock */
830 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
831 if (it.d.lustre.it_lock_mode == 0 ||
832 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
833 /* open lock must return for lease */
834 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
835 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
836 it.d.lustre.it_lock_bits);
837 GOTO(out_close, rc = -EPROTO);
840 ll_intent_release(&it);
/* Failure cleanup: drop the open lock, close the handle, release it. */
844 /* Cancel open lock */
845 if (it.d.lustre.it_lock_mode != 0) {
846 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
847 it.d.lustre.it_lock_mode);
848 it.d.lustre.it_lock_mode = 0;
849 och->och_lease_handle.cookie = 0ULL;
851 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
853 CERROR("%s: error closing file "DFID": %d\n",
854 ll_get_fsname(inode->i_sb, NULL, 0),
855 PFID(&ll_i2info(inode)->lli_fid), rc2);
856 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
858 ll_intent_release(&it);
866 * Check whether a layout swap can be done between two inodes.
868 * \param[in] inode1 First inode to check
869 * \param[in] inode2 Second inode to check
871 * \retval 0 on success, layout swap can be performed between both inodes
872 * \retval negative error code if requirements are not met
/*
 * Validate that @inode1 and @inode2 may swap layouts: both regular
 * files, caller has write permission on both, and both live on the
 * same superblock.  Error codes for the failing checks are on lines
 * elided from this excerpt.
 */
874 static int ll_check_swap_layouts_validity(struct inode *inode1,
875 struct inode *inode2)
877 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
880 if (inode_permission(inode1, MAY_WRITE) ||
881 inode_permission(inode2, MAY_WRITE))
884 if (inode1->i_sb != inode2->i_sb)
/*
 * Biased close that swaps layouts between @inode and @inode2 on the
 * MDT: validates the pair, rejects identical fids, then issues
 * ll_close_inode_openhandle() with MDS_CLOSE_LAYOUT_SWAP.
 * NOTE(review): the out_free_och cleanup and RETURN are elided in this
 * excerpt.
 */
890 static int ll_swap_layouts_close(struct obd_client_handle *och,
891 struct inode *inode, struct inode *inode2)
893 const struct lu_fid *fid1 = ll_inode2fid(inode);
894 const struct lu_fid *fid2;
898 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
899 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
901 rc = ll_check_swap_layouts_validity(inode, inode2);
903 GOTO(out_free_och, rc);
905 /* We now know that inode2 is a lustre inode */
906 fid2 = ll_inode2fid(inode2);
/* Swapping a file with itself is meaningless -- reject equal fids. */
908 rc = lu_fid_cmp(fid1, fid2);
910 GOTO(out_free_och, rc = -EINVAL);
912 /* Close the file and swap layouts between inode & inode2.
913 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
914 * because we still need it to pack l_remote_handle to MDT. */
915 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
916 MDS_CLOSE_LAYOUT_SWAP, inode2);
918 och = NULL; /* freed in ll_close_inode_openhandle() */
928 * Release lease and close the file.
929 * It will check if the lease has ever broken.
/*
 * Release a lease: determine whether its lock was already cancelled
 * (lease broken), cancel it if not, report brokenness through
 * @lease_broken, and close the open handle on the MDS.
 * NOTE(review): LDLM_LOCK_PUT and some braces are elided in this
 * excerpt.
 */
931 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
934 struct ldlm_lock *lock;
935 bool cancelled = true;
939 lock = ldlm_handle2lock(&och->och_lease_handle);
941 lock_res_and_lock(lock);
942 cancelled = ldlm_is_cancel(lock);
943 unlock_res_and_lock(lock);
947 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
948 PFID(&ll_i2info(inode)->lli_fid), cancelled);
951 ldlm_cli_cancel(&och->och_lease_handle, 0);
952 if (lease_broken != NULL)
953 *lease_broken = cancelled;
955 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
/*
 * Merge MDS-cached timestamps with OST attributes under the inode size
 * lock: take the newest of each timestamp, and adopt the OSTs' size and
 * block count.
 * NOTE(review): declarations of atime/mtime/ctime/rc and RETURN are on
 * lines elided from this excerpt.
 */
961 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
963 struct ll_inode_info *lli = ll_i2info(inode);
964 struct cl_object *obj = lli->lli_clob;
965 struct cl_attr *attr = vvp_env_thread_attr(env);
973 ll_inode_size_lock(inode);
975 /* merge timestamps the most recently obtained from mds with
976 timestamps obtained from osts */
977 LTIME_S(inode->i_atime) = lli->lli_atime;
978 LTIME_S(inode->i_mtime) = lli->lli_mtime;
979 LTIME_S(inode->i_ctime) = lli->lli_ctime;
981 atime = LTIME_S(inode->i_atime);
982 mtime = LTIME_S(inode->i_mtime);
983 ctime = LTIME_S(inode->i_ctime);
985 cl_object_attr_lock(obj);
986 rc = cl_object_attr_get(env, obj, attr);
987 cl_object_attr_unlock(obj);
990 GOTO(out_size_unlock, rc);
/* OST-side values win when newer than what the MDS reported. */
992 if (atime < attr->cat_atime)
993 atime = attr->cat_atime;
995 if (ctime < attr->cat_ctime)
996 ctime = attr->cat_ctime;
998 if (mtime < attr->cat_mtime)
999 mtime = attr->cat_mtime;
1001 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1002 PFID(&lli->lli_fid), attr->cat_size);
1004 i_size_write(inode, attr->cat_size);
1005 inode->i_blocks = attr->cat_blocks;
1007 LTIME_S(inode->i_atime) = atime;
1008 LTIME_S(inode->i_mtime) = mtime;
1009 LTIME_S(inode->i_ctime) = ctime;
1012 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be skipped for @file, mirroring
 * the kernel's file_accessed()/touch_atime() checks (O_NOATIME,
 * S_NOATIME, mount and sb flags, nodiratime for directories).
 * NOTE(review): the return statements between checks are elided in
 * this excerpt.
 */
1017 static bool file_is_noatime(const struct file *file)
1019 const struct vfsmount *mnt = file->f_path.mnt;
1020 const struct inode *inode = file->f_path.dentry->d_inode;
1022 /* Adapted from file_accessed() and touch_atime().*/
1023 if (file->f_flags & O_NOATIME)
1026 if (inode->i_flags & S_NOATIME)
1029 if (IS_NOATIME(inode))
1032 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1035 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1038 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblock/append/sync
 * propagation, lock policy (never for nolock files, mandatory for
 * append, maybe otherwise), and noatime handling.
 * NOTE(review): part of the wr_sync expression is elided in this
 * excerpt.
 */
1044 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1046 struct inode *inode = file->f_path.dentry->d_inode;
1048 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1050 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1051 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1052 file->f_flags & O_DIRECT ||
1055 io->ci_obj = ll_i2info(inode)->lli_clob;
1056 io->ci_lockreq = CILR_MAYBE;
1057 if (ll_file_nolock(file)) {
1058 io->ci_lockreq = CILR_NEVER;
1059 io->ci_no_srvlock = 1;
1060 } else if (file->f_flags & O_APPEND) {
1061 io->ci_lockreq = CILR_MANDATORY;
1064 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up the cl_io, take the per-inode range
 * lock for writes and O_DIRECT reads (LU-6227), run cl_io_loop(),
 * restart short transfers, and account per-mount read/write stats.
 * Returns bytes transferred, or the error code when nothing moved.
 * NOTE(review): loop/restart structure and several braces are elided
 * in this excerpt -- the visible lines do not show the full control
 * flow.
 */
1068 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1069 struct file *file, enum cl_io_type iot,
1070 loff_t *ppos, size_t count)
1072 struct vvp_io *vio = vvp_env_io(env);
1073 struct inode *inode = file->f_path.dentry->d_inode;
1074 struct ll_inode_info *lli = ll_i2info(inode);
1075 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1079 struct range_lock range;
1083 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1084 file->f_path.dentry->d_name.name, iot, *ppos, count);
1087 io = vvp_env_thread_io(env);
1088 ll_io_init(io, file, iot == CIT_WRITE);
1090 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1091 bool range_locked = false;
/* Append writes conflict with everything: lock to EOF. */
1093 if (file->f_flags & O_APPEND)
1094 range_lock_init(&range, 0, LUSTRE_EOF);
1096 range_lock_init(&range, *ppos, *ppos + count - 1);
1098 vio->vui_fd = LUSTRE_FPRIVATE(file);
1099 vio->vui_io_subtype = args->via_io_subtype;
1101 switch (vio->vui_io_subtype) {
1103 vio->vui_iov = args->u.normal.via_iov;
1104 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1105 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1106 vio->vui_iocb = args->u.normal.via_iocb;
1107 /* Direct IO reads must also take range lock,
1108 * or multiple reads will try to work on the same pages
1109 * See LU-6227 for details. */
1110 if (((iot == CIT_WRITE) ||
1111 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1112 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1113 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1115 rc = range_lock(&lli->lli_write_tree, &range);
1119 range_locked = true;
1123 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1124 vio->u.splice.vui_flags = args->u.splice.via_flags;
1127 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1131 ll_cl_add(file, env, io);
1132 rc = cl_io_loop(env, io);
1133 ll_cl_remove(file, env);
1136 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1138 range_unlock(&lli->lli_write_tree, &range);
1141 /* cl_io_rw_init() handled IO */
1145 if (io->ci_nob > 0) {
1146 result += io->ci_nob;
1147 count -= io->ci_nob;
1148 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1150 /* prepare IO restart */
1151 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1152 args->u.normal.via_iov = vio->vui_iov;
1153 args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;
1158 cl_io_fini(env, io);
/* Layout change etc. can require restarting the IO from *ppos. */
1160 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1162 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1163 file->f_path.dentry->d_name.name,
1164 iot == CIT_READ ? "read" : "write",
1165 *ppos, count, result);
1169 if (iot == CIT_READ) {
1171 ll_stats_ops_tally(ll_i2sbi(inode),
1172 LPROC_LL_READ_BYTES, result);
1173 } else if (iot == CIT_WRITE) {
1175 ll_stats_ops_tally(ll_i2sbi(inode),
1176 LPROC_LL_WRITE_BYTES, result);
1177 fd->fd_write_failed = false;
1178 } else if (rc != -ERESTARTSYS) {
1179 fd->fd_write_failed = true;
1183 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1185 return result > 0 ? result : rc;
1189 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute its total byte count, as in the
 * kernel's __generic_file_aio_write_nolock: reject negative lengths or
 * wraparound, and truncate *nr_segs at the first inaccessible segment.
 * NOTE(review): cnt declaration, accumulation and returns are on lines
 * elided from this excerpt.
 */
1191 static int ll_file_get_iov_count(const struct iovec *iov,
1192 unsigned long *nr_segs, size_t *count)
1197 for (seg = 0; seg < *nr_segs; seg++) {
1198 const struct iovec *iv = &iov[seg];
1201 * If any segment has a negative length, or the cumulative
1202 * length ever wraps negative then return -EINVAL.
1205 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1207 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1212 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Async read entry: validate the iovec, copy it to a local array (the
 * env-embedded one for a single segment, a heap copy otherwise -- the
 * IO engine may modify it on restart), then run ll_file_io_generic()
 * with CIT_READ.
 * NOTE(review): the nr_segs==1 branch and RETURN are elided in this
 * excerpt.
 */
1219 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1220 unsigned long nr_segs, loff_t pos)
1223 struct vvp_io_args *args;
1224 struct iovec *local_iov;
1230 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1234 env = cl_env_get(&refcheck);
1236 RETURN(PTR_ERR(env));
1239 local_iov = &ll_env_info(env)->lti_local_iov;
1242 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1243 if (local_iov == NULL) {
1244 cl_env_put(env, &refcheck);
1248 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1251 args = ll_env_args(env, IO_NORMAL);
1252 args->u.normal.via_iov = local_iov;
1253 args->u.normal.via_nrsegs = nr_segs;
1254 args->u.normal.via_iocb = iocb;
1256 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1257 &iocb->ki_pos, count);
1259 cl_env_put(env, &refcheck);
1262 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous read: wrap the user buffer in a single iovec and a sync
 * kiocb, delegate to ll_file_aio_read(), and propagate the updated
 * position back to *ppos.  The ki_left/ki_nbytes ifdefs track kernel
 * kiocb layout changes.
 * NOTE(review): #endif and RETURN are elided in this excerpt.
 */
1267 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1271 struct iovec iov = { .iov_base = buf, .iov_len = count };
1272 struct kiocb *kiocb;
1277 env = cl_env_get(&refcheck);
1279 RETURN(PTR_ERR(env));
1281 kiocb = &ll_env_info(env)->lti_kiocb;
1282 init_sync_kiocb(kiocb, file);
1283 kiocb->ki_pos = *ppos;
1284 #ifdef HAVE_KIOCB_KI_LEFT
1285 kiocb->ki_left = count;
1286 #elif defined(HAVE_KI_NBYTES)
1287 kiocb->ki_nbytes = count;
1290 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1291 *ppos = kiocb->ki_pos;
1293 cl_env_put(env, &refcheck);
1298 * Write to a file (through the page cache).
/*
 * AIO write entry point.  Mirror of ll_file_aio_read(): validates and
 * copies the iovec, then drives the IO through
 * ll_file_io_generic(CIT_WRITE).
 */
1301 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1302 unsigned long nr_segs, loff_t pos)
1305 struct vvp_io_args *args;
1306 struct iovec *local_iov;
1312 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1316 env = cl_env_get(&refcheck);
1318 RETURN(PTR_ERR(env));
/* NOTE(review): the branch choosing the per-thread iovec vs. the
 * OBD_ALLOC'd copy is elided in this excerpt — confirm against the
 * full source. */
1321 local_iov = &ll_env_info(env)->lti_local_iov;
1324 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1325 if (local_iov == NULL) {
1326 cl_env_put(env, &refcheck);
1330 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1333 args = ll_env_args(env, IO_NORMAL);
1334 args->u.normal.via_iov = local_iov;
1335 args->u.normal.via_nrsegs = nr_segs;
1336 args->u.normal.via_iocb = iocb;
1338 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1339 &iocb->ki_pos, count);
1340 cl_env_put(env, &refcheck);
1343 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write(2) entry point.  Wraps the user buffer in a
 * single-segment iovec plus a sync kiocb and reuses the AIO path
 * (ll_file_aio_write); *ppos is updated from the kiocb afterwards.
 */
1348 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1349 size_t count, loff_t *ppos)
1352 struct iovec iov = { .iov_base = (void __user *)buf,
1354 struct kiocb *kiocb;
1359 env = cl_env_get(&refcheck);
1361 RETURN(PTR_ERR(env));
1363 kiocb = &ll_env_info(env)->lti_kiocb;
1364 init_sync_kiocb(kiocb, file);
1365 kiocb->ki_pos = *ppos;
/* Kernel-version compatibility for the kiocb byte-count field. */
1366 #ifdef HAVE_KIOCB_KI_LEFT
1367 kiocb->ki_left = count;
1368 #elif defined(HAVE_KI_NBYTES)
1369 kiocb->ki_nbytes = count;
1372 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1373 *ppos = kiocb->ki_pos;
1375 cl_env_put(env, &refcheck);
1380 * Send file content (through pagecache) somewhere with helper
/*
 * splice(2) read: route pagecache data from in_file into a pipe using
 * the generic cl_io machinery with IO_SPLICE args and CIT_READ.
 */
1382 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1383 struct pipe_inode_info *pipe, size_t count,
1387 struct vvp_io_args *args;
1392 env = cl_env_get(&refcheck);
1394 RETURN(PTR_ERR(env));
1396 args = ll_env_args(env, IO_SPLICE);
1397 args->u.splice.via_pipe = pipe;
1398 args->u.splice.via_flags = flags;
1400 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1401 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it with an MDS open
 * intent carrying the lov_user_md; the transient open handle is
 * released immediately afterwards.  Runs under the inode size lock,
 * and clears the lov delay-create flag on the file on the way out.
 */
1405 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1406 __u64 flags, struct lov_user_md *lum,
1409 struct lookup_intent oit = {
1411 .it_flags = flags | MDS_OPEN_BY_FID,
1416 ll_inode_size_lock(inode);
1417 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1419 GOTO(out_unlock, rc);
1421 ll_release_openhandle(file->f_path.dentry, &oit);
1424 ll_inode_size_unlock(inode);
1425 ll_intent_release(&oit);
1426 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDT via
 * md_getattr_name(), validate the magic, and byte-swap the layout to
 * host endianness so it can be handed to userspace.
 * NOTE(review): the *lmmp assignment is elided in this excerpt, but it
 * appears to point into @request's reply buffer — the caller would then
 * have to keep *request alive while using *lmmp; confirm against the
 * full source.
 */
1431 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1432 struct lov_mds_md **lmmp, int *lmm_size,
1433 struct ptlrpc_request **request)
1435 struct ll_sb_info *sbi = ll_i2sbi(inode);
1436 struct mdt_body *body;
1437 struct lov_mds_md *lmm = NULL;
1438 struct ptlrpc_request *req = NULL;
1439 struct md_op_data *op_data;
1442 rc = ll_get_default_mdsize(sbi, &lmmsize);
1446 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1447 strlen(filename), lmmsize,
1448 LUSTRE_OPC_ANY, NULL);
1449 if (IS_ERR(op_data))
1450 RETURN(PTR_ERR(op_data));
1452 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1453 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1454 ll_finish_md_op_data(op_data);
1456 CDEBUG(D_INFO, "md_getattr_name failed "
1457 "on %s: rc %d\n", filename, rc);
1461 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1462 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1464 lmmsize = body->mbo_eadatasize;
1466 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1468 GOTO(out, rc = -ENODATA);
1471 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1472 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are accepted here. */
1474 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1475 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1476 GOTO(out, rc = -EPROTO);
1480 * This is coming from the MDS, so is probably in
1481 * little endian. We convert it to host endian before
1482 * passing it to userspace.
1484 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1487 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1488 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1491 /* if function was called for a directory - avoid swabbing
1492 * non-existent lsm objects */
1493 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1494 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1495 if (S_ISREG(body->mbo_mode))
1496 lustre_swab_lov_user_md_objects(
1497 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1499 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1500 lustre_swab_lov_user_md_v3(
1501 (struct lov_user_md_v3 *)lmm);
1502 if (S_ISREG(body->mbo_mode))
1503 lustre_swab_lov_user_md_objects(
1504 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1511 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only (CFS_CAP_SYS_ADMIN) path that
 * copies a lov_user_md with one OST object entry from userspace and
 * applies it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS | FMODE_WRITE.
 */
1516 static int ll_lov_setea(struct inode *inode, struct file *file,
1519 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1520 struct lov_user_md *lump;
1521 int lum_size = sizeof(struct lov_user_md) +
1522 sizeof(struct lov_user_ost_data);
1526 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1529 OBD_ALLOC_LARGE(lump, lum_size);
1533 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1534 OBD_FREE_LARGE(lump, lum_size);
1538 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1540 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace @lum buffer.
 * Thin wrapper around cl_object_getstripe() on the inode's cl_object.
 */
1544 static int ll_file_getstripe(struct inode *inode,
1545 struct lov_user_md __user *lum)
1552 env = cl_env_get(&refcheck);
1554 RETURN(PTR_ERR(env));
1556 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1557 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the lov_user_md from userspace,
 * apply it with ll_lov_setstripe_ea_info(), then refresh the layout
 * and write the resulting striping back to the user buffer via
 * ll_file_getstripe().
 */
1561 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1564 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1565 struct lov_user_md *klum;
1567 __u64 flags = FMODE_WRITE;
1570 rc = ll_copy_user_md(lum, &klum);
1575 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* NOTE(review): zeroing the user's stripe count before re-reading the
 * layout; the surrounding condition is elided here — confirm intent. */
1579 put_user(0, &lum->lmm_stripe_count);
1581 ll_layout_refresh(inode, &gen);
1582 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1585 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a group lock (gid = @arg) on the
 * file.  fd_flags/fd_grouplock are protected by lli->lli_lock; the
 * flag is re-checked after cl_get_grouplock() since the spinlock is
 * dropped across that call and another thread may win the race.
 */
1590 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1592 struct ll_inode_info *lli = ll_i2info(inode);
1593 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1594 struct ll_grouplock grouplock;
1599 CWARN("group id for group lock must not be 0\n");
1603 if (ll_file_nolock(file))
1604 RETURN(-EOPNOTSUPP);
1606 spin_lock(&lli->lli_lock);
1607 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1608 CWARN("group lock already existed with gid %lu\n",
1609 fd->fd_grouplock.lg_gid);
1610 spin_unlock(&lli->lli_lock);
1613 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1614 spin_unlock(&lli->lli_lock);
/* Acquire the cl-layer group lock outside the spinlock (may block). */
1616 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1617 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1621 spin_lock(&lli->lli_lock);
1622 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1623 spin_unlock(&lli->lli_lock);
1624 CERROR("another thread just won the race\n");
1625 cl_put_grouplock(&grouplock);
1629 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1630 fd->fd_grouplock = grouplock;
1631 spin_unlock(&lli->lli_lock);
1633 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this
 * file descriptor, verifying that a lock is held and that its gid
 * matches @arg.  State is cleared under lli->lli_lock; the cl-layer
 * lock itself is released after dropping the spinlock.
 */
1637 static int ll_put_grouplock(struct inode *inode, struct file *file,
1640 struct ll_inode_info *lli = ll_i2info(inode);
1641 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1642 struct ll_grouplock grouplock;
1645 spin_lock(&lli->lli_lock);
1646 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1647 spin_unlock(&lli->lli_lock);
1648 CWARN("no group lock held\n");
1652 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1654 if (fd->fd_grouplock.lg_gid != arg) {
1655 CWARN("group lock %lu doesn't match current id %lu\n",
1656 arg, fd->fd_grouplock.lg_gid);
1657 spin_unlock(&lli->lli_lock);
/* Snapshot the lock, clear per-fd state, then release outside the
 * spinlock. */
1661 grouplock = fd->fd_grouplock;
1662 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1663 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1664 spin_unlock(&lli->lli_lock);
1666 cl_put_grouplock(&grouplock);
1667 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1672 * Close inode open handle
1674 * \param dentry [in] dentry which contains the inode
1675 * \param it [in,out] intent which contains open info and result
1678 * \retval <0 failure
/*
 * Close the MDS open handle carried by @it (see the \param block
 * above).  No-op for the filesystem root or when the intent holds no
 * DISP_OPEN_OPEN disposition; otherwise allocates a transient
 * obd_client_handle, fills it from the intent and closes it.
 */
1680 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1682 struct inode *inode = dentry->d_inode;
1683 struct obd_client_handle *och;
1689 /* Root ? Do nothing. */
1690 if (dentry->d_inode->i_sb->s_root == dentry)
1693 /* No open handle to close? Move away */
1694 if (!it_disposition(it, DISP_OPEN_OPEN))
1697 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1699 OBD_ALLOC(och, sizeof(*och));
1701 GOTO(out, rc = -ENOMEM);
1703 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1705 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1706 och, inode, 0, NULL);
1708 /* this one is in place of ll_file_open */
1709 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1710 ptlrpc_req_finished(it->d.lustre.it_data);
1711 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1717 * Get size for inode for which FIEMAP mapping is requested.
1718 * Make the FIEMAP get_info call and returns the result.
1719 * \param fiemap kernel buffer to hold extents
1720 * \param num_bytes kernel buffer size
/*
 * Perform the FIEMAP request described by the header comment above:
 * reject unsupported flags, honour FIEMAP_FLAG_SYNC by flushing dirty
 * pages, glimpse the size when needed, then delegate the extent
 * mapping to cl_object_fiemap().
 */
1722 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1728 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1731 /* Checks for fiemap flags */
1732 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report the unsupported flags back to the caller. */
1733 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1737 /* Check for FIEMAP_FLAG_SYNC */
1738 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1739 rc = filemap_fdatawrite(inode->i_mapping);
1744 env = cl_env_get(&refcheck);
1746 RETURN(PTR_ERR(env));
1748 if (i_size_read(inode) == 0) {
1749 rc = ll_glimpse_size(inode);
1754 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1755 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1756 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1758 /* If filesize is 0, then there would be no objects for mapping */
1759 if (fmkey.lfik_oa.o_size == 0) {
1760 fiemap->fm_mapped_extents = 0;
1764 fmkey.lfik_fiemap = *fiemap;
1766 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1767 &fmkey, fiemap, &num_bytes);
1769 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Requires CFS_CAP_DAC_READ_SEARCH unless the mount allows
 * user fid2path; bounds the user-supplied path length by PATH_MAX
 * before sizing the kernel buffer.
 */
1773 int ll_fid2path(struct inode *inode, void __user *arg)
1775 struct obd_export *exp = ll_i2mdexp(inode);
1776 const struct getinfo_fid2path __user *gfin = arg;
1778 struct getinfo_fid2path *gfout;
1784 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1785 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1788 /* Only need to get the buflen */
1789 if (get_user(pathlen, &gfin->gf_pathlen))
1792 if (pathlen > PATH_MAX)
1795 outsize = sizeof(*gfout) + pathlen;
1796 OBD_ALLOC(gfout, outsize);
1800 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1801 GOTO(gf_free, rc = -EFAULT);
1803 /* Call mdc_iocontrol */
1804 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1808 if (copy_to_user(arg, gfout, outsize))
1812 OBD_FREE(gfout, outsize);
1817 * Read the data_version for inode.
1819 * This value is computed using stripe object version on OST.
1820 * Version is computed using server side locking.
1822 * @param flags if do sync on the OST side;
1824 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1825 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the data version as described in the header comment above:
 * run a CIT_DATA_VERSION cl_io against the file's cl_object and return
 * the version through *data_version.  Restarts the IO when the layout
 * changed underneath (ci_need_restart).
 */
1827 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1829 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1837 /* If no file object initialized, we consider its version is 0. */
1843 env = cl_env_get(&refcheck);
1845 RETURN(PTR_ERR(env));
1847 io = vvp_env_thread_io(env);
1849 io->u.ci_data_version.dv_data_version = 0;
1850 io->u.ci_data_version.dv_flags = flags;
1853 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1854 result = cl_io_loop(env, io);
1856 result = io->ci_result;
1858 *data_version = io->u.ci_data_version.dv_data_version;
1860 cl_io_fini(env, io);
1862 if (unlikely(io->ci_need_restart))
1865 cl_env_put(env, &refcheck);
1871 * Trigger a HSM release request for the provided inode.
/*
 * HSM release (see header comment above): take a write lease, flush and
 * grab the latest data_version, merge attributes, then close the file
 * with MDS_HSM_RELEASE so the MDT can free the OST objects.
 */
1873 int ll_hsm_release(struct inode *inode)
1875 struct cl_env_nest nest;
1877 struct obd_client_handle *och = NULL;
1878 __u64 data_version = 0;
1882 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1883 ll_get_fsname(inode->i_sb, NULL, 0),
1884 PFID(&ll_i2info(inode)->lli_fid));
1886 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1888 GOTO(out, rc = PTR_ERR(och));
1890 /* Grab latest data_version and [am]time values */
1891 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1895 env = cl_env_nested_get(&nest);
1897 GOTO(out, rc = PTR_ERR(env));
1899 ll_merge_attr(env, inode);
1900 cl_env_nested_put(&nest, env);
1902 /* Release the file.
1903 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1904 * we still need it to pack l_remote_handle to MDT. */
1905 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
1906 MDS_HSM_RELEASE, &data_version);
/* Error path: drop the lease if it was successfully taken. */
1911 if (och != NULL && !IS_ERR(och)) /* close the file */
1912 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(): the two inodes involved (plus
 * elided data-version fields), kept in a struct so the pairs can be
 * swapped wholesale to sequentialize lock ordering. */
1917 struct ll_swap_stack {
1920 struct inode *inode1;
1921 struct inode *inode2;
/*
 * Swap the layouts of the files behind @file1 and @file2.  Validates
 * the pair, orders the two inodes by FID to avoid lock inversion,
 * optionally takes group locks to flush dirty cache, re-checks the
 * data versions if requested, and finally sends the swap to the MDT
 * through obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS).
 */
1926 static int ll_swap_layouts(struct file *file1, struct file *file2,
1927 struct lustre_swap_layouts *lsl)
1929 struct mdc_swap_layouts msl;
1930 struct md_op_data *op_data;
1933 struct ll_swap_stack *llss = NULL;
1936 OBD_ALLOC_PTR(llss);
1940 llss->inode1 = file1->f_path.dentry->d_inode;
1941 llss->inode2 = file2->f_path.dentry->d_inode;
1943 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1947 /* we use 2 bool because it is easier to swap than 2 bits */
1948 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1949 llss->check_dv1 = true;
1951 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1952 llss->check_dv2 = true;
1954 /* we cannot use lsl->sl_dvX directly because we may swap them */
1955 llss->dv1 = lsl->sl_dv1;
1956 llss->dv2 = lsl->sl_dv2;
1958 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1959 if (rc == 0) /* same file, done! */
1962 if (rc < 0) { /* sequentialize it */
1963 swap(llss->inode1, llss->inode2);
1965 swap(llss->dv1, llss->dv2);
1966 swap(llss->check_dv1, llss->check_dv2);
1970 if (gid != 0) { /* application asks to flush dirty cache */
1971 rc = ll_get_grouplock(llss->inode1, file1, gid);
1975 rc = ll_get_grouplock(llss->inode2, file2, gid);
1977 ll_put_grouplock(llss->inode1, file1, gid);
1982 /* ultimate check, before swapping the layouts we check if
1983 * dataversion has changed (if requested) */
1984 if (llss->check_dv1) {
1985 rc = ll_data_version(llss->inode1, &dv, 0);
1988 if (dv != llss->dv1)
1989 GOTO(putgl, rc = -EAGAIN);
1992 if (llss->check_dv2) {
1993 rc = ll_data_version(llss->inode2, &dv, 0);
1996 if (dv != llss->dv2)
1997 GOTO(putgl, rc = -EAGAIN);
2000 /* struct md_op_data is used to send the swap args to the mdt
2001 * only flags is missing, so we use struct mdc_swap_layouts
2002 * through the md_op_data->op_data */
2003 /* flags from user space have to be converted before they are send to
2004 * server, no flag is sent today, they are only used on the client */
2007 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2008 0, LUSTRE_OPC_ANY, &msl)
2009 if (IS_ERR(op_data))
2010 GOTO(free, rc = PTR_ERR(op_data));
2012 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2013 sizeof(*op_data), op_data, NULL);
2014 ll_finish_md_op_data(op_data);
/* Unwind: release both group locks in reverse acquisition order. */
2021 ll_put_grouplock(llss->inode2, file2, gid);
2022 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on @inode.  Validates the masks
 * (HSM_FLAGS_MASK; non-root callers are restricted to HSM_USER_MASK)
 * and the archive id, then sends LL_IOC_HSM_STATE_SET to the MDT.
 */
2032 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2034 struct md_op_data *op_data;
2038 /* Detect out-of range masks */
2039 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2042 /* Non-root users are forbidden to set or clear flags which are
2043 * NOT defined in HSM_USER_MASK. */
2044 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2045 !cfs_capable(CFS_CAP_SYS_ADMIN))
2048 /* Detect out-of range archive id */
2049 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2050 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2053 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2054 LUSTRE_OPC_ANY, hss);
2055 if (IS_ERR(op_data))
2056 RETURN(PTR_ERR(op_data));
2058 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2059 sizeof(*op_data), op_data, NULL);
2061 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file ARCHIVED|EXISTS|RELEASED in the given
 * archive, then restore its userspace-supplied attributes (mode, owner,
 * size, [am]times) with ll_setattr_raw() under i_mutex.
 */
2066 static int ll_hsm_import(struct inode *inode, struct file *file,
2067 struct hsm_user_import *hui)
2069 struct hsm_state_set *hss = NULL;
2070 struct iattr *attr = NULL;
2074 if (!S_ISREG(inode->i_mode))
2080 GOTO(out, rc = -ENOMEM);
2082 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2083 hss->hss_archive_id = hui->hui_archive_id;
2084 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2085 rc = ll_hsm_state_set(inode, hss);
2089 OBD_ALLOC_PTR(attr);
2091 GOTO(out, rc = -ENOMEM);
/* Force S_IFREG and strip everything but the permission bits. */
2093 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2094 attr->ia_mode |= S_IFREG;
2095 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2096 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2097 attr->ia_size = hui->hui_size;
2098 attr->ia_mtime.tv_sec = hui->hui_mtime;
2099 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2100 attr->ia_atime.tv_sec = hui->hui_atime;
2101 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2103 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2104 ATTR_UID | ATTR_GID |
2105 ATTR_MTIME | ATTR_MTIME_SET |
2106 ATTR_ATIME | ATTR_ATIME_SET;
2108 mutex_lock(&inode->i_mutex);
2110 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2114 mutex_unlock(&inode->i_mutex);
/* Translate an open fmode into the LL_LEASE_{RD,WR}LCK bitmask used by
 * the lease ioctls. */
2126 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2128 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2129 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files.  Routes each LL_IOC_*/
 /* FSFILT_IOC_*/ /* OBD_IOC_* command to its handler; unrecognized
 * commands fall through to the dynamic ioctl registry
 * (ll_iocontrol_call) and finally to the data export.
 */
2133 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2135 struct inode *inode = file->f_path.dentry->d_inode;
2136 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2140 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2141 PFID(ll_inode2fid(inode)), inode, cmd);
2142 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2144 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2145 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2149 case LL_IOC_GETFLAGS:
2150 /* Get the current value of the file flags */
2151 return put_user(fd->fd_flags, (int __user *)arg);
2152 case LL_IOC_SETFLAGS:
2153 case LL_IOC_CLRFLAGS:
2154 /* Set or clear specific file flags */
2155 /* XXX This probably needs checks to ensure the flags are
2156 * not abused, and to handle any flag side effects.
2158 if (get_user(flags, (int __user *) arg))
2161 if (cmd == LL_IOC_SETFLAGS) {
2162 if ((flags & LL_FILE_IGNORE_LOCK) &&
2163 !(file->f_flags & O_DIRECT)) {
2164 CERROR("%s: unable to disable locking on "
2165 "non-O_DIRECT file\n", current->comm);
2169 fd->fd_flags |= flags;
2171 fd->fd_flags &= ~flags;
2174 case LL_IOC_LOV_SETSTRIPE:
2175 RETURN(ll_lov_setstripe(inode, file, arg));
2176 case LL_IOC_LOV_SETEA:
2177 RETURN(ll_lov_setea(inode, file, arg));
2178 case LL_IOC_LOV_SWAP_LAYOUTS: {
2180 struct lustre_swap_layouts lsl;
2182 if (copy_from_user(&lsl, (char __user *)arg,
2183 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2186 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2189 file2 = fget(lsl.sl_fd);
2193 /* O_WRONLY or O_RDWR */
2194 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2195 GOTO(out, rc = -EPERM);
2197 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2198 struct inode *inode2;
2199 struct ll_inode_info *lli;
2200 struct obd_client_handle *och = NULL;
2202 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2203 GOTO(out, rc = -EINVAL);
2205 lli = ll_i2info(inode);
2206 mutex_lock(&lli->lli_och_mutex);
2207 if (fd->fd_lease_och != NULL) {
2208 och = fd->fd_lease_och;
2209 fd->fd_lease_och = NULL;
2211 mutex_unlock(&lli->lli_och_mutex);
2213 GOTO(out, rc = -ENOLCK);
2214 inode2 = file2->f_path.dentry->d_inode;
2215 rc = ll_swap_layouts_close(och, inode, inode2);
2217 rc = ll_swap_layouts(file, file2, &lsl);
2223 case LL_IOC_LOV_GETSTRIPE:
2224 RETURN(ll_file_getstripe(inode,
2225 (struct lov_user_md __user *)arg));
2226 case FSFILT_IOC_GETFLAGS:
2227 case FSFILT_IOC_SETFLAGS:
2228 RETURN(ll_iocontrol(inode, file, cmd, arg));
2229 case FSFILT_IOC_GETVERSION_OLD:
2230 case FSFILT_IOC_GETVERSION:
2231 RETURN(put_user(inode->i_generation, (int __user *)arg));
2232 case LL_IOC_GROUP_LOCK:
2233 RETURN(ll_get_grouplock(inode, file, arg));
2234 case LL_IOC_GROUP_UNLOCK:
2235 RETURN(ll_put_grouplock(inode, file, arg));
2236 case IOC_OBD_STATFS:
2237 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2239 /* We need to special case any other ioctls we want to handle,
2240 * to send them to the MDS/OST as appropriate and to properly
2241 * network encode the arg field.
2242 case FSFILT_IOC_SETVERSION_OLD:
2243 case FSFILT_IOC_SETVERSION:
2245 case LL_IOC_FLUSHCTX:
2246 RETURN(ll_flush_ctx(inode));
2247 case LL_IOC_PATH2FID: {
2248 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2249 sizeof(struct lu_fid)))
2254 case LL_IOC_GETPARENT:
2255 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2257 case OBD_IOC_FID2PATH:
2258 RETURN(ll_fid2path(inode, (void __user *)arg));
2259 case LL_IOC_DATA_VERSION: {
2260 struct ioc_data_version idv;
2263 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2266 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2267 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2270 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2276 case LL_IOC_GET_MDTIDX: {
2279 mdtidx = ll_get_mdt_idx(inode);
2283 if (put_user((int)mdtidx, (int __user *)arg))
2288 case OBD_IOC_GETDTNAME:
2289 case OBD_IOC_GETMDNAME:
2290 RETURN(ll_get_obd_name(inode, cmd, arg));
2291 case LL_IOC_HSM_STATE_GET: {
2292 struct md_op_data *op_data;
2293 struct hsm_user_state *hus;
2300 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2301 LUSTRE_OPC_ANY, hus);
2302 if (IS_ERR(op_data)) {
2304 RETURN(PTR_ERR(op_data));
2307 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2310 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2313 ll_finish_md_op_data(op_data);
2317 case LL_IOC_HSM_STATE_SET: {
2318 struct hsm_state_set *hss;
2325 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2330 rc = ll_hsm_state_set(inode, hss);
2335 case LL_IOC_HSM_ACTION: {
2336 struct md_op_data *op_data;
2337 struct hsm_current_action *hca;
2344 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2345 LUSTRE_OPC_ANY, hca);
2346 if (IS_ERR(op_data)) {
2348 RETURN(PTR_ERR(op_data));
2351 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2354 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2357 ll_finish_md_op_data(op_data);
2361 case LL_IOC_SET_LEASE: {
2362 struct ll_inode_info *lli = ll_i2info(inode);
2363 struct obd_client_handle *och = NULL;
/* Lease mode must match the file's own open mode. */
2368 case LL_LEASE_WRLCK:
2369 if (!(file->f_mode & FMODE_WRITE))
2371 fmode = FMODE_WRITE;
2373 case LL_LEASE_RDLCK:
2374 if (!(file->f_mode & FMODE_READ))
2378 case LL_LEASE_UNLCK:
2379 mutex_lock(&lli->lli_och_mutex);
2380 if (fd->fd_lease_och != NULL) {
2381 och = fd->fd_lease_och;
2382 fd->fd_lease_och = NULL;
2384 mutex_unlock(&lli->lli_och_mutex);
2389 fmode = och->och_flags;
2390 rc = ll_lease_close(och, inode, &lease_broken);
2397 RETURN(ll_lease_type_from_fmode(fmode));
2402 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2404 /* apply for lease */
2405 och = ll_lease_open(inode, file, fmode, 0);
2407 RETURN(PTR_ERR(och));
2410 mutex_lock(&lli->lli_och_mutex);
2411 if (fd->fd_lease_och == NULL) {
2412 fd->fd_lease_och = och;
2415 mutex_unlock(&lli->lli_och_mutex);
2417 /* impossible now that only excl is supported for now */
2418 ll_lease_close(och, inode, &lease_broken);
2423 case LL_IOC_GET_LEASE: {
2424 struct ll_inode_info *lli = ll_i2info(inode);
2425 struct ldlm_lock *lock = NULL;
2428 mutex_lock(&lli->lli_och_mutex);
2429 if (fd->fd_lease_och != NULL) {
2430 struct obd_client_handle *och = fd->fd_lease_och;
2432 lock = ldlm_handle2lock(&och->och_lease_handle);
2434 lock_res_and_lock(lock);
2435 if (!ldlm_is_cancel(lock))
2436 fmode = och->och_flags;
2438 unlock_res_and_lock(lock);
2439 LDLM_LOCK_PUT(lock);
2442 mutex_unlock(&lli->lli_och_mutex);
2444 RETURN(ll_lease_type_from_fmode(fmode));
2446 case LL_IOC_HSM_IMPORT: {
2447 struct hsm_user_import *hui;
2453 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2458 rc = ll_hsm_import(inode, file, hui);
/* Unknown command: try the dynamically-registered handlers, then
 * forward to the data export. */
2468 ll_iocontrol_call(inode, file, cmd, arg, &err))
2471 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2472 (void __user *)arg));
2477 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat copy of the kernel helper: validate @offset against the
 * file's limits and commit it to f_pos, resetting f_version. */
2478 static inline loff_t
2479 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2481 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2483 if (offset > maxsize)
2486 if (offset != file->f_pos) {
2487 file->f_pos = offset;
2488 file->f_version = 0;
/* Compat copy of the kernel's generic_file_llseek_size() for kernels
 * that lack it; handles SEEK_CUR position queries without rewriting
 * f_pos and treats EOF as a virtual hole for SEEK_HOLE/SEEK_DATA. */
2494 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2495 loff_t maxsize, loff_t eof)
2497 struct inode *inode = file->f_path.dentry->d_inode;
2505 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2506 * position-querying operation. Avoid rewriting the "same"
2507 * f_pos value back to the file because a concurrent read(),
2508 * write() or lseek() might have altered it
2513 * f_lock protects against read/modify/write race with other
2514 * SEEK_CURs. Note that parallel writes and reads behave
2517 mutex_lock(&inode->i_mutex);
2518 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2519 mutex_unlock(&inode->i_mutex);
2523 * In the generic case the entire file is data, so as long as
2524 * offset isn't at the end of the file then the offset is data.
2531 * There is a virtual hole at the end of the file, so as long as
2532 * offset isn't i_size or larger, return i_size.
2540 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point.  SEEK_END/SEEK_HOLE/SEEK_DATA need an up-to-date
 * size, so glimpse it from the OSTs first; then defer to the generic
 * llseek-with-size helper bounded by ll_file_maxbytes().
 */
2544 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2546 struct inode *inode = file->f_path.dentry->d_inode;
2547 loff_t retval, eof = 0;
2550 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2551 (origin == SEEK_CUR) ? file->f_pos : 0);
2552 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2553 PFID(ll_inode2fid(inode)), inode, retval, retval,
2555 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2557 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2558 retval = ll_glimpse_size(inode);
2561 eof = i_size_read(inode);
2564 retval = ll_generic_file_llseek_size(file, offset, origin,
2565 ll_file_maxbytes(inode), eof);
/*
 * flush(2) handler (called on every close of the fd): report any
 * async-writeback errors recorded against this mapping, but only once —
 * if fd_write_failed was already reported, do not report it again.
 */
2569 static int ll_flush(struct file *file, fl_owner_t id)
2571 struct inode *inode = file->f_path.dentry->d_inode;
2572 struct ll_inode_info *lli = ll_i2info(inode);
2573 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2576 LASSERT(!S_ISDIR(inode->i_mode));
2578 /* catch async errors that were recorded back when async writeback
2579 * failed for pages in this mapping. */
2580 rc = lli->lli_async_rc;
2581 lli->lli_async_rc = 0;
2582 if (lli->lli_clob != NULL) {
2583 err = lov_read_and_clear_async_rc(lli->lli_clob);
2588 /* The application has been told write failure already.
2589 * Do not report failure again. */
2590 if (fd->fd_write_failed)
2592 return rc ? -EIO : 0;
2596 * Called to make sure a portion of file has been written out.
2597 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2599 * Return how many pages have been written.
/*
 * Sync [start, end] of @inode per the header comment above: build and
 * run a CIT_FSYNC cl_io with the requested cl_fsync_mode; on success
 * the return value is the number of pages written (fi_nr_written).
 */
2601 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2602 enum cl_fsync_mode mode, int ignore_layout)
2604 struct cl_env_nest nest;
2607 struct cl_fsync_io *fio;
2611 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2612 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2615 env = cl_env_nested_get(&nest);
2617 RETURN(PTR_ERR(env));
2619 io = vvp_env_thread_io(env);
2620 io->ci_obj = ll_i2info(inode)->lli_clob;
2621 io->ci_ignore_layout = ignore_layout;
2623 /* initialize parameters for sync */
2624 fio = &io->u.ci_fsync;
2625 fio->fi_start = start;
2627 fio->fi_fid = ll_inode2fid(inode);
2628 fio->fi_mode = mode;
2629 fio->fi_nr_written = 0;
2631 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2632 result = cl_io_loop(env, io);
2634 result = io->ci_result;
2636 result = fio->fi_nr_written;
2637 cl_io_fini(env, io);
2638 cl_env_nested_put(&nest, env);
2644 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2645 * null and dentry must be used directly rather than pulled from
2646 * *file->f_path.dentry as is done otherwise.
/*
 * fsync(2) handler; signature varies by kernel (see the compat note
 * above).  Waits for outstanding writeback, collects recorded async
 * errors, fsyncs the metadata via md_fsync(), then syncs the data
 * range with CL_FSYNC_ALL and updates fd_write_failed accordingly.
 */
2649 #ifdef HAVE_FILE_FSYNC_4ARGS
2650 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2652 struct dentry *dentry = file->f_path.dentry;
2653 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2654 int ll_fsync(struct file *file, int datasync)
2656 struct dentry *dentry = file->f_path.dentry;
2658 loff_t end = LLONG_MAX;
2660 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2663 loff_t end = LLONG_MAX;
2665 struct inode *inode = dentry->d_inode;
2666 struct ll_inode_info *lli = ll_i2info(inode);
2667 struct ptlrpc_request *req;
2671 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2672 PFID(ll_inode2fid(inode)), inode);
2673 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2675 #ifdef HAVE_FILE_FSYNC_4ARGS
2676 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2677 mutex_lock(&inode->i_mutex);
2679 /* fsync's caller has already called _fdata{sync,write}, we want
2680 * that IO to finish before calling the osc and mdc sync methods */
2681 rc = filemap_fdatawait(inode->i_mapping);
2684 /* catch async errors that were recorded back when async writeback
2685 * failed for pages in this mapping. */
2686 if (!S_ISDIR(inode->i_mode)) {
2687 err = lli->lli_async_rc;
2688 lli->lli_async_rc = 0;
2691 err = lov_read_and_clear_async_rc(lli->lli_clob);
2696 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2700 ptlrpc_req_finished(req);
2702 if (S_ISREG(inode->i_mode)) {
2703 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2705 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2706 if (rc == 0 && err < 0)
/* Track write failure state so ll_flush() reports it only once. */
2709 fd->fd_write_failed = true;
2711 fd->fd_write_failed = false;
2714 #ifdef HAVE_FILE_FSYNC_4ARGS
2715 mutex_unlock(&inode->i_mutex);
2721 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2723 struct inode *inode = file->f_path.dentry->d_inode;
2724 struct ll_sb_info *sbi = ll_i2sbi(inode);
2725 struct ldlm_enqueue_info einfo = {
2726 .ei_type = LDLM_FLOCK,
2727 .ei_cb_cp = ldlm_flock_completion_ast,
2728 .ei_cbdata = file_lock,
2730 struct md_op_data *op_data;
2731 struct lustre_handle lockh = {0};
2732 ldlm_policy_data_t flock = {{0}};
2733 int fl_type = file_lock->fl_type;
2739 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2740 PFID(ll_inode2fid(inode)), file_lock);
2742 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2744 if (file_lock->fl_flags & FL_FLOCK) {
2745 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2746 /* flocks are whole-file locks */
2747 flock.l_flock.end = OFFSET_MAX;
2748 /* For flocks the owner is determined by the local file descriptor */
2749 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2750 } else if (file_lock->fl_flags & FL_POSIX) {
2751 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2752 flock.l_flock.start = file_lock->fl_start;
2753 flock.l_flock.end = file_lock->fl_end;
2757 flock.l_flock.pid = file_lock->fl_pid;
2759 /* Somewhat ugly workaround for svc lockd.
2760 * lockd installs custom fl_lmops->lm_compare_owner that checks
2761 * for the fl_owner to be the same (which it always is on local node
2762 * I guess between lockd processes) and then compares pid.
2763 * As such we assign pid to the owner field to make it all work,
2764 * conflict with normal locks is unlikely since pid space and
2765 * pointer space for current->files are not intersecting */
2766 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2767 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2771 einfo.ei_mode = LCK_PR;
2774 /* An unlock request may or may not have any relation to
2775 * existing locks so we may not be able to pass a lock handle
2776 * via a normal ldlm_lock_cancel() request. The request may even
2777 * unlock a byte range in the middle of an existing lock. In
2778 * order to process an unlock request we need all of the same
2779 * information that is given with a normal read or write record
2780 * lock request. To avoid creating another ldlm unlock (cancel)
2781 * message we'll treat a LCK_NL flock request as an unlock. */
2782 einfo.ei_mode = LCK_NL;
2785 einfo.ei_mode = LCK_PW;
2788 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2803 flags = LDLM_FL_BLOCK_NOWAIT;
2809 flags = LDLM_FL_TEST_LOCK;
2812 CERROR("unknown fcntl lock command: %d\n", cmd);
2816 /* Save the old mode so that if the mode in the lock changes we
2817 * can decrement the appropriate reader or writer refcount. */
2818 file_lock->fl_type = einfo.ei_mode;
2820 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2821 LUSTRE_OPC_ANY, NULL);
2822 if (IS_ERR(op_data))
2823 RETURN(PTR_ERR(op_data));
2825 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2826 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2827 flock.l_flock.pid, flags, einfo.ei_mode,
2828 flock.l_flock.start, flock.l_flock.end);
2830 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2833 /* Restore the file lock type if not TEST lock. */
2834 if (!(flags & LDLM_FL_TEST_LOCK))
2835 file_lock->fl_type = fl_type;
2837 if ((file_lock->fl_flags & FL_FLOCK) &&
2838 (rc == 0 || file_lock->fl_type == F_UNLCK))
2839 rc2 = flock_lock_file_wait(file, file_lock);
2840 if ((file_lock->fl_flags & FL_POSIX) &&
2841 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2842 !(flags & LDLM_FL_TEST_LOCK))
2843 rc2 = posix_lock_file_wait(file, file_lock);
2845 if (rc2 && file_lock->fl_type != F_UNLCK) {
2846 einfo.ei_mode = LCK_NL;
2847 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2852 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name() - resolve the FID of entry \a name under \a parent
 * with a getattr-by-name RPC to the MDS (no dcache involvement).
 *
 * \param parent   directory inode to look up in
 * \param name     entry name (not necessarily NUL-terminated; see namelen)
 * \param namelen  length of \a name
 * \param fid[out] receives the child's FID on success
 *
 * NOTE(review): excerpt -- rc declaration, error checks after the RPC and
 * the final RETURN are elided in this view.
 */
2857 int ll_get_fid_by_name(struct inode *parent, const char *name,
2858 int namelen, struct lu_fid *fid)
2860 struct md_op_data *op_data = NULL;
2861 struct mdt_body *body;
2862 struct ptlrpc_request *req;
2866 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2867 LUSTRE_OPC_ANY, NULL);
2868 if (IS_ERR(op_data))
2869 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
2871 op_data->op_valid = OBD_MD_FLID;
2872 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2873 ll_finish_md_op_data(op_data);
2877 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2879 GOTO(out_req, rc = -EFAULT);
2881 *fid = body->mbo_fid1;
2883 ptlrpc_req_finished(req);
/*
 * ll_migrate() - migrate the directory entry \a name (under \a parent) to
 * MDT \a mdtidx, implemented as a server-side rename with CLI_MIGRATE set.
 *
 * The child's FID is obtained from the dcache if possible (also invalidating
 * its aliases), otherwise via ll_get_fid_by_name().  If the object already
 * lives on the target MDT the call is a no-op returning 0.
 *
 * NOTE(review): excerpt -- qstr/rc declarations, dput/iput cleanup and the
 * final RETURN are elided in this view.
 */
2887 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2888 const char *name, int namelen)
2890 struct dentry *dchild = NULL;
2891 struct inode *child_inode = NULL;
2892 struct md_op_data *op_data;
2893 struct ptlrpc_request *request = NULL;
2898 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2899 name, PFID(ll_inode2fid(parent)), mdtidx);
2901 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2902 0, LUSTRE_OPC_ANY, NULL);
2903 if (IS_ERR(op_data))
2904 RETURN(PTR_ERR(op_data));
2906 /* Get child FID first */
2907 qstr.hash = full_name_hash(name, namelen);
2910 dchild = d_lookup(file->f_path.dentry, &qstr);
2911 if (dchild != NULL) {
2912 if (dchild->d_inode != NULL) {
2913 child_inode = igrab(dchild->d_inode);
2914 if (child_inode != NULL) {
/* hold i_mutex across the migration; dropped in the cleanup path below */
2915 mutex_lock(&child_inode->i_mutex);
2916 op_data->op_fid3 = *ll_inode2fid(child_inode);
2917 ll_invalidate_aliases(child_inode);
/* not in dcache: ask the MDS for the FID by name */
2922 rc = ll_get_fid_by_name(parent, name, namelen,
2928 if (!fid_is_sane(&op_data->op_fid3)) {
2929 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2930 ll_get_fsname(parent->i_sb, NULL, 0), name,
2931 PFID(&op_data->op_fid3));
2932 GOTO(out_free, rc = -EINVAL);
/* short-circuit when the object is already on the requested MDT */
2935 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2940 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2941 PFID(&op_data->op_fid3), mdtidx);
2942 GOTO(out_free, rc = 0);
2945 op_data->op_mds = mdtidx;
2946 op_data->op_cli_flags = CLI_MIGRATE;
/* migration is a rename of the entry onto itself with CLI_MIGRATE */
2947 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2948 namelen, name, namelen, &request);
2950 ll_update_times(request, parent);
2952 ptlrpc_req_finished(request);
2957 if (child_inode != NULL) {
/* the old inode is stale after migration; drop its link count */
2958 clear_nlink(child_inode);
2959 mutex_unlock(&child_inode->i_mutex);
2963 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock() - lock handler for -o noflock mounts; body not visible
 * in this excerpt (per the table comment below it returns ENOSYS).
 */
2968 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2976 * test if some locks matching bits and l_req_mode are acquired
2977 * - bits can be in different locks
2978 * - if found clear the common lock bits in *bits
2979 * - the bits not found, are kept in *bits
2981 * \param bits [IN/OUT] searched lock bits; on return only the unmatched bits remain
2982 * \param l_req_mode [IN] searched lock mode (LCK_MINMODE matches any of CR/CW/PR/PW)
2983 * \retval boolean, true iff all bits are found
2985 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2987 struct lustre_handle lockh;
2988 ldlm_policy_data_t policy;
2989 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2990 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2999 fid = &ll_i2info(inode)->lli_fid;
3000 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3001 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching granted lock, don't take a ref */
3003 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit separately; bits may live in different locks */
3004 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3005 policy.l_inodebits.bits = *bits & (1 << i);
3006 if (policy.l_inodebits.bits == 0)
3009 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3010 &policy, mode, &lockh)) {
3011 struct ldlm_lock *lock;
3013 lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock covers, not just the probed one */
3016 ~(lock->l_policy_data.l_inodebits.bits);
3017 LDLM_LOCK_PUT(lock);
3019 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock() - try to match (and reference) a cached MD lock on
 * \a inode covering \a bits; returns the matched mode or 0 if none.
 * Unlike ll_have_md_lock() this takes a real reference via \a lockh.
 */
3026 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3027 struct lustre_handle *lockh, __u64 flags,
3030 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3035 fid = &ll_i2info(inode)->lli_fid;
3036 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3038 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3039 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini() - post-process a revalidation RPC result:
 * -ENOENT on a non-regular/non-directory inode is tolerated (object was
 * unlinked); other errors are logged and propagated.
 */
3044 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3046 /* Already unlinked. Just update nlink and return success */
3047 if (rc == -ENOENT) {
3049 /* This path cannot be hit for regular files unless in
3050 * case of obscure races, so no need to validate
3052 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3054 } else if (rc != 0) {
/* EACCES/EIDRM are expected (permission/identity), log quietly */
3055 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3056 "%s: revalidate FID "DFID" error: rc = %d\n",
3057 ll_get_fsname(inode->i_sb, NULL, 0),
3058 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate() - refresh MD attributes of \a dentry's inode.
 *
 * Two strategies, depending on server support:
 *  - OBD_CONNECT_ATTRFID: intent getattr/lookup by FID (no name), which
 *    also refreshes dcache state and may invalidate an unlinked dentry;
 *  - otherwise: plain md_getattr, but only if no suitable MD lock is
 *    already cached locally (ll_have_md_lock).
 *
 * \param ibits  MDS inodelock bits being revalidated (UPDATE/LOOKUP/...)
 * NOTE(review): excerpt -- rc/ealen declarations, some error branches and
 * the final RETURN are elided in this view.
 */
3064 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3066 struct inode *inode = dentry->d_inode;
3067 struct ptlrpc_request *req = NULL;
3068 struct obd_export *exp;
3072 LASSERT(inode != NULL);
3074 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3075 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3077 exp = ll_i2mdexp(inode);
3079 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3080 * But under CMD case, it caused some lock issues, should be fixed
3081 * with new CMD ibits lock. See bug 12718 */
3082 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3083 struct lookup_intent oit = { .it_op = IT_GETATTR };
3084 struct md_op_data *op_data;
3086 if (ibits == MDS_INODELOCK_LOOKUP)
3087 oit.it_op = IT_LOOKUP;
3089 /* Call getattr by fid, so do not provide name at all. */
3090 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3091 dentry->d_inode, NULL, 0, 0,
3092 LUSTRE_OPC_ANY, NULL);
3093 if (IS_ERR(op_data))
3094 RETURN(PTR_ERR(op_data));
3096 rc = md_intent_lock(exp, op_data, &oit, &req,
3097 &ll_md_blocking_ast, 0);
3098 ll_finish_md_op_data(op_data);
3100 rc = ll_inode_revalidate_fini(inode, rc);
3104 rc = ll_revalidate_it_finish(req, &oit, dentry);
3106 ll_intent_release(&oit);
3110 /* Unlinked? Unhash dentry, so it is not picked up later by
3111 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3112 here to preserve get_cwd functionality on 2.6.
3114 if (!dentry->d_inode->i_nlink)
3115 d_lustre_invalidate(dentry, 0);
3117 ll_lookup_finish_locks(&oit, dentry);
3118 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3119 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3120 u64 valid = OBD_MD_FLGETATTR;
3121 struct md_op_data *op_data;
/* regular files also need striping EA; size the reply buffer for it */
3124 if (S_ISREG(inode->i_mode)) {
3125 rc = ll_get_default_mdsize(sbi, &ealen);
3128 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3131 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3132 0, ealen, LUSTRE_OPC_ANY,
3134 if (IS_ERR(op_data))
3135 RETURN(PTR_ERR(op_data));
3137 op_data->op_valid = valid;
3138 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3139 ll_finish_md_op_data(op_data);
3141 rc = ll_inode_revalidate_fini(inode, rc);
/* update the in-core inode from the getattr reply */
3145 rc = ll_prep_inode(&inode, req, NULL, NULL);
3148 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr() - for a striped directory, merge the attributes of
 * all stripes (via md_merge_attr) into the master inode: nlink, blocks,
 * size, and cached a/m/ctime in the ll_inode_info.
 */
3152 static int ll_merge_md_attr(struct inode *inode)
3154 struct cl_attr attr = { 0 };
3157 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3158 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3159 &attr, ll_md_blocking_ast);
3163 set_nlink(inode, attr.cat_nlink);
3164 inode->i_blocks = attr.cat_blocks;
3165 i_size_write(inode, attr.cat_size);
/* timestamps are cached in lli_*time; copied to the inode by the caller */
3167 ll_i2info(inode)->lli_atime = attr.cat_atime;
3168 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3169 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate() - revalidate MD attributes and, for regular files,
 * the size (glimpse); for striped directories the per-stripe attributes
 * are merged first.  Timestamps are copied from the lli_* cache into the
 * VFS inode here.
 */
3175 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3177 struct inode *inode = dentry->d_inode;
3181 rc = __ll_inode_revalidate(dentry, ibits);
3185 /* if object isn't regular file, don't validate size */
3186 if (!S_ISREG(inode->i_mode)) {
3187 if (S_ISDIR(inode->i_mode) &&
3188 ll_i2info(inode)->lli_lsm_md != NULL) {
3189 rc = ll_merge_md_attr(inode);
3194 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3195 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3196 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3198 /* In case of restore, the MDT has the right size and has
3199 * already send it back without granting the layout lock,
3200 * inode is up-to-date so glimpse is useless.
3201 * Also to glimpse we need the layout, in case of a running
3202 * restore the MDT holds the layout lock so the glimpse will
3203 * block up to the end of restore (getattr will block)
3205 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3206 rc = ll_glimpse_size(inode);
/*
 * ll_getattr() - VFS ->getattr: revalidate UPDATE|LOOKUP bits, then fill
 * struct kstat from the (now fresh) in-core inode.  The inode number is
 * squashed to 32 bits when the caller needs a 32-bit API.
 * NOTE(review): excerpt -- the res error check and final return are elided.
 */
3211 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3213 struct inode *inode = de->d_inode;
3214 struct ll_sb_info *sbi = ll_i2sbi(inode);
3215 struct ll_inode_info *lli = ll_i2info(inode);
3218 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3219 MDS_INODELOCK_LOOKUP);
3220 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3225 stat->dev = inode->i_sb->s_dev;
3226 if (ll_need_32bit_api(sbi))
3227 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3229 stat->ino = inode->i_ino;
3230 stat->mode = inode->i_mode;
3231 stat->uid = inode->i_uid;
3232 stat->gid = inode->i_gid;
3233 stat->rdev = inode->i_rdev;
3234 stat->atime = inode->i_atime;
3235 stat->mtime = inode->i_mtime;
3236 stat->ctime = inode->i_ctime;
3237 stat->blksize = 1 << inode->i_blkbits;
3239 stat->nlink = inode->i_nlink;
3240 stat->size = i_size_read(inode);
3241 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap() - VFS ->fiemap handler: marshal fiemap_extent_info into a
 * struct fiemap buffer, run ll_do_fiemap(), and copy the mapped extents
 * back to user space.
 * NOTE(review): excerpt -- rc/num_bytes declarations, the allocation-failure
 * branch and the final return are elided in this view.
 */
3246 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3247 __u64 start, __u64 len)
3251 struct fiemap *fiemap;
3252 unsigned int extent_count = fieinfo->fi_extents_max;
/* one buffer holds the header plus all requested extent slots */
3254 num_bytes = sizeof(*fiemap) + (extent_count *
3255 sizeof(struct fiemap_extent));
3256 OBD_ALLOC_LARGE(fiemap, num_bytes);
3261 fiemap->fm_flags = fieinfo->fi_flags;
3262 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3263 fiemap->fm_start = start;
3264 fiemap->fm_length = len;
/* only the first user extent is read in (used as a continuation cookie) */
3265 if (extent_count > 0 &&
3266 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3267 sizeof(struct fiemap_extent)) != 0)
3268 GOTO(out, rc = -EFAULT);
3270 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3272 fieinfo->fi_flags = fiemap->fm_flags;
3273 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3274 if (extent_count > 0 &&
3275 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3276 fiemap->fm_mapped_extents *
3277 sizeof(struct fiemap_extent)) != 0)
3278 GOTO(out, rc = -EFAULT);
3280 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl() - return a referenced copy of the cached POSIX ACL for
 * \a inode (type argument visible but unused in this excerpt); the
 * reference is dropped by the VFS permission-check path.
 */
3284 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3286 struct ll_inode_info *lli = ll_i2info(inode);
3287 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl; dup while holding it */
3290 spin_lock(&lli->lli_lock);
3291 /* VFS' acl_permission_check->check_acl will release the refcount */
3292 acl = posix_acl_dup(lli->lli_posix_acl);
3293 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl() - ACL permission callback for kernels without 2-argument
 * generic_permission(); signature varies with kernel API (4-arg variant
 * takes extra flags).  Without CONFIG_FS_POSIX_ACL it is a stub (body not
 * visible in this excerpt).
 */
3298 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3300 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3301 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3303 ll_check_acl(struct inode *inode, int mask)
3306 # ifdef CONFIG_FS_POSIX_ACL
3307 struct posix_acl *acl;
/* ACL lookup may block; bail out of RCU-walk mode */
3311 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3312 if (flags & IPERM_FLAG_RCU)
3315 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3320 rc = posix_acl_permission(inode, acl, mask);
3321 posix_acl_release(acl);
3324 # else /* !CONFIG_FS_POSIX_ACL */
3326 # endif /* CONFIG_FS_POSIX_ACL */
3328 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission() - VFS ->permission with Lustre extras:
 *  - revalidates the root inode (not validated at lookup time);
 *  - applies root squash by temporarily overriding credentials with
 *    squashed fsuid/fsgid and lowered FS capabilities;
 *  - delegates to remote-perm check or generic permission + ACL.
 * Signature varies with kernel permission API (#ifdef ladder below).
 * NOTE(review): excerpt -- rc declaration, RCU early-return bodies and the
 * final RETURN are elided in this view.
 */
3330 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3331 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3333 # ifdef HAVE_INODE_PERMISION_2ARGS
3334 int ll_inode_permission(struct inode *inode, int mask)
3336 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3341 struct ll_sb_info *sbi;
3342 struct root_squash_info *squash;
3343 struct cred *cred = NULL;
3344 const struct cred *old_cred = NULL;
3346 bool squash_id = false;
/* this handler can block (RPCs); refuse RCU-walk lookups */
3349 #ifdef MAY_NOT_BLOCK
3350 if (mask & MAY_NOT_BLOCK)
3352 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3353 if (flags & IPERM_FLAG_RCU)
3357 /* as root inode are NOT getting validated in lookup operation,
3358 * need to do it before permission check. */
3360 if (inode == inode->i_sb->s_root->d_inode) {
3361 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3362 MDS_INODELOCK_LOOKUP);
3367 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3368 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3370 /* squash fsuid/fsgid if needed */
3371 sbi = ll_i2sbi(inode);
3372 squash = &sbi->ll_squash;
3373 if (unlikely(squash->rsi_uid != 0 &&
3374 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3375 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3379 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3380 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3381 squash->rsi_uid, squash->rsi_gid);
3383 /* update current process's credentials
3384 * and FS capability */
3385 cred = prepare_creds();
3389 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3390 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities while squashed */
3391 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3392 if ((1 << cap) & CFS_CAP_FS_MASK)
3393 cap_lower(cred->cap_effective, cap);
3395 old_cred = override_creds(cred);
3398 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3400 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3401 rc = lustre_check_remote_perm(inode, mask);
3403 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3405 /* restore current process's credentials and FS capability */
3407 revert_creds(old_cred);
/* Three file_operations variants are selected at mount time by the lock
 * option:
 *  - ll_file_operations          (-o localflock: no .flock/.lock entries,
 *                                 so flock is locally consistent only)
 *  - ll_file_operations_flock    (cluster-wide flock via ll_file_flock)
 *  - ll_file_operations_noflock  (flock calls rejected via ll_file_noflock)
 */
3414 /* -o localflock - only provides locally consistent flock locks */
3415 struct file_operations ll_file_operations = {
3416 .read = ll_file_read,
3417 .aio_read = ll_file_aio_read,
3418 .write = ll_file_write,
3419 .aio_write = ll_file_aio_write,
3420 .unlocked_ioctl = ll_file_ioctl,
3421 .open = ll_file_open,
3422 .release = ll_file_release,
3423 .mmap = ll_file_mmap,
3424 .llseek = ll_file_seek,
3425 .splice_read = ll_file_splice_read,
3430 struct file_operations ll_file_operations_flock = {
3431 .read = ll_file_read,
3432 .aio_read = ll_file_aio_read,
3433 .write = ll_file_write,
3434 .aio_write = ll_file_aio_write,
3435 .unlocked_ioctl = ll_file_ioctl,
3436 .open = ll_file_open,
3437 .release = ll_file_release,
3438 .mmap = ll_file_mmap,
3439 .llseek = ll_file_seek,
3440 .splice_read = ll_file_splice_read,
3443 .flock = ll_file_flock,
3444 .lock = ll_file_flock
3447 /* These are for -o noflock - to return ENOSYS on flock calls */
3448 struct file_operations ll_file_operations_noflock = {
3449 .read = ll_file_read,
3450 .aio_read = ll_file_aio_read,
3451 .write = ll_file_write,
3452 .aio_write = ll_file_aio_write,
3453 .unlocked_ioctl = ll_file_ioctl,
3454 .open = ll_file_open,
3455 .release = ll_file_release,
3456 .mmap = ll_file_mmap,
3457 .llseek = ll_file_seek,
3458 .splice_read = ll_file_splice_read,
3461 .flock = ll_file_noflock,
3462 .lock = ll_file_noflock
/* inode_operations for regular Lustre files */
3465 struct inode_operations ll_file_inode_operations = {
3466 .setattr = ll_setattr,
3467 .getattr = ll_getattr,
3468 .permission = ll_inode_permission,
3469 .setxattr = ll_setxattr,
3470 .getxattr = ll_getxattr,
3471 .listxattr = ll_listxattr,
3472 .removexattr = ll_removexattr,
3473 .fiemap = ll_fiemap,
3474 #ifdef HAVE_IOP_GET_ACL
3475 .get_acl = ll_get_acl,
3479 /* dynamic ioctl number support routines */
/* global registry of dynamically registered ioctl handlers, protected by
 * an rwsem: readers iterate in ll_iocontrol_call(), writers (un)register */
3480 static struct llioc_ctl_data {
3481 struct rw_semaphore ioc_sem;
3482 struct list_head ioc_head;
3484 __RWSEM_INITIALIZER(llioc.ioc_sem),
3485 LIST_HEAD_INIT(llioc.ioc_head)
/* one registered handler: callback plus the ioctl numbers it serves
 * (iocd_cmd is a trailing variable-length array; iocd_size is the total
 * allocation size used for freeing) */
3490 struct list_head iocd_list;
3491 unsigned int iocd_size;
3492 llioc_callback_t iocd_cb;
3493 unsigned int iocd_count;
3494 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register() - register a dynamic ioctl handler \a cb for
 * \a count ioctl numbers in \a cmd; returns an opaque cookie (the
 * llioc_data pointer) for ll_iocontrol_unregister(), or NULL on bad
 * arguments / allocation failure (failure returns elided in this excerpt).
 */
3497 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3500 struct llioc_data *in_data = NULL;
3503 if (cb == NULL || cmd == NULL ||
3504 count > LLIOC_MAX_CMD || count < 0)
3507 size = sizeof(*in_data) + count * sizeof(unsigned int);
3508 OBD_ALLOC(in_data, size);
3509 if (in_data == NULL)
3512 memset(in_data, 0, sizeof(*in_data));
3513 in_data->iocd_size = size;
3514 in_data->iocd_cb = cb;
3515 in_data->iocd_count = count;
3516 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3518 down_write(&llioc.ioc_sem);
3519 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3520 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister() - remove and free the handler previously
 * returned by ll_iocontrol_register(); warns if \a magic is not found.
 * NOTE(review): excerpt -- the magic==NULL early return and the match
 * comparison line are elided in this view.
 */
3525 void ll_iocontrol_unregister(void *magic)
3527 struct llioc_data *tmp;
3532 down_write(&llioc.ioc_sem);
3533 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3535 unsigned int size = tmp->iocd_size;
3537 list_del(&tmp->iocd_list);
/* drop the lock before freeing; tmp is already unlinked */
3538 up_write(&llioc.ioc_sem);
3540 OBD_FREE(tmp, size);
3544 up_write(&llioc.ioc_sem);
3546 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3549 EXPORT_SYMBOL(ll_iocontrol_register);
3550 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call() - dispatch ioctl \a cmd to the first registered
 * dynamic handler that claims it; the handler's verdict (LLIOC_CONT /
 * LLIOC_STOP) and its rc (via *rcp, elided here) control whether the
 * caller keeps processing the ioctl itself.
 */
3552 static enum llioc_iter
3553 ll_iocontrol_call(struct inode *inode, struct file *file,
3554 unsigned int cmd, unsigned long arg, int *rcp)
3556 enum llioc_iter ret = LLIOC_CONT;
3557 struct llioc_data *data;
3558 int rc = -EINVAL, i;
3560 down_read(&llioc.ioc_sem);
3561 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3562 for (i = 0; i < data->iocd_count; i++) {
3563 if (cmd != data->iocd_cmd[i])
3566 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* a handler consumed the ioctl: stop iterating */
3570 if (ret == LLIOC_STOP)
3573 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf() - push a layout configuration into the cl_object stack
 * (cl_conf_set).  For OBJECT_CONF_SET the layout lock is only allowed to
 * match after the layout has been applied, and the cached layout
 * generation is updated.
 * NOTE(review): excerpt -- env/rc declarations, the clob NULL check and the
 * final RETURN are elided in this view.
 */
3580 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3582 struct ll_inode_info *lli = ll_i2info(inode);
3583 struct cl_object *obj = lli->lli_clob;
3584 struct cl_env_nest nest;
3592 env = cl_env_nested_get(&nest);
3594 RETURN(PTR_ERR(env));
3596 rc = cl_conf_set(env, lli->lli_clob, conf);
3600 if (conf->coc_opc == OBJECT_CONF_SET) {
3601 struct ldlm_lock *lock = conf->coc_lock;
3602 struct cl_layout cl = {
3606 LASSERT(lock != NULL);
3607 LASSERT(ldlm_has_layout(lock));
3609 /* it can only be allowed to match after layout is
3610 * applied to inode otherwise false layout would be
3611 * seen. Applying layout should happen before dropping
3612 * the intent lock. */
3613 ldlm_lock_allow_match(lock);
3615 rc = cl_object_layout_get(env, obj, &cl);
3620 DFID": layout version change: %u -> %u\n",
3621 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3623 ll_layout_version_set(lli, cl.cl_layout_gen);
3627 cl_env_nested_put(&nest, env);
3632 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch() - populate \a lock's LVB with the file's LOV layout.
 * If the lock was granted immediately the layout already sits in the DLM
 * reply LVB; otherwise it is fetched here via a trusted.lov getxattr and
 * attached to the lock under the resource lock (losing the race to another
 * thread just frees our copy).
 * NOTE(review): excerpt -- lmm/lvbdata/lmmsize/rc declarations and some
 * early-return lines are elided in this view.
 */
3633 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3636 struct ll_sb_info *sbi = ll_i2sbi(inode);
3637 struct ptlrpc_request *req;
3638 struct mdt_body *body;
3645 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3646 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3647 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached: nothing to fetch */
3649 if (lock->l_lvb_data != NULL)
3652 /* if layout lock was granted right away, the layout is returned
3653 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3654 * blocked and then granted via completion ast, we have to fetch
3655 * layout here. Please note that we can't use the LVB buffer in
3656 * completion AST because it doesn't have a large enough buffer */
3657 rc = ll_get_default_mdsize(sbi, &lmmsize);
3659 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3660 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3665 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3667 GOTO(out, rc = -EPROTO);
3669 lmmsize = body->mbo_eadatasize;
3670 if (lmmsize == 0) /* empty layout */
3673 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3675 GOTO(out, rc = -EFAULT);
3677 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3678 if (lvbdata == NULL)
3679 GOTO(out, rc = -ENOMEM);
3681 memcpy(lvbdata, lmm, lmmsize);
/* attach under res lock; another thread may have raced us here */
3682 lock_res_and_lock(lock);
3683 if (unlikely(lock->l_lvb_data == NULL)) {
3684 lock->l_lvb_type = LVB_T_LAYOUT;
3685 lock->l_lvb_data = lvbdata;
3686 lock->l_lvb_len = lmmsize;
3689 unlock_res_and_lock(lock);
/* lost the race: free our copy */
3691 if (lvbdata != NULL)
3692 OBD_FREE_LARGE(lvbdata, lmmsize);
3697 ptlrpc_req_finished(req);
3702 * Apply the layout to the inode. Layout lock is held and will be released
3705 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3706 struct inode *inode)
3708 struct ll_inode_info *lli = ll_i2info(inode);
3709 struct ll_sb_info *sbi = ll_i2sbi(inode);
3710 struct ldlm_lock *lock;
3711 struct cl_object_conf conf;
3714 bool wait_layout = false;
3717 LASSERT(lustre_handle_is_used(lockh));
3719 lock = ldlm_handle2lock(lockh);
3720 LASSERT(lock != NULL);
3721 LASSERT(ldlm_has_layout(lock));
3723 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3724 PFID(&lli->lli_fid), inode);
3726 /* in case this is a caching lock and reinstate with new inode */
3727 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3729 lock_res_and_lock(lock);
3730 lvb_ready = ldlm_is_lvb_ready(lock);
3731 unlock_res_and_lock(lock);
3732 /* checking lvb_ready is racy but this is okay. The worst case is
3733 * that multi processes may configure the file on the same time. */
/* make sure the layout blob is attached to the lock before applying it */
3738 rc = ll_layout_fetch(inode, lock);
3742 /* for layout lock, lmm is stored in lock's lvb.
3743 * lvb_data is immutable if the lock is held so it's safe to access it
3746 * set layout to file. Unlikely this will fail as old layout was
3747 * surely eliminated */
3748 memset(&conf, 0, sizeof conf);
3749 conf.coc_opc = OBJECT_CONF_SET;
3750 conf.coc_inode = inode;
3751 conf.coc_lock = lock;
3752 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
3753 conf.u.coc_layout.lb_len = lock->l_lvb_len;
3754 rc = ll_layout_conf(inode, &conf);
3756 /* refresh layout failed, need to wait */
3757 wait_layout = rc == -EBUSY;
/* release our reference and the caller's lock mode in all cases */
3761 LDLM_LOCK_PUT(lock);
3762 ldlm_lock_decref(lockh, mode);
3764 /* wait for IO to complete if it's still being used. */
3766 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3767 ll_get_fsname(inode->i_sb, NULL, 0),
3768 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO under the old layout drains */
3770 memset(&conf, 0, sizeof conf);
3771 conf.coc_opc = OBJECT_CONF_WAIT;
3772 conf.coc_inode = inode;
3773 rc = ll_layout_conf(inode, &conf);
3777 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3778 ll_get_fsname(inode->i_sb, NULL, 0),
3779 PFID(&lli->lli_fid), rc);
/*
 * ll_layout_refresh_locked() - ensure a layout lock is held and the layout
 * applied, with lli_layout_mutex held by the caller.  First tries to match
 * a cached lock (cheap path); otherwise enqueues an IT_LAYOUT intent lock
 * on the MDS and applies the returned layout via ll_layout_lock_set().
 * NOTE(review): excerpt -- rc/mode declarations, einfo completion and the
 * retry/RETURN logic are elided in this view.
 */
3784 static int ll_layout_refresh_locked(struct inode *inode)
3786 struct ll_inode_info *lli = ll_i2info(inode);
3787 struct ll_sb_info *sbi = ll_i2sbi(inode);
3788 struct md_op_data *op_data;
3789 struct lookup_intent it;
3790 struct lustre_handle lockh;
3792 struct ldlm_enqueue_info einfo = {
3793 .ei_type = LDLM_IBITS,
3795 .ei_cb_bl = &ll_md_blocking_ast,
3796 .ei_cb_cp = &ldlm_completion_ast,
3802 /* mostly layout lock is caching on the local side, so try to match
3803 * it before grabbing layout lock mutex. */
3804 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3805 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3806 if (mode != 0) { /* hit cached lock */
3807 rc = ll_layout_lock_set(&lockh, mode, inode);
3814 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3815 0, 0, LUSTRE_OPC_ANY, NULL);
3816 if (IS_ERR(op_data))
3817 RETURN(PTR_ERR(op_data));
3819 /* have to enqueue one */
3820 memset(&it, 0, sizeof(it));
3821 it.it_op = IT_LAYOUT;
3822 lockh.cookie = 0ULL;
3824 LDLM_DEBUG_NOLOCK("%s: enqueue layout lock for file "DFID"(%p)",
3825 ll_get_fsname(inode->i_sb, NULL, 0),
3826 PFID(&lli->lli_fid), inode);
3828 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the intent reply request is not needed once the lock is obtained */
3829 if (it.d.lustre.it_data != NULL)
3830 ptlrpc_req_finished(it.d.lustre.it_data);
3831 it.d.lustre.it_data = NULL;
3833 ll_finish_md_op_data(op_data);
/* take ownership of the lock mode out of the intent before dropping it */
3835 mode = it.d.lustre.it_lock_mode;
3836 it.d.lustre.it_lock_mode = 0;
3837 ll_intent_drop_lock(&it);
3840 /* set lock data in case this is a new lock */
3841 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3842 rc = ll_layout_lock_set(&lockh, mode, inode);
3851 * This function checks if there exists a LAYOUT lock on the client side,
3852 * or enqueues it if it doesn't have one in cache.
3854 * This function will not hold layout lock so it may be revoked any time after
3855 * this function returns. Any operations depend on layout should be redone
3858 * This function should be called before lov_io_init() to get an uptodate
3859 * layout version, the caller should save the version number and after IO
3860 * is finished, this function should be called again to verify that layout
3861 * is not changed during IO time.
3863 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3865 struct ll_inode_info *lli = ll_i2info(inode);
3866 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* fast path: layout locking disabled, or a generation is already cached */
3870 *gen = ll_layout_version_get(lli);
3871 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3875 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3876 LASSERT(S_ISREG(inode->i_mode));
3878 /* take layout lock mutex to enqueue layout lock exclusively. */
3879 mutex_lock(&lli->lli_layout_mutex);
3881 rc = ll_layout_refresh_locked(inode);
/* re-read the generation updated by the refresh */
3885 *gen = ll_layout_version_get(lli);
3887 mutex_unlock(&lli->lli_layout_mutex);
3893 * This function send a restore request to the MDT
3895 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3897 struct hsm_user_request *hur;
3901 len = sizeof(struct hsm_user_request) +
3902 sizeof(struct hsm_user_item);
3903 OBD_ALLOC(hur, len);
3907 hur->hur_request.hr_action = HUA_RESTORE;
3908 hur->hur_request.hr_archive_id = 0;
3909 hur->hur_request.hr_flags = 0;
3910 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3911 sizeof(hur->hur_user_item[0].hui_fid));
3912 hur->hur_user_item[0].hui_extent.offset = offset;
3913 hur->hur_user_item[0].hui_extent.length = length;
3914 hur->hur_request.hr_itemcount = 1;
3915 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,