4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
/* Forward declarations for helpers defined later in this file. */
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open ll_file_data from its dedicated slab.  GFP_NOFS is
 * used to avoid recursing back into the filesystem during reclaim.
 * NOTE(review): the allocation-failure check and RETURN are elided in this
 * excerpt — confirm against the full source. */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-failure state. */
78 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab; counterpart of ll_file_data_get(). */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
90 * Packs all the attributes into @op_data for the CLOSE rpc.
92 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
93 struct obd_client_handle *och)
/* Initialize op_data for this inode, then copy the current inode
 * attributes (mode, times, size, blocks, flags) so the MDT can
 * update its copy when the file is closed. */
97 ll_prep_md_op_data(op_data, inode, NULL, NULL,
98 0, 0, LUSTRE_OPC_ANY, NULL);
100 op_data->op_attr.ia_mode = inode->i_mode;
101 op_data->op_attr.ia_atime = inode->i_atime;
102 op_data->op_attr.ia_mtime = inode->i_mtime;
103 op_data->op_attr.ia_ctime = inode->i_ctime;
104 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute as valid for the close request. */
105 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
106 ATTR_MTIME | ATTR_MTIME_SET |
107 ATTR_CTIME | ATTR_CTIME_SET;
108 op_data->op_attr_blocks = inode->i_blocks;
109 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* Identify which open handle on the MDT is being closed. */
110 op_data->op_handle = och->och_fh;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct obd_export *md_exp,
130 struct obd_client_handle *och,
132 enum mds_op_bias bias,
135 struct obd_export *exp = ll_i2mdexp(inode);
136 struct md_op_data *op_data;
137 struct ptlrpc_request *req = NULL;
138 struct obd_device *obd = class_exp2obd(exp);
144 * XXX: in case of LMV, is this correct to access
/* Invalid export handle: log and bail (error path partly elided here). */
147 CERROR("Invalid MDC connection handle "LPX64"\n",
148 ll_i2mdexp(inode)->exp_handle.h_cookie);
152 OBD_ALLOC_PTR(op_data);
154 /* XXX We leak openhandle and request here. */
155 GOTO(out, rc = -ENOMEM);
/* Pack current inode attributes into the close request. */
157 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: data's meaning depends on bias (see header). */
159 case MDS_CLOSE_LAYOUT_SWAP:
160 LASSERT(data != NULL);
161 op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
/* data is the peer inode whose layout we swap with. */
164 op_data->op_fid2 = *ll_inode2fid(data);
167 case MDS_HSM_RELEASE:
168 LASSERT(data != NULL);
169 op_data->op_bias |= MDS_HSM_RELEASE;
/* data is a pointer to the data version to release. */
170 op_data->op_data_version = *(__u64 *)data;
171 op_data->op_lease_handle = och->och_lease_handle;
172 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
176 LASSERT(data == NULL);
/* Send the close to the MDT. */
180 rc = md_close(md_exp, op_data, och->och_mod, &req);
182 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
183 ll_i2mdexp(inode)->exp_obd->obd_name,
184 PFID(ll_inode2fid(inode)), rc);
/* For biased closes, check whether the MDT actually executed the
 * intent (release/swap); if not, the server reported it in the body. */
188 op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
189 struct mdt_body *body;
191 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
192 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
196 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the file handle cookie so reuse is
 * detectable. */
200 md_clear_open_replay_data(md_exp, och);
201 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
204 if (req) /* This is close request */
205 ptlrpc_req_finished(req);
/* Close the MDS open handle matching @fmode (write/exec/read) if no other
 * local opens still reference it.  Returns 0 or a negative errno (the
 * RETURN paths are elided in this excerpt). */
209 int ll_md_real_close(struct inode *inode, fmode_t fmode)
211 struct ll_inode_info *lli = ll_i2info(inode);
212 struct obd_client_handle **och_p;
213 struct obd_client_handle *och;
/* Pick the per-mode open handle slot and its use count. */
218 if (fmode & FMODE_WRITE) {
219 och_p = &lli->lli_mds_write_och;
220 och_usecount = &lli->lli_open_fd_write_count;
221 } else if (fmode & FMODE_EXEC) {
222 och_p = &lli->lli_mds_exec_och;
223 och_usecount = &lli->lli_open_fd_exec_count;
225 LASSERT(fmode & FMODE_READ);
226 och_p = &lli->lli_mds_read_och;
227 och_usecount = &lli->lli_open_fd_read_count;
230 mutex_lock(&lli->lli_och_mutex);
231 if (*och_usecount > 0) {
232 /* There are still users of this handle, so skip
234 mutex_unlock(&lli->lli_och_mutex);
240 mutex_unlock(&lli->lli_och_mutex);
243 /* There might be a race and this handle may already
/* No biased close here: bias 0, no data. */
245 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
246 och, inode, 0, NULL);
/* Per-file-descriptor close: release group lock, lease and extra open
 * handle, drop this fd's open-mode refcount, and close the MDS handle
 * unless a matching OPEN lock lets us skip the RPC.  Frees @fd. */
252 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
/* Match only the OPEN inodebit; TEST_LOCK avoids taking a reference. */
255 ldlm_policy_data_t policy = {
256 .l_inodebits = { MDS_INODELOCK_OPEN },
258 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
260 struct ll_inode_info *lli = ll_i2info(inode);
261 struct lustre_handle lockh;
266 /* clear group lock, if present */
267 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
268 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
270 if (fd->fd_lease_och != NULL) {
273 /* Usually the lease is not released when the
274 * application crashed, we need to release here. */
275 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
276 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
277 PFID(&lli->lli_fid), rc, lease_broken);
279 fd->fd_lease_och = NULL;
/* Close the per-fd open handle taken for lease purposes, if any. */
282 if (fd->fd_och != NULL) {
283 rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
289 /* Let's see if we have good enough OPEN lock on the file and if
290 we can skip talking to MDS */
291 mutex_lock(&lli->lli_och_mutex);
292 if (fd->fd_omode & FMODE_WRITE) {
294 LASSERT(lli->lli_open_fd_write_count);
295 lli->lli_open_fd_write_count--;
296 } else if (fd->fd_omode & FMODE_EXEC) {
298 LASSERT(lli->lli_open_fd_exec_count);
299 lli->lli_open_fd_exec_count--;
302 LASSERT(lli->lli_open_fd_read_count);
303 lli->lli_open_fd_read_count--;
305 mutex_unlock(&lli->lli_och_mutex);
/* Without a cached OPEN lock we must do a real close RPC to the MDS. */
307 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
308 LDLM_IBITS, &policy, lockmode, &lockh))
309 rc = ll_md_real_close(inode, fd->fd_omode);
/* Detach and free the per-fd private data. */
312 LUSTRE_FPRIVATE(file) = NULL;
313 ll_file_data_put(fd);
318 /* While this returns an error code, fput() the caller does not, so we need
319 * to make every effort to clean up all of our state here. Also, applications
320 * rarely check close errors and even if an error is returned they will not
321 * re-try the close call.
323 int ll_file_release(struct inode *inode, struct file *file)
325 struct ll_file_data *fd;
326 struct ll_sb_info *sbi = ll_i2sbi(inode);
327 struct ll_inode_info *lli = ll_i2info(inode);
331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
332 PFID(ll_inode2fid(inode)), inode);
334 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only done on the root inode. */
335 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
336 inode == inode->i_sb->s_root->d_inode) {
337 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
340 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
341 fd->fd_flags &= ~LL_FILE_RMTACL;
342 rct_del(&sbi->ll_rct, current_pid());
343 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
348 if (inode->i_sb->s_root != file->f_path.dentry)
349 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
350 fd = LUSTRE_FPRIVATE(file);
353 /* The last ref on @file, maybe not the owner pid of statahead,
354 * because parent and child process can share the same file handle. */
355 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
356 ll_deauthorize_statahead(inode, fd);
/* Root dentry never had an MDS open handle: just free fd and return. */
358 if (inode->i_sb->s_root == file->f_path.dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
/* For regular files, fold any async write errors into lli_async_rc
 * so they can be reported on this close. */
364 if (!S_ISDIR(inode->i_mode)) {
365 if (lli->lli_clob != NULL)
366 lov_read_and_clear_async_rc(lli->lli_clob);
367 lli->lli_async_rc = 0;
370 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Fault-injection hook for log dumping in testing. */
372 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
373 libcfs_debug_dumplog();
/* Send an intent-OPEN request to the MDS for @file, optionally packing
 * striping info (@lmm/@lmmsize), and install the returned lock data on
 * the dentry's inode. */
378 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
379 struct lookup_intent *itp)
381 struct dentry *de = file->f_path.dentry;
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
/* lmm/lmmsize carry optional striping info for the open. */
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keeping own exit path - don't flood log
414 * with messages with -ESTALE errors.
/* If the open actually happened on the MDS, release the now-unwanted
 * open handle before erroring out. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the in-core inode from the reply and attach lock data. */
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->d.lustre.it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
/* Fill an obd_client_handle from the MDS reply carried by @it and
 * register it for open replay. */
443 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
444 struct obd_client_handle *och)
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 och->och_fh = body->mbo_handle;
451 och->och_fid = body->mbo_fid1;
/* Lease handle doubles as the lock handle from the intent. */
452 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_flags = it->it_flags;
/* Record replay data so the open can be replayed after MDS recovery. */
456 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: optionally fill @och from the intent reply, then
 * attach @fd to the file and initialize its readahead/lock state. */
459 static int ll_local_open(struct file *file, struct lookup_intent *it,
460 struct ll_file_data *fd, struct obd_client_handle *och)
462 struct inode *inode = file->f_path.dentry->d_inode;
465 LASSERT(!LUSTRE_FPRIVATE(file));
472 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
477 LUSTRE_FPRIVATE(file) = fd;
478 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for the matching close. */
479 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
481 /* ll_cl_context initialize */
482 rwlock_init(&fd->fd_lock);
483 INIT_LIST_HEAD(&fd->fd_lccs);
488 /* Open a file, and (for the very first open) create objects on the OSTs at
489 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
490 * creation or open until ll_lov_setstripe() ioctl is called.
492 * If we already have the stripe MD locally then we don't request it in
493 * md_open(), by passing a lmm_size = 0.
495 * It is up to the application to ensure no other processes open this file
496 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
497 * used. We might be able to avoid races of that sort by getting lli_open_sem
498 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
499 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
501 int ll_file_open(struct inode *inode, struct file *file)
503 struct ll_inode_info *lli = ll_i2info(inode);
504 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
505 .it_flags = file->f_flags };
506 struct obd_client_handle **och_p = NULL;
507 __u64 *och_usecount = NULL;
508 struct ll_file_data *fd;
512 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
513 PFID(ll_inode2fid(inode)), inode, file->f_flags);
515 it = file->private_data; /* XXX: compat macro */
516 file->private_data = NULL; /* prevent ll_local_open assertion */
518 fd = ll_file_data_get();
520 GOTO(out_openerr, rc = -ENOMEM);
523 if (S_ISDIR(inode->i_mode))
524 ll_authorize_statahead(inode, fd);
/* Root dentry needs no MDS open handle. */
526 if (inode->i_sb->s_root == file->f_path.dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
/* No prior intent (e.g. NFS export path): build one from f_flags. */
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_path.dentry, it);
/* Reuse the existing MDS handle; och == NULL means "shared". */
588 rc = ll_local_open(file, it, fd, NULL);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 mutex_unlock(&lli->lli_och_mutex);
604 * Normally called under two situations:
606 * 2. A race/condition on MDS resulting in no open
607 * handle to be returned from LOOKUP|OPEN request,
608 * for example if the target entry was a symlink.
610 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
612 * Always specify MDS_OPEN_BY_FID because we don't want
613 * to get file with different fid.
615 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
616 rc = ll_intent_file_open(file, NULL, 0, it);
618 GOTO(out_openerr, rc);
622 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
624 GOTO(out_och_free, rc = -ENOMEM);
628 /* md_intent_lock() didn't get a request ref if there was an
629 * open error, so don't do cleanup on the request here
631 /* XXX (green): Should not we bail out on any error here, not
632 * just open error? */
633 rc = it_open_error(DISP_OPEN_OPEN, it);
635 GOTO(out_och_free, rc);
637 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
638 "inode %p: disposition %x, status %d\n", inode,
639 it_disposition(it, ~0), it->d.lustre.it_status);
641 rc = ll_local_open(file, it, fd, *och_p);
643 GOTO(out_och_free, rc);
645 mutex_unlock(&lli->lli_och_mutex);
648 /* Must do this outside lli_och_mutex lock to prevent deadlock where
649 different kind of OPEN lock for this same inode gets cancelled
650 by ldlm_cancel_lru */
651 if (!S_ISREG(inode->i_mode))
652 GOTO(out_och_free, rc);
654 cl_lov_delay_create_clear(&file->f_flags);
655 GOTO(out_och_free, rc);
/* Error/common exit: free the handle slot, undo statahead, drop fd. */
659 if (och_p && *och_p) {
660 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
661 *och_p = NULL; /* OBD_FREE writes some magic there */
664 mutex_unlock(&lli->lli_och_mutex);
667 if (lli->lli_opendir_key == fd)
668 ll_deauthorize_statahead(inode, fd);
670 ll_file_data_put(fd);
672 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra intent request reference taken for the open. */
675 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
676 ptlrpc_req_finished(it->d.lustre.it_data);
677 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: cancel the lock asynchronously when a
 * conflicting request arrives.  The CANCELING branch body is elided in
 * this excerpt. */
683 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
684 struct ldlm_lock_desc *desc, void *data, int flag)
687 struct lustre_handle lockh;
691 case LDLM_CB_BLOCKING:
692 ldlm_lock2handle(lock, &lockh);
693 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
695 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
699 case LDLM_CB_CANCELING:
707 * Acquire a lease and open the file.
709 static struct obd_client_handle *
710 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
713 struct lookup_intent it = { .it_op = IT_OPEN };
714 struct ll_sb_info *sbi = ll_i2sbi(inode);
715 struct md_op_data *op_data;
716 struct ptlrpc_request *req = NULL;
717 struct lustre_handle old_handle = { 0 };
718 struct obd_client_handle *och = NULL;
/* Leases are only meaningful for pure read or pure write opens. */
723 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
724 RETURN(ERR_PTR(-EINVAL));
727 struct ll_inode_info *lli = ll_i2info(inode);
728 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
729 struct obd_client_handle **och_p;
/* The requested lease mode must match how the file was opened. */
732 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
733 RETURN(ERR_PTR(-EPERM));
735 /* Get the openhandle of the file */
737 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
738 if (fd->fd_lease_och != NULL) {
739 mutex_unlock(&lli->lli_och_mutex);
743 if (fd->fd_och == NULL) {
744 if (file->f_mode & FMODE_WRITE) {
745 LASSERT(lli->lli_mds_write_och != NULL);
746 och_p = &lli->lli_mds_write_och;
747 och_usecount = &lli->lli_open_fd_write_count;
749 LASSERT(lli->lli_mds_read_och != NULL);
750 och_p = &lli->lli_mds_read_och;
751 och_usecount = &lli->lli_open_fd_read_count;
/* A lease can only be taken over when this fd is the sole opener. */
753 if (*och_usecount == 1) {
760 mutex_unlock(&lli->lli_och_mutex);
761 if (rc < 0) /* more than 1 opener */
764 LASSERT(fd->fd_och != NULL);
765 old_handle = fd->fd_och->och_fh;
770 RETURN(ERR_PTR(-ENOMEM));
772 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
773 LUSTRE_OPC_ANY, NULL);
775 GOTO(out, rc = PTR_ERR(op_data));
777 /* To tell the MDT this openhandle is from the same owner */
778 op_data->op_handle = old_handle;
780 it.it_flags = fmode | open_flags;
781 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
782 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
783 &ll_md_blocking_lease_ast,
784 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
785 * it can be cancelled which may mislead applications that the lease is
787 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
788 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
789 * doesn't deal with openhandle, so normal openhandle will be leaked. */
790 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
791 ll_finish_md_op_data(op_data);
792 ptlrpc_req_finished(req);
794 GOTO(out_release_it, rc);
796 if (it_disposition(&it, DISP_LOOKUP_NEG))
797 GOTO(out_release_it, rc = -ENOENT);
799 rc = it_open_error(DISP_OPEN_OPEN, &it);
801 GOTO(out_release_it, rc);
803 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
804 ll_och_fill(sbi->ll_md_exp, &it, och);
806 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
807 GOTO(out_close, rc = -EOPNOTSUPP);
809 /* already get lease, handle lease lock */
810 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
811 if (it.d.lustre.it_lock_mode == 0 ||
812 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
813 /* open lock must return for lease */
814 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
815 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
816 it.d.lustre.it_lock_bits);
817 GOTO(out_close, rc = -EPROTO);
820 ll_intent_release(&it);
/* Error unwind: cancel the open lock and close the handle. */
824 /* Cancel open lock */
825 if (it.d.lustre.it_lock_mode != 0) {
826 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
827 it.d.lustre.it_lock_mode);
828 it.d.lustre.it_lock_mode = 0;
829 och->och_lease_handle.cookie = 0ULL;
831 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
833 CERROR("%s: error closing file "DFID": %d\n",
834 ll_get_fsname(inode->i_sb, NULL, 0),
835 PFID(&ll_i2info(inode)->lli_fid), rc2);
836 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
838 ll_intent_release(&it);
846 * Check whether a layout swap can be done between two inodes.
848 * \param[in] inode1 First inode to check
849 * \param[in] inode2 Second inode to check
851 * \retval 0 on success, layout swap can be performed between both inodes
852 * \retval negative error code if requirements are not met
854 static int ll_check_swap_layouts_validity(struct inode *inode1,
855 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the same
 * filesystem.  (Return statements are elided in this excerpt.) */
857 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
860 if (inode_permission(inode1, MAY_WRITE) ||
861 inode_permission(inode2, MAY_WRITE))
864 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps the
 * layouts of @inode and @inode2 atomically with the close. */
870 static int ll_swap_layouts_close(struct obd_client_handle *och,
871 struct inode *inode, struct inode *inode2)
873 const struct lu_fid *fid1 = ll_inode2fid(inode);
874 const struct lu_fid *fid2;
878 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
879 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
881 rc = ll_check_swap_layouts_validity(inode, inode2);
883 GOTO(out_free_och, rc);
885 /* We now know that inode2 is a lustre inode */
886 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself is meaningless. */
888 rc = lu_fid_cmp(fid1, fid2);
890 GOTO(out_free_och, rc = -EINVAL);
892 /* Close the file and swap layouts between inode & inode2.
893 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
894 * because we still need it to pack l_remote_handle to MDT. */
895 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
896 MDS_CLOSE_LAYOUT_SWAP, inode2);
898 och = NULL; /* freed in ll_close_inode_openhandle() */
908 * Release lease and close the file.
909 * It will check if the lease has ever broken.
911 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
914 struct ldlm_lock *lock;
915 bool cancelled = true;
/* Inspect the lease lock's state to learn whether it was already
 * broken (cancelled) by a conflicting access. */
919 lock = ldlm_handle2lock(&och->och_lease_handle);
921 lock_res_and_lock(lock);
922 cancelled = ldlm_is_cancel(lock);
923 unlock_res_and_lock(lock);
927 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
928 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If still intact, cancel the lease lock ourselves before closing. */
931 ldlm_cli_cancel(&och->och_lease_handle, 0);
932 if (lease_broken != NULL)
933 *lease_broken = cancelled;
935 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
/* Merge MDS-provided timestamps (cached in lli) with OST object
 * attributes (size, blocks, times) into the in-core inode, keeping the
 * most recent timestamp of each kind.  Serialized by the inode size lock. */
941 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
943 struct ll_inode_info *lli = ll_i2info(inode);
944 struct cl_object *obj = lli->lli_clob;
945 struct cl_attr *attr = vvp_env_thread_attr(env);
953 ll_inode_size_lock(inode);
955 /* merge timestamps the most recently obtained from mds with
956 timestamps obtained from osts */
957 LTIME_S(inode->i_atime) = lli->lli_atime;
958 LTIME_S(inode->i_mtime) = lli->lli_mtime;
959 LTIME_S(inode->i_ctime) = lli->lli_ctime;
961 atime = LTIME_S(inode->i_atime);
962 mtime = LTIME_S(inode->i_mtime);
963 ctime = LTIME_S(inode->i_ctime);
/* Fetch the aggregated OST attributes under the cl_object attr lock. */
965 cl_object_attr_lock(obj);
966 rc = cl_object_attr_get(env, obj, attr);
967 cl_object_attr_unlock(obj);
970 GOTO(out_size_unlock, rc);
/* Keep the newer of MDS vs OST timestamps for each field. */
972 if (atime < attr->cat_atime)
973 atime = attr->cat_atime;
975 if (ctime < attr->cat_ctime)
976 ctime = attr->cat_ctime;
978 if (mtime < attr->cat_mtime)
979 mtime = attr->cat_mtime;
981 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
982 PFID(&lli->lli_fid), attr->cat_size);
984 i_size_write(inode, attr->cat_size);
985 inode->i_blocks = attr->cat_blocks;
987 LTIME_S(inode->i_atime) = atime;
988 LTIME_S(inode->i_mtime) = mtime;
989 LTIME_S(inode->i_ctime) = ctime;
992 ll_inode_size_unlock(inode);
/* Decide whether atime updates should be skipped for @file, mirroring
 * the kernel's own checks in file_accessed()/touch_atime(): per-open
 * flag, per-inode flags, and per-mount/per-sb options.  (The "return
 * true/false" lines are elided in this excerpt.) */
997 static bool file_is_noatime(const struct file *file)
999 const struct vfsmount *mnt = file->f_path.mnt;
1000 const struct inode *inode = file->f_path.dentry->d_inode;
1002 /* Adapted from file_accessed() and touch_atime().*/
1003 if (file->f_flags & O_NOATIME)
1006 if (inode->i_flags & S_NOATIME)
1009 if (IS_NOATIME(inode))
1012 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1015 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1018 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behavior, lock-request policy, and noatime handling. */
1024 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1026 struct inode *inode = file->f_path.dentry->d_inode;
1028 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1030 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1031 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1032 file->f_flags & O_DIRECT ||
1035 io->ci_obj = ll_i2info(inode)->lli_clob;
/* Default: let the cl_io layer decide whether a lock is needed. */
1036 io->ci_lockreq = CILR_MAYBE;
1037 if (ll_file_nolock(file)) {
1038 io->ci_lockreq = CILR_NEVER;
1039 io->ci_no_srvlock = 1;
1040 } else if (file->f_flags & O_APPEND) {
/* Appends must be serialized: always take the lock. */
1041 io->ci_lockreq = CILR_MANDATORY;
1044 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine: set up a cl_io for @iot, take the per-file
 * range lock where needed (writes, and O_DIRECT reads — see LU-6227),
 * run the IO loop, handle restarts, and tally statistics. */
1048 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1049 struct file *file, enum cl_io_type iot,
1050 loff_t *ppos, size_t count)
1052 struct vvp_io *vio = vvp_env_io(env);
1053 struct inode *inode = file->f_path.dentry->d_inode;
1054 struct ll_inode_info *lli = ll_i2info(inode);
1055 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1059 struct range_lock range;
1063 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1064 file->f_path.dentry->d_name.name, iot, *ppos, count);
1067 io = vvp_env_thread_io(env);
1068 ll_io_init(io, file, iot == CIT_WRITE);
1070 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1071 bool range_locked = false;
/* O_APPEND position is decided late, so lock to EOF. */
1073 if (file->f_flags & O_APPEND)
1074 range_lock_init(&range, 0, LUSTRE_EOF);
1076 range_lock_init(&range, *ppos, *ppos + count - 1);
1078 vio->vui_fd = LUSTRE_FPRIVATE(file);
1079 vio->vui_io_subtype = args->via_io_subtype;
1081 switch (vio->vui_io_subtype) {
1083 vio->vui_iov = args->u.normal.via_iov;
1084 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1085 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1086 vio->vui_iocb = args->u.normal.via_iocb;
1087 /* Direct IO reads must also take range lock,
1088 * or multiple reads will try to work on the same pages
1089 * See LU-6227 for details. */
1090 if (((iot == CIT_WRITE) ||
1091 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1092 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1093 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1095 rc = range_lock(&lli->lli_write_tree, &range);
1099 range_locked = true;
/* Splice subtype: pass the pipe and flags straight through. */
1103 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1104 vio->u.splice.vui_flags = args->u.splice.via_flags;
1107 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* Publish the IO context so page-level code can find it, run it,
 * then remove the context again. */
1111 ll_cl_add(file, env, io);
1112 rc = cl_io_loop(env, io);
1113 ll_cl_remove(file, env);
1116 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1118 range_unlock(&lli->lli_write_tree, &range);
1121 /* cl_io_rw_init() handled IO */
1125 if (io->ci_nob > 0) {
/* Account partial progress and advance the file position. */
1126 result += io->ci_nob;
1127 count -= io->ci_nob;
1128 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1130 /* prepare IO restart */
1131 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1132 args->u.normal.via_iov = vio->vui_iov;
1133 args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;
1138 cl_io_fini(env, io);
/* Layout changes can require the whole IO to be retried. */
1140 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1142 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1143 file->f_path.dentry->d_name.name,
1144 iot == CIT_READ ? "read" : "write",
1145 *ppos, count, result);
1149 if (iot == CIT_READ) {
1151 ll_stats_ops_tally(ll_i2sbi(inode),
1152 LPROC_LL_READ_BYTES, result);
1153 } else if (iot == CIT_WRITE) {
1155 ll_stats_ops_tally(ll_i2sbi(inode),
1156 LPROC_LL_WRITE_BYTES, result);
1157 fd->fd_write_failed = false;
1158 } else if (rc != -ERESTARTSYS) {
/* Remember the failure so fsync/close can report it. */
1159 fd->fd_write_failed = true;
1163 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1165 return result > 0 ? result : rc;
1169 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array and compute the total byte count, truncating
 * nr_segs at the first inaccessible segment (as the kernel original does). */
1171 static int ll_file_get_iov_count(const struct iovec *iov,
1172 unsigned long *nr_segs, size_t *count)
1177 for (seg = 0; seg < *nr_segs; seg++) {
1178 const struct iovec *iv = &iov[seg];
1181 * If any segment has a negative length, or the cumulative
1182 * length ever wraps negative then return -EINVAL.
1185 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1187 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1192 cnt -= iv->iov_len; /* This segment is no good */
/* Vectored async read entry point: validate the iovec, copy it to a
 * local buffer (single-segment fast path uses the per-env iovec), and
 * dispatch to ll_file_io_generic() as CIT_READ. */
1199 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 unsigned long nr_segs, loff_t pos)
1203 struct vvp_io_args *args;
1204 struct iovec *local_iov;
1210 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1214 env = cl_env_get(&refcheck);
1216 RETURN(PTR_ERR(env));
/* Fast path: reuse the per-env iovec for a single segment. */
1219 local_iov = &ll_env_info(env)->lti_local_iov;
1222 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1223 if (local_iov == NULL) {
1224 cl_env_put(env, &refcheck);
/* Copy the caller's iovec: the IO engine may modify it on restart. */
1228 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1231 args = ll_env_args(env, IO_NORMAL);
1232 args->u.normal.via_iov = local_iov;
1233 args->u.normal.via_nrsegs = nr_segs;
1234 args->u.normal.via_iocb = iocb;
1236 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1237 &iocb->ki_pos, count);
1239 cl_env_put(env, &refcheck);
1242 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/* Synchronous read(): wrap the user buffer in a single-segment iovec and
 * a sync kiocb, then delegate to ll_file_aio_read(). */
1247 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1251 struct iovec iov = { .iov_base = buf, .iov_len = count };
1252 struct kiocb *kiocb;
1257 env = cl_env_get(&refcheck);
1259 RETURN(PTR_ERR(env));
1261 kiocb = &ll_env_info(env)->lti_kiocb;
1262 init_sync_kiocb(kiocb, file);
1263 kiocb->ki_pos = *ppos;
/* Kernel-version compatibility: the "remaining bytes" field was renamed. */
1264 #ifdef HAVE_KIOCB_KI_LEFT
1265 kiocb->ki_left = count;
1266 #elif defined(HAVE_KI_NBYTES)
1267 kiocb->ki_nbytes = count;
1270 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
/* Propagate the advanced position back to the caller. */
1271 *ppos = kiocb->ki_pos;
1273 cl_env_put(env, &refcheck);
1278 * Write to a file (through the page cache).
/* Vectored async write entry point; mirrors ll_file_aio_read() but
 * dispatches as CIT_WRITE. */
1281 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos)
1285 struct vvp_io_args *args;
1286 struct iovec *local_iov;
1292 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1296 env = cl_env_get(&refcheck);
1298 RETURN(PTR_ERR(env));
/* Fast path: reuse the per-env iovec for a single segment. */
1301 local_iov = &ll_env_info(env)->lti_local_iov;
1304 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1305 if (local_iov == NULL) {
1306 cl_env_put(env, &refcheck);
1310 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1313 args = ll_env_args(env, IO_NORMAL);
1314 args->u.normal.via_iov = local_iov;
1315 args->u.normal.via_nrsegs = nr_segs;
1316 args->u.normal.via_iocb = iocb;
1318 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1319 &iocb->ki_pos, count);
1320 cl_env_put(env, &refcheck);
1323 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write entry point: single-segment iovec + sync kiocb,
 * delegated to ll_file_aio_write().  NOTE(review): some lines are
 * elided in this extract.
 */
1328 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1329 size_t count, loff_t *ppos)
1332 struct iovec iov = { .iov_base = (void __user *)buf,
1334 struct kiocb *kiocb;
1339 env = cl_env_get(&refcheck);
1341 RETURN(PTR_ERR(env));
1343 kiocb = &ll_env_info(env)->lti_kiocb;
1344 init_sync_kiocb(kiocb, file);
1345 kiocb->ki_pos = *ppos;
/* kernel-version compat: the remaining-bytes field has changed names */
1346 #ifdef HAVE_KIOCB_KI_LEFT
1347 kiocb->ki_left = count;
1348 #elif defined(HAVE_KI_NBYTES)
1349 kiocb->ki_nbytes = count;
1352 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
/* propagate the advanced file position back to the caller */
1353 *ppos = kiocb->ki_pos;
1355 cl_env_put(env, &refcheck);
/*
 * splice_read implementation: feed pagecache content into a pipe via
 * the generic cl_io read path with IO_SPLICE arguments.
 */
1360 * Send file content (through pagecache) somewhere with helper
1362 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1363 struct pipe_inode_info *pipe, size_t count,
1367 struct vvp_io_args *args;
1372 env = cl_env_get(&refcheck);
1374 RETURN(PTR_ERR(env));
1376 args = ll_env_args(env, IO_SPLICE);
1377 args->u.splice.via_pipe = pipe;
1378 args->u.splice.via_flags = flags;
1380 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1381 cl_env_put(env, &refcheck);
/*
 * Set striping info (LOV EA) on a file by re-opening it by FID with the
 * given lum attached to the open intent.  Serializes against size
 * changes via the inode size lock.  NOTE(review): error-path and return
 * lines are elided in this extract.
 */
1385 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1386 __u64 flags, struct lov_user_md *lum,
1389 struct lookup_intent oit = {
1391 .it_flags = flags | MDS_OPEN_BY_FID,
1396 ll_inode_size_lock(inode);
1397 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1399 GOTO(out_unlock, rc);
/* the open handle is only needed to carry the setstripe; close it */
1401 ll_release_openhandle(file->f_path.dentry, &oit);
1404 ll_inode_size_unlock(inode);
1405 ll_intent_release(&oit);
1406 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA for @filename from the MDS via md_getattr_name() and
 * return it (byte-swapped to host endian when needed) in *lmmp, keeping
 * the request pinned via *request so the buffer stays valid.
 * NOTE(review): several lines are elided in this extract.
 */
1411 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1412 struct lov_mds_md **lmmp, int *lmm_size,
1413 struct ptlrpc_request **request)
1415 struct ll_sb_info *sbi = ll_i2sbi(inode);
1416 struct mdt_body *body;
1417 struct lov_mds_md *lmm = NULL;
1418 struct ptlrpc_request *req = NULL;
1419 struct md_op_data *op_data;
1422 rc = ll_get_default_mdsize(sbi, &lmmsize);
1426 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1427 strlen(filename), lmmsize,
1428 LUSTRE_OPC_ANY, NULL);
1429 if (IS_ERR(op_data))
1430 RETURN(PTR_ERR(op_data));
1432 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1433 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1434 ll_finish_md_op_data(op_data);
1436 CDEBUG(D_INFO, "md_getattr_name failed "
1437 "on %s: rc %d\n", filename, rc);
1441 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1442 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1444 lmmsize = body->mbo_eadatasize;
/* no EA present means no striping to report */
1446 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1448 GOTO(out, rc = -ENODATA);
1451 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1452 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are handled here */
1454 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1455 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1456 GOTO(out, rc = -EPROTO);
1460 * This is coming from the MDS, so is probably in
1461 * little endian. We convert it to host endian before
1462 * passing it to userspace.
/* swab only needed on big-endian hosts */
1464 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1467 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1468 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1471 /* if function called for directory - we should
1472 * avoid swab not existent lsm objects */
1473 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1474 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1475 if (S_ISREG(body->mbo_mode))
1476 lustre_swab_lov_user_md_objects(
1477 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1479 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1480 lustre_swab_lov_user_md_v3(
1481 (struct lov_user_md_v3 *)lmm);
1482 if (S_ISREG(body->mbo_mode))
1483 lustre_swab_lov_user_md_objects(
1484 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1491 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only; copies a lov_user_md (with one
 * trailing OST object entry) from userspace and applies it via
 * ll_lov_setstripe_ea_info().  NOTE(review): some lines are elided.
 */
1496 static int ll_lov_setea(struct inode *inode, struct file *file,
1499 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1500 struct lov_user_md *lump;
1501 int lum_size = sizeof(struct lov_user_md) +
1502 sizeof(struct lov_user_ost_data);
/* privileged operation: setting explicit objects requires admin */
1506 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1509 OBD_ALLOC_LARGE(lump, lum_size);
1513 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1514 OBD_FREE_LARGE(lump, lum_size);
1518 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1520 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to userspace via the cl_object
 * layer (cl_object_getstripe fills the user buffer directly).
 */
1524 static int ll_file_getstripe(struct inode *inode,
1525 struct lov_user_md __user *lum)
1532 env = cl_env_get(&refcheck);
1534 RETURN(PTR_ERR(env));
1536 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1537 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, apply the
 * new striping, then refresh the layout and echo the resulting stripe
 * info back to the caller.  NOTE(review): some lines are elided.
 */
1541 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1544 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1545 struct lov_user_md *klum;
1547 __u64 flags = FMODE_WRITE;
1550 rc = ll_copy_user_md(lum, &klum);
1555 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* zero the user's stripe_count so getstripe returns full info below */
1559 put_user(0, &lum->lmm_stripe_count);
1561 ll_layout_refresh(inode, &gen);
1562 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1565 OBD_FREE(klum, lum_size);
/*
 * Take a group lock (gid in @arg) on the file.  At most one group lock
 * per file descriptor; lli_lock guards the fd_flags/fd_grouplock pair,
 * and the lock is acquired outside the spinlock, so a lost race is
 * detected and resolved afterwards.  NOTE(review): some lines elided.
 */
1570 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1572 struct ll_inode_info *lli = ll_i2info(inode);
1573 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1574 struct ll_grouplock grouplock;
1579 CWARN("group id for group lock must not be 0\n");
1583 if (ll_file_nolock(file))
1584 RETURN(-EOPNOTSUPP);
1586 spin_lock(&lli->lli_lock);
1587 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1588 CWARN("group lock already existed with gid %lu\n",
1589 fd->fd_grouplock.lg_gid);
1590 spin_unlock(&lli->lli_lock);
1593 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1594 spin_unlock(&lli->lli_lock);
/* may block; therefore taken with lli_lock dropped */
1596 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1597 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1601 spin_lock(&lli->lli_lock);
1602 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1603 spin_unlock(&lli->lli_lock);
1604 CERROR("another thread just won the race\n");
1605 cl_put_grouplock(&grouplock);
1609 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1610 fd->fd_grouplock = grouplock;
1611 spin_unlock(&lli->lli_lock);
1613 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock held on this file descriptor; the gid in @arg
 * must match the one it was taken with.  The fd state is cleared under
 * lli_lock; the actual lock release happens after the spinlock is
 * dropped.  NOTE(review): some lines are elided in this extract.
 */
1617 static int ll_put_grouplock(struct inode *inode, struct file *file,
1620 struct ll_inode_info *lli = ll_i2info(inode);
1621 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1622 struct ll_grouplock grouplock;
1625 spin_lock(&lli->lli_lock);
1626 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1627 spin_unlock(&lli->lli_lock);
1628 CWARN("no group lock held\n");
1632 LASSERT(fd->fd_grouplock.lg_lock != NULL);
1634 if (fd->fd_grouplock.lg_gid != arg) {
1635 CWARN("group lock %lu doesn't match current id %lu\n",
1636 arg, fd->fd_grouplock.lg_gid);
1637 spin_unlock(&lli->lli_lock);
/* detach from the fd first, then release outside the spinlock */
1641 grouplock = fd->fd_grouplock;
1642 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1643 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1644 spin_unlock(&lli->lli_lock);
1646 cl_put_grouplock(&grouplock);
1647 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent (e.g. after a
 * setstripe-by-open), dropping the enqueue open reference if one is
 * held.  NOTE(review): some lines are elided in this extract.
 */
1652 * Close inode open handle
1654 * \param dentry [in] dentry which contains the inode
1655 * \param it [in,out] intent which contains open info and result
1658 * \retval <0 failure
1660 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1662 struct inode *inode = dentry->d_inode;
1663 struct obd_client_handle *och;
1669 /* Root ? Do nothing. */
1670 if (dentry->d_inode->i_sb->s_root == dentry)
1673 /* No open handle to close? Move away */
1674 if (!it_disposition(it, DISP_OPEN_OPEN))
1677 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1679 OBD_ALLOC(och, sizeof(*och));
1681 GOTO(out, rc = -ENOMEM);
1683 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1685 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1686 och, inode, 0, NULL);
1688 /* this one is in place of ll_file_open */
1689 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1690 ptlrpc_req_finished(it->d.lustre.it_data);
1691 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Perform the FIEMAP extent-mapping request against the OSTs via
 * cl_object_fiemap(), after validating the flags and making sure the
 * inode size is known.  NOTE(review): some lines are elided.
 */
1697 * Get size for inode for which FIEMAP mapping is requested.
1698 * Make the FIEMAP get_info call and returns the result.
1699 * \param fiemap kernel buffer to hold extens
1700 * \param num_bytes kernel buffer size
1702 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1708 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
1711 /* Checks for fiemap flags */
1712 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support, per fiemap convention */
1713 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1717 /* Check for FIEMAP_FLAG_SYNC */
1718 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1719 rc = filemap_fdatawrite(inode->i_mapping);
1724 env = cl_env_get(&refcheck);
1726 RETURN(PTR_ERR(env));
/* size unknown locally: glimpse it from the OSTs first */
1728 if (i_size_read(inode) == 0) {
1729 rc = ll_glimpse_size(inode);
1734 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1735 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
1736 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
1738 /* If filesize is 0, then there would be no objects for mapping */
1739 if (fmkey.lfik_oa.o_size == 0) {
1740 fiemap->fm_mapped_extents = 0;
1744 fmkey.lfik_fiemap = *fiemap;
1746 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1747 &fmkey, fiemap, &num_bytes);
1749 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * The user supplies the path buffer length; the result is copied back
 * in the same getinfo_fid2path structure.  NOTE(review): some lines
 * are elided in this extract.
 */
1753 int ll_fid2path(struct inode *inode, void __user *arg)
1755 struct obd_export *exp = ll_i2mdexp(inode);
1756 const struct getinfo_fid2path __user *gfin = arg;
1758 struct getinfo_fid2path *gfout;
/* restricted unless admin or the fs allows user fid2path */
1764 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1765 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1768 /* Only need to get the buflen */
1769 if (get_user(pathlen, &gfin->gf_pathlen))
1772 if (pathlen > PATH_MAX)
1775 outsize = sizeof(*gfout) + pathlen;
1776 OBD_ALLOC(gfout, outsize);
1780 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1781 GOTO(gf_free, rc = -EFAULT);
1783 /* Call mdc_iocontrol */
1784 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1788 if (copy_to_user(arg, gfout, outsize))
1792 OBD_FREE(gfout, outsize);
/*
 * Compute the file's data version through a CIT_DATA_VERSION cl_io,
 * retrying if the io layer asks for a restart.  NOTE(review): some
 * lines (including the restart goto target) are elided.
 */
1797 * Read the data_version for inode.
1799 * This value is computed using stripe object version on OST.
1800 * Version is computed using server side locking.
1802 * @param flags if do sync on the OST side;
1804 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1805 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1807 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1809 struct cl_object *obj = ll_i2info(inode)->lli_clob;
1817 /* If no file object initialized, we consider its version is 0. */
1823 env = cl_env_get(&refcheck);
1825 RETURN(PTR_ERR(env));
1827 io = vvp_env_thread_io(env);
1829 io->u.ci_data_version.dv_data_version = 0;
1830 io->u.ci_data_version.dv_flags = flags;
1833 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
1834 result = cl_io_loop(env, io);
1836 result = io->ci_result;
1838 *data_version = io->u.ci_data_version.dv_data_version;
1840 cl_io_fini(env, io);
/* layout change etc. may require re-running the whole io */
1842 if (unlikely(io->ci_need_restart))
1845 cl_env_put(env, &refcheck);
/*
 * HSM release: take a write lease, flush and record the data version,
 * merge attributes, then close with MDS_HSM_RELEASE so the MDT can
 * verify the copy is up to date before freeing OST objects.
 * NOTE(review): some lines are elided in this extract.
 */
1851 * Trigger a HSM release request for the provided inode.
1853 int ll_hsm_release(struct inode *inode)
1855 struct cl_env_nest nest;
1857 struct obd_client_handle *och = NULL;
1858 __u64 data_version = 0;
1862 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1863 ll_get_fsname(inode->i_sb, NULL, 0),
1864 PFID(&ll_i2info(inode)->lli_fid));
1866 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1868 GOTO(out, rc = PTR_ERR(och));
1870 /* Grab latest data_version and [am]time values */
1871 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1875 env = cl_env_nested_get(&nest);
1877 GOTO(out, rc = PTR_ERR(env));
1879 ll_merge_attr(env, inode);
1880 cl_env_nested_put(&nest, env);
1882 /* Release the file.
1883 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1884 * we still need it to pack l_remote_handle to MDT. */
1885 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
1886 MDS_HSM_RELEASE, &data_version);
1891 if (och != NULL && !IS_ERR(och)) /* close the file */
1892 ll_lease_close(och, inode, NULL);
/* State bundle for ll_swap_layouts(); members may be swapped in place
 * to sequentialize the two files.  NOTE(review): other members (dv1,
 * dv2, check_dv1, check_dv2) are elided in this extract. */
1897 struct ll_swap_stack {
1900 struct inode *inode1;
1901 struct inode *inode2;
/*
 * Swap the layouts of two files: order the pair by FID to avoid
 * deadlocks, optionally flush via a shared group lock, optionally
 * verify data versions, then send the swap to the MDT.
 * NOTE(review): some lines are elided in this extract.
 */
1906 static int ll_swap_layouts(struct file *file1, struct file *file2,
1907 struct lustre_swap_layouts *lsl)
1909 struct mdc_swap_layouts msl;
1910 struct md_op_data *op_data;
1913 struct ll_swap_stack *llss = NULL;
1916 OBD_ALLOC_PTR(llss);
1920 llss->inode1 = file1->f_path.dentry->d_inode;
1921 llss->inode2 = file2->f_path.dentry->d_inode;
1923 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
1927 /* we use 2 bool because it is easier to swap than 2 bits */
1928 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1929 llss->check_dv1 = true;
1931 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1932 llss->check_dv2 = true;
1934 /* we cannot use lsl->sl_dvX directly because we may swap them */
1935 llss->dv1 = lsl->sl_dv1;
1936 llss->dv2 = lsl->sl_dv2;
1938 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1939 if (rc == 0) /* same file, done! */
1942 if (rc < 0) { /* sequentialize it */
1943 swap(llss->inode1, llss->inode2);
1945 swap(llss->dv1, llss->dv2);
1946 swap(llss->check_dv1, llss->check_dv2);
1950 if (gid != 0) { /* application asks to flush dirty cache */
1951 rc = ll_get_grouplock(llss->inode1, file1, gid);
1955 rc = ll_get_grouplock(llss->inode2, file2, gid);
1957 ll_put_grouplock(llss->inode1, file1, gid);
1962 /* ultimate check, before swaping the layouts we check if
1963 * dataversion has changed (if requested) */
1964 if (llss->check_dv1) {
1965 rc = ll_data_version(llss->inode1, &dv, 0);
1968 if (dv != llss->dv1)
1969 GOTO(putgl, rc = -EAGAIN);
1972 if (llss->check_dv2) {
1973 rc = ll_data_version(llss->inode2, &dv, 0);
1976 if (dv != llss->dv2)
1977 GOTO(putgl, rc = -EAGAIN);
1980 /* struct md_op_data is used to send the swap args to the mdt
1981 * only flags is missing, so we use struct mdc_swap_layouts
1982 * through the md_op_data->op_data */
1983 /* flags from user space have to be converted before they are send to
1984 * server, no flag is sent today, they are only used on the client */
1987 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1988 0, LUSTRE_OPC_ANY, &msl);
1989 if (IS_ERR(op_data))
1990 GOTO(free, rc = PTR_ERR(op_data));
1992 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1993 sizeof(*op_data), op_data, NULL);
1994 ll_finish_md_op_data(op_data);
/* putgl: release group locks in reverse acquisition order */
2001 ll_put_grouplock(llss->inode2, file2, gid);
2002 ll_put_grouplock(llss->inode1, file1, gid);
/*
 * Set/clear HSM state flags on a file after validating the masks and
 * archive id, then forward the request to the MDT.  NOTE(review): some
 * lines are elided in this extract.
 */
2012 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2014 struct md_op_data *op_data;
2018 /* Detect out-of range masks */
2019 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2022 /* Non-root users are forbidden to set or clear flags which are
2023 * NOT defined in HSM_USER_MASK. */
2024 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2025 !cfs_capable(CFS_CAP_SYS_ADMIN))
2028 /* Detect out-of range archive id */
2029 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2030 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2033 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2034 LUSTRE_OPC_ANY, hss);
2035 if (IS_ERR(op_data))
2036 RETURN(PTR_ERR(op_data));
2038 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2039 sizeof(*op_data), op_data, NULL);
2041 ll_finish_md_op_data(op_data);
/*
 * HSM import: mark a regular file as archived+released on the MDT,
 * then restore the archived attributes (mode, owner, size, times)
 * through setattr.  NOTE(review): some lines are elided.
 */
2046 static int ll_hsm_import(struct inode *inode, struct file *file,
2047 struct hsm_user_import *hui)
2049 struct hsm_state_set *hss = NULL;
2050 struct iattr *attr = NULL;
2054 if (!S_ISREG(inode->i_mode))
2060 GOTO(out, rc = -ENOMEM);
2062 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2063 hss->hss_archive_id = hui->hui_archive_id;
2064 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2065 rc = ll_hsm_state_set(inode, hss);
2069 OBD_ALLOC_PTR(attr);
2071 GOTO(out, rc = -ENOMEM);
/* force-restore the imported file's attributes from the request */
2073 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2074 attr->ia_mode |= S_IFREG;
2075 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2076 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2077 attr->ia_size = hui->hui_size;
2078 attr->ia_mtime.tv_sec = hui->hui_mtime;
2079 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2080 attr->ia_atime.tv_sec = hui->hui_atime;
2081 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2083 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2084 ATTR_UID | ATTR_GID |
2085 ATTR_MTIME | ATTR_MTIME_SET |
2086 ATTR_ATIME | ATTR_ATIME_SET;
2088 mutex_lock(&inode->i_mutex);
2090 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2094 mutex_unlock(&inode->i_mutex);
/* Map an fmode_t into the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls. */
2106 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2108 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2109 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main per-file ioctl dispatcher for the llite client: flags, striping,
 * layout swap, group locks, FID/path translation, data version, HSM
 * state and leases; unknown commands fall through to the generic
 * iocontrol hooks and the data export.  NOTE(review): many lines
 * (case braces, declarations, returns) are elided in this extract.
 */
2113 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2115 struct inode *inode = file->f_path.dentry->d_inode;
2116 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2120 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2121 PFID(ll_inode2fid(inode)), inode, cmd);
2122 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2124 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2125 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2129 case LL_IOC_GETFLAGS:
2130 /* Get the current value of the file flags */
2131 return put_user(fd->fd_flags, (int __user *)arg);
2132 case LL_IOC_SETFLAGS:
2133 case LL_IOC_CLRFLAGS:
2134 /* Set or clear specific file flags */
2135 /* XXX This probably needs checks to ensure the flags are
2136 * not abused, and to handle any flag side effects.
2138 if (get_user(flags, (int __user *) arg))
2141 if (cmd == LL_IOC_SETFLAGS) {
2142 if ((flags & LL_FILE_IGNORE_LOCK) &&
2143 !(file->f_flags & O_DIRECT)) {
2144 CERROR("%s: unable to disable locking on "
2145 "non-O_DIRECT file\n", current->comm);
2149 fd->fd_flags |= flags;
2151 fd->fd_flags &= ~flags;
2154 case LL_IOC_LOV_SETSTRIPE:
2155 RETURN(ll_lov_setstripe(inode, file, arg));
2156 case LL_IOC_LOV_SETEA:
2157 RETURN(ll_lov_setea(inode, file, arg));
2158 case LL_IOC_LOV_SWAP_LAYOUTS: {
2160 struct lustre_swap_layouts lsl;
2162 if (copy_from_user(&lsl, (char __user *)arg,
2163 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2166 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
2169 file2 = fget(lsl.sl_fd);
2173 /* O_WRONLY or O_RDWR */
2174 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
2175 GOTO(out, rc = -EPERM);
2177 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
2178 struct inode *inode2;
2179 struct ll_inode_info *lli;
2180 struct obd_client_handle *och = NULL;
2182 if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
2183 GOTO(out, rc = -EINVAL);
2185 lli = ll_i2info(inode);
2186 mutex_lock(&lli->lli_och_mutex);
2187 if (fd->fd_lease_och != NULL) {
2188 och = fd->fd_lease_och;
2189 fd->fd_lease_och = NULL;
2191 mutex_unlock(&lli->lli_och_mutex);
2193 GOTO(out, rc = -ENOLCK);
2194 inode2 = file2->f_path.dentry->d_inode;
2195 rc = ll_swap_layouts_close(och, inode, inode2);
2197 rc = ll_swap_layouts(file, file2, &lsl);
2203 case LL_IOC_LOV_GETSTRIPE:
2204 RETURN(ll_file_getstripe(inode,
2205 (struct lov_user_md __user *)arg));
2206 case FSFILT_IOC_GETFLAGS:
2207 case FSFILT_IOC_SETFLAGS:
2208 RETURN(ll_iocontrol(inode, file, cmd, arg));
2209 case FSFILT_IOC_GETVERSION_OLD:
2210 case FSFILT_IOC_GETVERSION:
2211 RETURN(put_user(inode->i_generation, (int __user *)arg));
2212 case LL_IOC_GROUP_LOCK:
2213 RETURN(ll_get_grouplock(inode, file, arg));
2214 case LL_IOC_GROUP_UNLOCK:
2215 RETURN(ll_put_grouplock(inode, file, arg));
2216 case IOC_OBD_STATFS:
2217 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2219 /* We need to special case any other ioctls we want to handle,
2220 * to send them to the MDS/OST as appropriate and to properly
2221 * network encode the arg field.
2222 case FSFILT_IOC_SETVERSION_OLD:
2223 case FSFILT_IOC_SETVERSION:
2225 case LL_IOC_FLUSHCTX:
2226 RETURN(ll_flush_ctx(inode));
2227 case LL_IOC_PATH2FID: {
2228 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2229 sizeof(struct lu_fid)))
2234 case LL_IOC_GETPARENT:
2235 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2237 case OBD_IOC_FID2PATH:
2238 RETURN(ll_fid2path(inode, (void __user *)arg));
2239 case LL_IOC_DATA_VERSION: {
2240 struct ioc_data_version idv;
2243 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the flush-mode bits are meaningful from userspace */
2246 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2247 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2250 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2256 case LL_IOC_GET_MDTIDX: {
2259 mdtidx = ll_get_mdt_idx(inode);
2263 if (put_user((int)mdtidx, (int __user *)arg))
2268 case OBD_IOC_GETDTNAME:
2269 case OBD_IOC_GETMDNAME:
2270 RETURN(ll_get_obd_name(inode, cmd, arg));
2271 case LL_IOC_HSM_STATE_GET: {
2272 struct md_op_data *op_data;
2273 struct hsm_user_state *hus;
2280 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2281 LUSTRE_OPC_ANY, hus);
2282 if (IS_ERR(op_data)) {
2284 RETURN(PTR_ERR(op_data));
2287 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2290 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2293 ll_finish_md_op_data(op_data);
2297 case LL_IOC_HSM_STATE_SET: {
2298 struct hsm_state_set *hss;
2305 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2310 rc = ll_hsm_state_set(inode, hss);
2315 case LL_IOC_HSM_ACTION: {
2316 struct md_op_data *op_data;
2317 struct hsm_current_action *hca;
2324 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2325 LUSTRE_OPC_ANY, hca);
2326 if (IS_ERR(op_data)) {
2328 RETURN(PTR_ERR(op_data));
2331 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2334 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2337 ll_finish_md_op_data(op_data);
2341 case LL_IOC_SET_LEASE: {
2342 struct ll_inode_info *lli = ll_i2info(inode);
2343 struct obd_client_handle *och = NULL;
/* requested lease mode must be compatible with the open mode */
2348 case LL_LEASE_WRLCK:
2349 if (!(file->f_mode & FMODE_WRITE))
2351 fmode = FMODE_WRITE;
2353 case LL_LEASE_RDLCK:
2354 if (!(file->f_mode & FMODE_READ))
2358 case LL_LEASE_UNLCK:
2359 mutex_lock(&lli->lli_och_mutex);
2360 if (fd->fd_lease_och != NULL) {
2361 och = fd->fd_lease_och;
2362 fd->fd_lease_och = NULL;
2364 mutex_unlock(&lli->lli_och_mutex);
2369 fmode = och->och_flags;
2370 rc = ll_lease_close(och, inode, &lease_broken);
2377 RETURN(ll_lease_type_from_fmode(fmode));
2382 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2384 /* apply for lease */
2385 och = ll_lease_open(inode, file, fmode, 0);
2387 RETURN(PTR_ERR(och));
2390 mutex_lock(&lli->lli_och_mutex);
2391 if (fd->fd_lease_och == NULL) {
2392 fd->fd_lease_och = och;
2395 mutex_unlock(&lli->lli_och_mutex);
2397 /* impossible now that only excl is supported for now */
2398 ll_lease_close(och, inode, &lease_broken);
2403 case LL_IOC_GET_LEASE: {
2404 struct ll_inode_info *lli = ll_i2info(inode);
2405 struct ldlm_lock *lock = NULL;
2408 mutex_lock(&lli->lli_och_mutex);
2409 if (fd->fd_lease_och != NULL) {
2410 struct obd_client_handle *och = fd->fd_lease_och;
2412 lock = ldlm_handle2lock(&och->och_lease_handle);
2414 lock_res_and_lock(lock);
/* a cancelled lease lock no longer grants anything */
2415 if (!ldlm_is_cancel(lock))
2416 fmode = och->och_flags;
2418 unlock_res_and_lock(lock);
2419 LDLM_LOCK_PUT(lock);
2422 mutex_unlock(&lli->lli_och_mutex);
2424 RETURN(ll_lease_type_from_fmode(fmode));
2426 case LL_IOC_HSM_IMPORT: {
2427 struct hsm_user_import *hui;
2433 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2438 rc = ll_hsm_import(inode, file, hui);
/* default: try registered iocontrol handlers, then the data export */
2448 ll_iocontrol_call(inode, file, cmd, arg, &err))
2451 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2452 (void __user *)arg));
/*
 * Compat copies of llseek_execute()/generic_file_llseek_size() for
 * kernels that do not export them; SEEK_END/SEEK_HOLE/SEEK_DATA are
 * handled against the supplied eof.  NOTE(review): some lines are
 * elided in this extract.
 */
2457 #ifndef HAVE_FILE_LLSEEK_SIZE
2458 static inline loff_t
2459 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2461 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2463 if (offset > maxsize)
2466 if (offset != file->f_pos) {
2467 file->f_pos = offset;
2468 file->f_version = 0;
2474 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2475 loff_t maxsize, loff_t eof)
2477 struct inode *inode = file->f_path.dentry->d_inode;
2485 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2486 * position-querying operation. Avoid rewriting the "same"
2487 * f_pos value back to the file because a concurrent read(),
2488 * write() or lseek() might have altered it
2493 * f_lock protects against read/modify/write race with other
2494 * SEEK_CURs. Note that parallel writes and reads behave
2497 mutex_lock(&inode->i_mutex);
2498 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2499 mutex_unlock(&inode->i_mutex);
2503 * In the generic case the entire file is data, so as long as
2504 * offset isn't at the end of the file then the offset is data.
2511 * There is a virtual hole at the end of the file, so as long as
2512 * offset isn't i_size or larger, return i_size.
2520 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide
 * file size must be glimpsed from the OSTs before delegating to the
 * generic llseek helper.  NOTE(review): some lines are elided.
 */
2524 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2526 struct inode *inode = file->f_path.dentry->d_inode;
2527 loff_t retval, eof = 0;
2530 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2531 (origin == SEEK_CUR) ? file->f_pos : 0);
2532 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2533 PFID(ll_inode2fid(inode)), inode, retval, retval,
2535 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2537 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2538 retval = ll_glimpse_size(inode);
2541 eof = i_size_read(inode);
2544 retval = ll_generic_file_llseek_size(file, offset, origin,
2545 ll_file_maxbytes(inode), eof);
/*
 * flush (close-time) handler: report any async writeback errors that
 * were recorded for this mapping, but avoid reporting the same failure
 * twice to the application.  NOTE(review): some lines are elided.
 */
2549 static int ll_flush(struct file *file, fl_owner_t id)
2551 struct inode *inode = file->f_path.dentry->d_inode;
2552 struct ll_inode_info *lli = ll_i2info(inode);
2553 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2556 LASSERT(!S_ISDIR(inode->i_mode));
2558 /* catch async errors that were recorded back when async writeback
2559 * failed for pages in this mapping. */
2560 rc = lli->lli_async_rc;
2561 lli->lli_async_rc = 0;
2562 if (lli->lli_clob != NULL) {
2563 err = lov_read_and_clear_async_rc(lli->lli_clob);
2568 /* The application has been told write failure already.
2569 * Do not report failure again. */
2570 if (fd->fd_write_failed)
2572 return rc ? -EIO : 0;
/*
 * Run a CIT_FSYNC cl_io over [start, end] with the given mode; on
 * success the return value is the number of pages written.
 * NOTE(review): some lines are elided in this extract.
 */
2576 * Called to make sure a portion of file has been written out.
2577 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2579 * Return how many pages have been written.
2581 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2582 enum cl_fsync_mode mode, int ignore_layout)
2584 struct cl_env_nest nest;
2587 struct cl_fsync_io *fio;
2591 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2592 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2595 env = cl_env_nested_get(&nest);
2597 RETURN(PTR_ERR(env));
2599 io = vvp_env_thread_io(env);
2600 io->ci_obj = ll_i2info(inode)->lli_clob;
2601 io->ci_ignore_layout = ignore_layout;
2603 /* initialize parameters for sync */
2604 fio = &io->u.ci_fsync;
2605 fio->fi_start = start;
2607 fio->fi_fid = ll_inode2fid(inode);
2608 fio->fi_mode = mode;
2609 fio->fi_nr_written = 0;
2611 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2612 result = cl_io_loop(env, io);
2614 result = io->ci_result;
/* on success, surface the page count gathered by the fsync io */
2616 result = fio->fi_nr_written;
2617 cl_io_fini(env, io);
2618 cl_env_nested_put(&nest, env);
/*
 * fsync handler (three kernel-API variants selected by configure):
 * wait for in-flight writeback, surface recorded async errors, sync
 * metadata via md_fsync() and data via cl_sync_file_range(), tracking
 * write failure state on the fd.  NOTE(review): some lines are elided.
 */
2624 * When dentry is provided (the 'else' case), *file->f_path.dentry may be
2625 * null and dentry must be used directly rather than pulled from
2626 * *file->f_path.dentry as is done otherwise.
2629 #ifdef HAVE_FILE_FSYNC_4ARGS
2630 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2632 struct dentry *dentry = file->f_path.dentry;
2633 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2634 int ll_fsync(struct file *file, int datasync)
2636 struct dentry *dentry = file->f_path.dentry;
2638 loff_t end = LLONG_MAX;
2640 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2643 loff_t end = LLONG_MAX;
2645 struct inode *inode = dentry->d_inode;
2646 struct ll_inode_info *lli = ll_i2info(inode);
2647 struct ptlrpc_request *req;
2651 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2652 PFID(ll_inode2fid(inode)), inode);
2653 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2655 #ifdef HAVE_FILE_FSYNC_4ARGS
2656 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2657 mutex_lock(&inode->i_mutex);
2659 /* fsync's caller has already called _fdata{sync,write}, we want
2660 * that IO to finish before calling the osc and mdc sync methods */
2661 rc = filemap_fdatawait(inode->i_mapping);
2664 /* catch async errors that were recorded back when async writeback
2665 * failed for pages in this mapping. */
2666 if (!S_ISDIR(inode->i_mode)) {
2667 err = lli->lli_async_rc;
2668 lli->lli_async_rc = 0;
2671 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* metadata sync to the MDT */
2676 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2680 ptlrpc_req_finished(req);
2682 if (S_ISREG(inode->i_mode)) {
2683 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2685 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2686 if (rc == 0 && err < 0)
2689 fd->fd_write_failed = true;
2691 fd->fd_write_failed = false;
2694 #ifdef HAVE_FILE_FSYNC_4ARGS
2695 mutex_unlock(&inode->i_mutex);
/*
 * flock/POSIX lock handler: translate the VFS file_lock into an LDLM
 * flock enqueue against the MDT, then mirror the result into the local
 * lock tables; a failed local update triggers a compensating LCK_NL
 * unlock on the server.  NOTE(review): many lines (case labels, braces,
 * returns) are elided in this extract.
 */
2701 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2703 struct inode *inode = file->f_path.dentry->d_inode;
2704 struct ll_sb_info *sbi = ll_i2sbi(inode);
2705 struct ldlm_enqueue_info einfo = {
2706 .ei_type = LDLM_FLOCK,
2707 .ei_cb_cp = ldlm_flock_completion_ast,
2708 .ei_cbdata = file_lock,
2710 struct md_op_data *op_data;
2711 struct lustre_handle lockh = {0};
2712 ldlm_policy_data_t flock = {{0}};
2713 int fl_type = file_lock->fl_type;
2719 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2720 PFID(ll_inode2fid(inode)), file_lock);
2722 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2724 if (file_lock->fl_flags & FL_FLOCK) {
2725 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2726 /* flocks are whole-file locks */
2727 flock.l_flock.end = OFFSET_MAX;
2728 /* For flocks owner is determined by the local file desctiptor*/
2729 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2730 } else if (file_lock->fl_flags & FL_POSIX) {
2731 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2732 flock.l_flock.start = file_lock->fl_start;
2733 flock.l_flock.end = file_lock->fl_end;
2737 flock.l_flock.pid = file_lock->fl_pid;
2739 /* Somewhat ugly workaround for svc lockd.
2740 * lockd installs custom fl_lmops->lm_compare_owner that checks
2741 * for the fl_owner to be the same (which it always is on local node
2742 * I guess between lockd processes) and then compares pid.
2743 * As such we assign pid to the owner field to make it all work,
2744 * conflict with normal locks is unlikely since pid space and
2745 * pointer space for current->files are not intersecting */
2746 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2747 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode */
2751 einfo.ei_mode = LCK_PR;
2754 /* An unlock request may or may not have any relation to
2755 * existing locks so we may not be able to pass a lock handle
2756 * via a normal ldlm_lock_cancel() request. The request may even
2757 * unlock a byte range in the middle of an existing lock. In
2758 * order to process an unlock request we need all of the same
2759 * information that is given with a normal read or write record
2760 * lock request. To avoid creating another ldlm unlock (cancel)
2761 * message we'll treat a LCK_NL flock request as an unlock. */
2762 einfo.ei_mode = LCK_NL;
2765 einfo.ei_mode = LCK_PW;
2768 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
2783 flags = LDLM_FL_BLOCK_NOWAIT;
2789 flags = LDLM_FL_TEST_LOCK;
2792 CERROR("unknown fcntl lock command: %d\n", cmd);
2796 /* Save the old mode so that if the mode in the lock changes we
2797 * can decrement the appropriate reader or writer refcount. */
2798 file_lock->fl_type = einfo.ei_mode;
2800 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2801 LUSTRE_OPC_ANY, NULL);
2802 if (IS_ERR(op_data))
2803 RETURN(PTR_ERR(op_data));
2805 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2806 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2807 flock.l_flock.pid, flags, einfo.ei_mode,
2808 flock.l_flock.start, flock.l_flock.end);
2810 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2813 /* Restore the file lock type if not TEST lock. */
2814 if (!(flags & LDLM_FL_TEST_LOCK))
2815 file_lock->fl_type = fl_type;
/* mirror the server-side result into the local VFS lock tables */
2817 if ((file_lock->fl_flags & FL_FLOCK) &&
2818 (rc == 0 || file_lock->fl_type == F_UNLCK))
2819 rc2 = flock_lock_file_wait(file, file_lock);
2820 if ((file_lock->fl_flags & FL_POSIX) &&
2821 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2822 !(flags & LDLM_FL_TEST_LOCK))
2823 rc2 = posix_lock_file_wait(file, file_lock);
2825 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: undo the server lock with LCK_NL */
2826 einfo.ei_mode = LCK_NL;
2827 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2832 ll_finish_md_op_data(op_data);
/* Look up the FID of a child entry by name via a getattr-by-name RPC
 * to the MDS (used when no local dentry/inode is available).
 *
 * \param parent  parent directory inode
 * \param name    child entry name (need not be NUL-terminated)
 * \param namelen length of \a name
 * \param fid     [OUT] FID of the child on success
 *
 * \retval 0 on success, negative errno on failure
 */
2837 int ll_get_fid_by_name(struct inode *parent, const char *name,
2838 int namelen, struct lu_fid *fid)
2840 struct md_op_data *op_data = NULL;
2841 struct mdt_body *body;
2842 struct ptlrpc_request *req;
2846 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2847 LUSTRE_OPC_ANY, NULL);
2848 if (IS_ERR(op_data))
2849 RETURN(PTR_ERR(op_data));
/* only the FID is needed from the reply */
2851 op_data->op_valid = OBD_MD_FLID;
2852 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2853 ll_finish_md_op_data(op_data);
2857 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2859 GOTO(out_req, rc = -EFAULT);
2861 *fid = body->mbo_fid1;
2863 ptlrpc_req_finished(req);
/* Migrate the named child of \a parent to MDT \a mdtidx by issuing a
 * same-name rename with CLI_MIGRATE set.  The child FID is resolved
 * from the dcache if possible, otherwise via ll_get_fid_by_name();
 * local aliases of the child are invalidated so stale metadata is not
 * used after the move.
 *
 * \param parent  parent directory inode
 * \param file    open file for the parent (used for the dcache lookup)
 * \param mdtidx  target MDT index
 * \param name    entry name to migrate
 * \param namelen length of \a name
 *
 * \retval 0 on success (including already on the target MDT),
 *         negative errno on failure
 */
2867 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2868 const char *name, int namelen)
2870 struct dentry *dchild = NULL;
2871 struct inode *child_inode = NULL;
2872 struct md_op_data *op_data;
2873 struct ptlrpc_request *request = NULL;
2878 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2879 name, PFID(ll_inode2fid(parent)), mdtidx);
2881 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2882 0, LUSTRE_OPC_ANY, NULL);
2883 if (IS_ERR(op_data))
2884 RETURN(PTR_ERR(op_data));
2886 /* Get child FID first */
2887 qstr.hash = full_name_hash(name, namelen);
2890 dchild = d_lookup(file->f_path.dentry, &qstr);
2891 if (dchild != NULL) {
2892 if (dchild->d_inode != NULL) {
/* hold the child inode and serialize against other users while
 * the entry is moved between MDTs */
2893 child_inode = igrab(dchild->d_inode);
2894 if (child_inode != NULL) {
2895 mutex_lock(&child_inode->i_mutex);
2896 op_data->op_fid3 = *ll_inode2fid(child_inode);
2897 ll_invalidate_aliases(child_inode);
/* no cached inode: ask the MDS for the FID by name */
2902 rc = ll_get_fid_by_name(parent, name, namelen,
2908 if (!fid_is_sane(&op_data->op_fid3)) {
2909 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2910 ll_get_fsname(parent->i_sb, NULL, 0), name,
2911 PFID(&op_data->op_fid3));
2912 GOTO(out_free, rc = -EINVAL);
2915 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* nothing to do if the entry already lives on the target MDT */
2920 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2921 PFID(&op_data->op_fid3), mdtidx);
2922 GOTO(out_free, rc = 0);
/* a rename onto itself with CLI_MIGRATE performs the actual move */
2925 op_data->op_mds = mdtidx;
2926 op_data->op_cli_flags = CLI_MIGRATE;
2927 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2928 namelen, name, namelen, &request);
2930 ll_update_times(request, parent);
2932 ptlrpc_req_finished(request);
2937 if (child_inode != NULL) {
/* old inode is stale after migration; drop its link count and let
 * the next lookup fetch the new one */
2938 clear_nlink(child_inode);
2939 mutex_unlock(&child_inode->i_mutex);
2943 ll_finish_md_op_data(op_data);
/* Lock handler installed for "-o noflock" mounts: rejects all flock/
 * fcntl lock requests (see ll_file_operations_noflock below). */
2948 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2956 * test if some locks matching bits and l_req_mode are acquired
2957 * - bits can be in different locks
2958 * - if found clear the common lock bits in *bits
2959 * - the bits not found, are kept in *bits
2961 * \param bits [IN] searched lock bits
2962 * \param l_req_mode [IN] searched lock mode
2963 * \retval boolean, true iff all bits are found
2965 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2967 struct lustre_handle lockh;
2968 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all regular modes */
2969 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2970 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2979 fid = &ll_i2info(inode)->lli_fid;
2980 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2981 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take references on matched locks */
2983 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit separately; bits may be covered by
 * different locks, so clear from *bits whatever each match covers */
2984 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2985 policy.l_inodebits.bits = *bits & (1 << i);
2986 if (policy.l_inodebits.bits == 0)
2989 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2990 &policy, mode, &lockh)) {
2991 struct ldlm_lock *lock;
2993 lock = ldlm_handle2lock(&lockh);
2996 ~(lock->l_policy_data.l_inodebits.bits);
2997 LDLM_LOCK_PUT(lock);
2999 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) a granted MD lock covering \a bits
 * on \a inode.  Unlike ll_have_md_lock() this is not a TEST match: on
 * success the caller owns a reference via \a lockh and must release it.
 *
 * \retval the matched lock mode, or 0 if no lock was found
 */
3006 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3007 struct lustre_handle *lockh, __u64 flags,
3010 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3015 fid = &ll_i2info(inode)->lli_fid;
3016 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3018 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3019 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a revalidate RPC: -ENOENT on a file that
 * was unlinked remotely is handled locally rather than reported as a
 * hard error; other failures are logged. */
3024 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3026 /* Already unlinked. Just update nlink and return success */
3027 if (rc == -ENOENT) {
3029 /* This path cannot be hit for regular files unless in
3030 * case of obscure races, so no need to validate
3032 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3034 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected permission-style failures: log them
 * at INFO; anything else is an error */
3035 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3036 "%s: revalidate FID "DFID" error: rc = %d\n",
3037 ll_get_fsname(inode->i_sb, NULL, 0),
3038 PFID(ll_inode2fid(inode)), rc);
/* Revalidate the MD attributes of \a dentry's inode against the MDS,
 * taking (or re-using) the ibits locks named in \a ibits.  Two paths:
 * with OBD_CONNECT_ATTRFID an intent getattr-by-FID is used; otherwise
 * a plain md_getattr() is issued unless a matching MD lock is already
 * cached locally.
 *
 * \retval 0 on success, negative errno on failure
 */
3044 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3046 struct inode *inode = dentry->d_inode;
3047 struct ptlrpc_request *req = NULL;
3048 struct obd_export *exp;
3052 LASSERT(inode != NULL);
3054 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3055 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3057 exp = ll_i2mdexp(inode);
3059 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3060 * But under CMD case, it caused some lock issues, should be fixed
3061 * with new CMD ibits lock. See bug 12718 */
3062 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3063 struct lookup_intent oit = { .it_op = IT_GETATTR };
3064 struct md_op_data *op_data;
/* a pure LOOKUP-bit request only needs an IT_LOOKUP intent */
3066 if (ibits == MDS_INODELOCK_LOOKUP)
3067 oit.it_op = IT_LOOKUP;
3069 /* Call getattr by fid, so do not provide name at all. */
3070 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3071 dentry->d_inode, NULL, 0, 0,
3072 LUSTRE_OPC_ANY, NULL);
3073 if (IS_ERR(op_data))
3074 RETURN(PTR_ERR(op_data));
3076 rc = md_intent_lock(exp, op_data, &oit, &req,
3077 &ll_md_blocking_ast, 0);
3078 ll_finish_md_op_data(op_data);
3080 rc = ll_inode_revalidate_fini(inode, rc);
3084 rc = ll_revalidate_it_finish(req, &oit, dentry);
3086 ll_intent_release(&oit);
3090 /* Unlinked? Unhash dentry, so it is not picked up later by
3091 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3092 here to preserve get_cwd functionality on 2.6.
3094 if (!dentry->d_inode->i_nlink)
3095 d_lustre_invalidate(dentry, 0);
3097 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: only issue an RPC if no cached MD lock
 * already covers the requested ibits */
3098 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3099 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3100 u64 valid = OBD_MD_FLGETATTR;
3101 struct md_op_data *op_data;
/* for regular files also fetch the striping EA, sized to the
 * default MD size for this mount */
3104 if (S_ISREG(inode->i_mode)) {
3105 rc = ll_get_default_mdsize(sbi, &ealen);
3108 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3111 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3112 0, ealen, LUSTRE_OPC_ANY,
3114 if (IS_ERR(op_data))
3115 RETURN(PTR_ERR(op_data));
3117 op_data->op_valid = valid;
3118 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3119 ll_finish_md_op_data(op_data);
3121 rc = ll_inode_revalidate_fini(inode, rc);
3125 rc = ll_prep_inode(&inode, req, NULL, NULL);
3128 ptlrpc_req_finished(req);
/* For a striped directory, merge the per-stripe MD attributes
 * (nlink, blocks, size, a/m/ctime) into the master inode. */
3132 static int ll_merge_md_attr(struct inode *inode)
3134 struct cl_attr attr = { 0 };
3137 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3138 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3139 &attr, ll_md_blocking_ast);
3143 set_nlink(inode, attr.cat_nlink);
3144 inode->i_blocks = attr.cat_blocks;
3145 i_size_write(inode, attr.cat_size);
/* timestamps are kept in lli_* and copied to the inode by the
 * revalidate path */
3147 ll_i2info(inode)->lli_atime = attr.cat_atime;
3148 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3149 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full inode revalidation: refresh MD attributes via
 * __ll_inode_revalidate(), then for regular files glimpse the size
 * from the OSTs (unless an HSM restore is running, in which case the
 * MDT-provided size is already authoritative). */
3155 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3157 struct inode *inode = dentry->d_inode;
3161 rc = __ll_inode_revalidate(dentry, ibits);
3165 /* if object isn't regular file, don't validate size */
3166 if (!S_ISREG(inode->i_mode)) {
/* striped directory: aggregate attributes from all stripes */
3167 if (S_ISDIR(inode->i_mode) &&
3168 ll_i2info(inode)->lli_lsm_md != NULL) {
3169 rc = ll_merge_md_attr(inode);
3174 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3175 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3176 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3178 /* In case of restore, the MDT has the right size and has
3179 * already sent it back without granting the layout lock,
3180 * inode is up-to-date so glimpse is useless.
3181 * Also to glimpse we need the layout, in case of a running
3182 * restore the MDT holds the layout lock so the glimpse will
3183 * block up to the end of restore (getattr will block)
3185 if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
3186 rc = ll_glimpse_size(inode);
/* VFS ->getattr: revalidate UPDATE|LOOKUP ibits then copy the inode
 * attributes into \a stat.  With a 32-bit API client the inode number
 * is built from the FID to stay within 32 bits. */
3191 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3193 struct inode *inode = de->d_inode;
3194 struct ll_sb_info *sbi = ll_i2sbi(inode);
3195 struct ll_inode_info *lli = ll_i2info(inode);
3198 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3199 MDS_INODELOCK_LOOKUP);
3200 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3205 stat->dev = inode->i_sb->s_dev;
3206 if (ll_need_32bit_api(sbi))
3207 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3209 stat->ino = inode->i_ino;
3210 stat->mode = inode->i_mode;
3211 stat->uid = inode->i_uid;
3212 stat->gid = inode->i_gid;
3213 stat->rdev = inode->i_rdev;
3214 stat->atime = inode->i_atime;
3215 stat->mtime = inode->i_mtime;
3216 stat->ctime = inode->i_ctime;
3217 stat->blksize = 1 << inode->i_blkbits;
3219 stat->nlink = inode->i_nlink;
3220 stat->size = i_size_read(inode);
3221 stat->blocks = inode->i_blocks;
/* VFS ->fiemap: translate the kernel's fiemap_extent_info into the
 * Lustre fiemap structure, run ll_do_fiemap(), and copy the mapped
 * extents back to the user buffer referenced by \a fieinfo. */
3226 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3227 __u64 start, __u64 len)
3231 struct fiemap *fiemap;
3232 unsigned int extent_count = fieinfo->fi_extents_max;
/* one header plus room for all requested extents */
3234 num_bytes = sizeof(*fiemap) + (extent_count *
3235 sizeof(struct fiemap_extent));
3236 OBD_ALLOC_LARGE(fiemap, num_bytes);
3241 fiemap->fm_flags = fieinfo->fi_flags;
3242 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3243 fiemap->fm_start = start;
3244 fiemap->fm_length = len;
/* copy in the first user extent if one was supplied (may carry a
 * continuation cookie from a previous call) */
3245 if (extent_count > 0 &&
3246 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3247 sizeof(struct fiemap_extent)) != 0)
3248 GOTO(out, rc = -EFAULT);
3250 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3252 fieinfo->fi_flags = fiemap->fm_flags;
3253 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3254 if (extent_count > 0 &&
3255 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3256 fiemap->fm_mapped_extents *
3257 sizeof(struct fiemap_extent)) != 0)
3258 GOTO(out, rc = -EFAULT);
3260 OBD_FREE_LARGE(fiemap, num_bytes);
/* VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * \a type is the ACL type requested by the VFS; only the cached
 * access ACL is held in lli_posix_acl here. */
3264 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3266 struct ll_inode_info *lli = ll_i2info(inode);
3267 struct posix_acl *acl = NULL;
/* lli_lock protects lli_posix_acl against concurrent update */
3270 spin_lock(&lli->lli_lock);
3271 /* VFS' acl_permission_check->check_acl will release the refcount */
3272 acl = posix_acl_dup(lli->lli_posix_acl);
3273 spin_unlock(&lli->lli_lock);
/* ACL-check callback passed to generic_permission() on kernels whose
 * generic_permission() takes more than two arguments.  Signature
 * varies with the kernel API (3 vs 4 argument form). */
3278 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3280 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3281 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3283 ll_check_acl(struct inode *inode, int mask)
3286 # ifdef CONFIG_FS_POSIX_ACL
3287 struct posix_acl *acl;
3291 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot block in RCU-walk mode; caller will retry in ref-walk */
3292 if (flags & IPERM_FLAG_RCU)
3295 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3300 rc = posix_acl_permission(inode, acl, mask);
3301 posix_acl_release(acl);
3304 # else /* !CONFIG_FS_POSIX_ACL */
3306 # endif /* CONFIG_FS_POSIX_ACL */
3308 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* VFS ->permission: permission check with Lustre extras —
 * revalidates the root inode on first access, applies root squashing
 * by temporarily overriding the task credentials, and supports remote
 * permission checks for remote (RMT) clients.  The prototype varies
 * with the kernel's inode_permission() signature. */
3310 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3311 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3313 # ifdef HAVE_INODE_PERMISION_2ARGS
3314 int ll_inode_permission(struct inode *inode, int mask)
3316 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3321 struct ll_sb_info *sbi;
3322 struct root_squash_info *squash;
3323 struct cred *cred = NULL;
3324 const struct cred *old_cred = NULL;
3326 bool squash_id = false;
/* RCU-walk mode must not block: bail out so the VFS retries in
 * ref-walk mode */
3329 #ifdef MAY_NOT_BLOCK
3330 if (mask & MAY_NOT_BLOCK)
3332 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3333 if (flags & IPERM_FLAG_RCU)
3337 /* as root inode are NOT getting validated in lookup operation,
3338 * need to do it before permission check. */
3340 if (inode == inode->i_sb->s_root->d_inode) {
3341 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3342 MDS_INODELOCK_LOOKUP);
3347 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3348 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3350 /* squash fsuid/fsgid if needed */
3351 sbi = ll_i2sbi(inode);
3352 squash = &sbi->ll_squash;
3353 if (unlikely(squash->rsi_uid != 0 &&
3354 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3355 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3359 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3360 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3361 squash->rsi_uid, squash->rsi_gid);
3363 /* update current process's credentials
3364 * and FS capability */
3365 cred = prepare_creds();
3369 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3370 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities while squashed */
3371 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3372 if ((1 << cap) & CFS_CAP_FS_MASK)
3373 cap_lower(cred->cap_effective, cap);
3375 old_cred = override_creds(cred);
3378 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3380 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3381 rc = lustre_check_remote_perm(inode, mask);
3383 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3385 /* restore current process's credentials and FS capability */
3387 revert_creds(old_cred);
3394 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock handlers, so flock/fcntl
 * locks fall back to the kernel's local-only implementation. */
3395 struct file_operations ll_file_operations = {
3396 .read = ll_file_read,
3397 .aio_read = ll_file_aio_read,
3398 .write = ll_file_write,
3399 .aio_write = ll_file_aio_write,
3400 .unlocked_ioctl = ll_file_ioctl,
3401 .open = ll_file_open,
3402 .release = ll_file_release,
3403 .mmap = ll_file_mmap,
3404 .llseek = ll_file_seek,
3405 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: same as the default table
 * but with cluster-coherent .flock/.lock via ll_file_flock(). */
3410 struct file_operations ll_file_operations_flock = {
3411 .read = ll_file_read,
3412 .aio_read = ll_file_aio_read,
3413 .write = ll_file_write,
3414 .aio_write = ll_file_aio_write,
3415 .unlocked_ioctl = ll_file_ioctl,
3416 .open = ll_file_open,
3417 .release = ll_file_release,
3418 .mmap = ll_file_mmap,
3419 .llseek = ll_file_seek,
3420 .splice_read = ll_file_splice_read,
3423 .flock = ll_file_flock,
3424 .lock = ll_file_flock
3427 /* These are for -o noflock - to return ENOSYS on flock calls */
3428 struct file_operations ll_file_operations_noflock = {
3429 .read = ll_file_read,
3430 .aio_read = ll_file_aio_read,
3431 .write = ll_file_write,
3432 .aio_write = ll_file_aio_write,
3433 .unlocked_ioctl = ll_file_ioctl,
3434 .open = ll_file_open,
3435 .release = ll_file_release,
3436 .mmap = ll_file_mmap,
3437 .llseek = ll_file_seek,
3438 .splice_read = ll_file_splice_read,
3441 .flock = ll_file_noflock,
3442 .lock = ll_file_noflock
/* inode_operations for regular Lustre files. */
3445 struct inode_operations ll_file_inode_operations = {
3446 .setattr = ll_setattr,
3447 .getattr = ll_getattr,
3448 .permission = ll_inode_permission,
3449 .setxattr = ll_setxattr,
3450 .getxattr = ll_getxattr,
3451 .listxattr = ll_listxattr,
3452 .removexattr = ll_removexattr,
3453 .fiemap = ll_fiemap,
/* only on kernels where inode_operations has ->get_acl */
3454 #ifdef HAVE_IOP_GET_ACL
3455 .get_acl = ll_get_acl,
3459 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers; the list
 * is protected by the rw-semaphore (writers add/remove registrations,
 * readers dispatch ioctls). */
3460 static struct llioc_ctl_data {
3461 struct rw_semaphore ioc_sem;
3462 struct list_head ioc_head;
3464 __RWSEM_INITIALIZER(llioc.ioc_sem),
3465 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: a callback plus the ioctl commands it handles.
 * iocd_cmd is a flexible trailing array of iocd_count entries. */
3470 struct list_head iocd_list;
3471 unsigned int iocd_size;
3472 llioc_callback_t iocd_cb;
3473 unsigned int iocd_count;
3474 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler for \a count commands in \a cmd.
 *
 * \retval an opaque handle ("magic") to pass to
 *         ll_iocontrol_unregister(), or NULL on bad arguments or
 *         allocation failure
 */
3477 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3480 struct llioc_data *in_data = NULL;
3483 if (cb == NULL || cmd == NULL ||
3484 count > LLIOC_MAX_CMD || count < 0)
3487 size = sizeof(*in_data) + count * sizeof(unsigned int);
3488 OBD_ALLOC(in_data, size);
3489 if (in_data == NULL)
/* NOTE(review): memset looks redundant if OBD_ALLOC already zeroes
 * the buffer — harmless, kept as-is */
3492 memset(in_data, 0, sizeof(*in_data));
3493 in_data->iocd_size = size;
3494 in_data->iocd_cb = cb;
3495 in_data->iocd_count = count;
3496 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3498 down_write(&llioc.ioc_sem);
3499 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3500 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by \a magic (the handle
 * returned by ll_iocontrol_register()).  Warns if no matching
 * registration is found. */
3505 void ll_iocontrol_unregister(void *magic)
3507 struct llioc_data *tmp;
3512 down_write(&llioc.ioc_sem);
3513 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the allocation size before unlinking so the free below
 * matches the original OBD_ALLOC */
3515 unsigned int size = tmp->iocd_size;
3517 list_del(&tmp->iocd_list);
3518 up_write(&llioc.ioc_sem);
3520 OBD_FREE(tmp, size);
3524 up_write(&llioc.ioc_sem);
3526 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3529 EXPORT_SYMBOL(ll_iocontrol_register);
3530 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl \a cmd through the dynamic registry: each registered
 * callback that claims \a cmd is invoked until one returns LLIOC_STOP.
 *
 * \param rcp [OUT] result code from the handling callback
 * \retval LLIOC_STOP if a callback consumed the ioctl, LLIOC_CONT
 *         otherwise
 */
3532 static enum llioc_iter
3533 ll_iocontrol_call(struct inode *inode, struct file *file,
3534 unsigned int cmd, unsigned long arg, int *rcp)
3536 enum llioc_iter ret = LLIOC_CONT;
3537 struct llioc_data *data;
3538 int rc = -EINVAL, i;
3540 down_read(&llioc.ioc_sem);
3541 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3542 for (i = 0; i < data->iocd_count; i++) {
3543 if (cmd != data->iocd_cmd[i])
3546 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3550 if (ret == LLIOC_STOP)
3553 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack for \a inode.
 * For OBJECT_CONF_SET the layout comes from the layout lock's LVB;
 * after applying it the lock is allowed to match and the cached layout
 * generation is refreshed.
 *
 * \retval 0 on success, negative errno on failure
 */
3560 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3562 struct ll_inode_info *lli = ll_i2info(inode);
3563 struct cl_object *obj = lli->lli_clob;
3564 struct cl_env_nest nest;
3572 env = cl_env_nested_get(&nest);
3574 RETURN(PTR_ERR(env));
3576 rc = cl_conf_set(env, lli->lli_clob, conf);
3580 if (conf->coc_opc == OBJECT_CONF_SET) {
3581 struct ldlm_lock *lock = conf->coc_lock;
3582 struct cl_layout cl = {
3586 LASSERT(lock != NULL);
3587 LASSERT(ldlm_has_layout(lock));
3589 /* it can only be allowed to match after layout is
3590 * applied to inode otherwise false layout would be
3591 * seen. Applying layout should happen before dropping
3592 * the intent lock. */
3593 ldlm_lock_allow_match(lock);
3595 rc = cl_object_layout_get(env, obj, &cl);
3600 DFID": layout version change: %u -> %u\n",
3601 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3603 ll_layout_version_set(lli, cl.cl_layout_gen);
3607 cl_env_nested_put(&nest, env);
3612 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3613 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3616 struct ll_sb_info *sbi = ll_i2sbi(inode);
3617 struct ptlrpc_request *req;
3618 struct mdt_body *body;
3625 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3626 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3627 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated: nothing to fetch */
3629 if (lock->l_lvb_data != NULL)
3632 /* if layout lock was granted right away, the layout is returned
3633 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3634 * blocked and then granted via completion ast, we have to fetch
3635 * layout here. Please note that we can't use the LVB buffer in
3636 * completion AST because it doesn't have a large enough buffer */
3637 rc = ll_get_default_mdsize(sbi, &lmmsize);
3639 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3640 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3645 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3647 GOTO(out, rc = -EPROTO);
3649 lmmsize = body->mbo_eadatasize;
3650 if (lmmsize == 0) /* empty layout */
3653 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3655 GOTO(out, rc = -EFAULT);
3657 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3658 if (lvbdata == NULL)
3659 GOTO(out, rc = -ENOMEM);
/* attach the fetched layout to the lock's LVB under the resource
 * lock; if another thread raced us and installed one first, free
 * ours below */
3661 memcpy(lvbdata, lmm, lmmsize);
3662 lock_res_and_lock(lock);
3663 if (unlikely(lock->l_lvb_data == NULL)) {
3664 lock->l_lvb_type = LVB_T_LAYOUT;
3665 lock->l_lvb_data = lvbdata;
3666 lock->l_lvb_len = lmmsize;
3669 unlock_res_and_lock(lock);
3671 if (lvbdata != NULL)
3672 OBD_FREE_LARGE(lvbdata, lmmsize);
3677 ptlrpc_req_finished(req);
3682 * Apply the layout to the inode. Layout lock is held and will be released
3685 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3686 struct inode *inode)
3688 struct ll_inode_info *lli = ll_i2info(inode);
3689 struct ll_sb_info *sbi = ll_i2sbi(inode);
3690 struct ldlm_lock *lock;
3691 struct cl_object_conf conf;
3694 bool wait_layout = false;
3697 LASSERT(lustre_handle_is_used(lockh));
3699 lock = ldlm_handle2lock(lockh);
3700 LASSERT(lock != NULL);
3701 LASSERT(ldlm_has_layout(lock));
3703 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3704 PFID(&lli->lli_fid), inode);
3706 /* in case this is a caching lock and reinstate with new inode */
3707 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3709 lock_res_and_lock(lock);
3710 lvb_ready = ldlm_is_lvb_ready(lock);
3711 unlock_res_and_lock(lock);
3712 /* checking lvb_ready is racy but this is okay. The worst case is
3713 * that multi processes may configure the file on the same time. */
3718 rc = ll_layout_fetch(inode, lock);
3722 /* for layout lock, lmm is stored in lock's lvb.
3723 * lvb_data is immutable if the lock is held so it's safe to access it
3726 * set layout to file. Unlikely this will fail as old layout was
3727 * surely eliminated */
3728 memset(&conf, 0, sizeof conf);
3729 conf.coc_opc = OBJECT_CONF_SET;
3730 conf.coc_inode = inode;
3731 conf.coc_lock = lock;
3732 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
3733 conf.u.coc_layout.lb_len = lock->l_lvb_len;
3734 rc = ll_layout_conf(inode, &conf);
3736 /* refresh layout failed, need to wait */
3737 wait_layout = rc == -EBUSY;
/* done with the lock: drop our reference and the enqueue ref */
3741 LDLM_LOCK_PUT(lock);
3742 ldlm_lock_decref(lockh, mode);
3744 /* wait for IO to complete if it's still being used. */
3746 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3747 ll_get_fsname(inode->i_sb, NULL, 0),
3748 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO against the old
 * layout drains */
3750 memset(&conf, 0, sizeof conf);
3751 conf.coc_opc = OBJECT_CONF_WAIT;
3752 conf.coc_inode = inode;
3753 rc = ll_layout_conf(inode, &conf);
3757 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3758 ll_get_fsname(inode->i_sb, NULL, 0),
3759 PFID(&lli->lli_fid), rc);
/* Core of ll_layout_refresh(): first try to match a cached layout
 * lock; if none, enqueue a new IT_LAYOUT intent lock on the MDS and
 * apply the resulting layout via ll_layout_lock_set().  Caller must
 * hold lli_layout_mutex. */
3764 static int ll_layout_refresh_locked(struct inode *inode)
3766 struct ll_inode_info *lli = ll_i2info(inode);
3767 struct ll_sb_info *sbi = ll_i2sbi(inode);
3768 struct md_op_data *op_data;
3769 struct lookup_intent it;
3770 struct lustre_handle lockh;
3772 struct ldlm_enqueue_info einfo = {
3773 .ei_type = LDLM_IBITS,
3775 .ei_cb_bl = &ll_md_blocking_ast,
3776 .ei_cb_cp = &ldlm_completion_ast,
3782 /* mostly layout lock is caching on the local side, so try to match
3783 * it before grabbing layout lock mutex. */
3784 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3785 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3786 if (mode != 0) { /* hit cached lock */
3787 rc = ll_layout_lock_set(&lockh, mode, inode);
3794 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3795 0, 0, LUSTRE_OPC_ANY, NULL);
3796 if (IS_ERR(op_data))
3797 RETURN(PTR_ERR(op_data));
3799 /* have to enqueue one */
3800 memset(&it, 0, sizeof(it));
3801 it.it_op = IT_LAYOUT;
3802 lockh.cookie = 0ULL;
3804 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3805 ll_get_fsname(inode->i_sb, NULL, 0),
3806 PFID(&lli->lli_fid), inode);
3808 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the reply request is not needed once the intent result has been
 * consumed */
3809 if (it.d.lustre.it_data != NULL)
3810 ptlrpc_req_finished(it.d.lustre.it_data);
3811 it.d.lustre.it_data = NULL;
3813 ll_finish_md_op_data(op_data);
/* take ownership of the granted lock mode out of the intent before
 * dropping the intent's own reference */
3815 mode = it.d.lustre.it_lock_mode;
3816 it.d.lustre.it_lock_mode = 0;
3817 ll_intent_drop_lock(&it);
3820 /* set lock data in case this is a new lock */
3821 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3822 rc = ll_layout_lock_set(&lockh, mode, inode);
3831 * This function checks if there exists a LAYOUT lock on the client side,
3832 * or enqueues it if it doesn't have one in cache.
3834 * This function will not hold layout lock so it may be revoked any time after
3835 * this function returns. Any operations depend on layout should be redone
3838 * This function should be called before lov_io_init() to get an uptodate
3839 * layout version, the caller should save the version number and after IO
3840 * is finished, this function should be called again to verify that layout
3841 * is not changed during IO time.
3843 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3845 struct ll_inode_info *lli = ll_i2info(inode);
3846 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* fast path: layout generation already known, or layout locks are
 * disabled for this mount */
3850 *gen = ll_layout_version_get(lli);
3851 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3855 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3856 LASSERT(S_ISREG(inode->i_mode));
3858 /* take layout lock mutex to enqueue layout lock exclusively. */
3859 mutex_lock(&lli->lli_layout_mutex);
3861 rc = ll_layout_refresh_locked(inode);
3865 *gen = ll_layout_version_get(lli);
3867 mutex_unlock(&lli->lli_layout_mutex);
3873 * This function send a restore request to the MDT
3875 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3877 struct hsm_user_request *hur;
3881 len = sizeof(struct hsm_user_request) +
3882 sizeof(struct hsm_user_item);
3883 OBD_ALLOC(hur, len);
3887 hur->hur_request.hr_action = HUA_RESTORE;
3888 hur->hur_request.hr_archive_id = 0;
3889 hur->hur_request.hr_flags = 0;
3890 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3891 sizeof(hur->hur_user_item[0].hui_fid));
3892 hur->hur_user_item[0].hui_extent.offset = offset;
3893 hur->hur_user_item[0].hui_extent.length = length;
3894 hur->hur_request.hr_itemcount = 1;
3895 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,