4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from its slab cache.
 * GFP_NOFS prevents recursing back into the filesystem during reclaim.
 * NOTE(review): the NULL-check and return statement are elided in this
 * excerpt — presumably returns NULL on allocation failure; confirm.
 */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-error state. */
78 fd->fd_write_failed = false;
/* Return a ll_file_data obtained from ll_file_data_get() to the slab. */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (fid, mode, a/m/ctime, size,
 * blocks, flags), the MDS open handle @fh and the MDS capability into
 * @op_data for a metadata RPC.  Sets the MDS_DATA_MODIFIED bias when
 * the inode still carries the LLIF_DATA_MODIFIED flag so the MDT knows
 * data changed since the last close.
 */
89 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
90 struct lustre_handle *fh)
92 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
98 op_data->op_attr_blocks = inode->i_blocks;
99 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
101 op_data->op_handle = *fh;
102 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDT that file data was modified under this open handle. */
104 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
105 op_data->op_bias |= MDS_DATA_MODIFIED;
109 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Mark mode and all timestamps valid for the CLOSE RPC; size/blocks
 * are added only for write opens (read-only opens cannot have changed
 * them).  Finishes by copying the inode attributes and open handle via
 * ll_pack_inode2opdata() and initialising the op_data for the target
 * inode.
 */
111 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
112 struct obd_client_handle *och)
116 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
117 ATTR_MTIME | ATTR_MTIME_SET |
118 ATTR_CTIME | ATTR_CTIME_SET;
/* Only a write-capable open can have dirtied size/blocks. */
120 if (!(och->och_flags & FMODE_WRITE))
123 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE RPC for the open handle @och of @inode.
 * A non-NULL @data_version turns the close into an HSM release: the
 * data version and lease handle are packed so the MDT can verify the
 * file was not modified while being archived.  On success the
 * DATA_MODIFIED flag is cleared and replay data for the handle is
 * dropped.  NOTE(review): several lines (error checks, 'out' label,
 * return) are elided in this excerpt.
 */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
170 ll_i2mdexp(inode)->exp_obd->obd_name,
171 PFID(ll_inode2fid(inode)), rc);
174 /* DATA_MODIFIED flag was successfully sent on close, cancel data
175 * modification flag. */
176 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
177 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock. */
179 spin_lock(&lli->lli_lock);
180 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
181 spin_unlock(&lli->lli_lock);
/* For HSM release, verify the MDT actually released the file. */
184 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
191 ll_finish_md_op_data(op_data);
/* Handle is dead from now on; poison the cookie to catch reuse. */
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 if (req) /* This is close request */
200 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of @inode for the given open mode once the
 * last local user is gone.  Selects the per-mode handle slot
 * (write/exec/read) and its use count; if other users remain, the
 * close is skipped.  The actual RPC is sent by
 * ll_close_inode_openhandle() outside och_mutex, so a racing close of
 * the same handle is tolerated (see comment at line 238).
 */
204 int ll_md_real_close(struct inode *inode, fmode_t fmode)
206 struct ll_inode_info *lli = ll_i2info(inode);
207 struct obd_client_handle **och_p;
208 struct obd_client_handle *och;
/* Pick the handle slot matching the open mode. */
213 if (fmode & FMODE_WRITE) {
214 och_p = &lli->lli_mds_write_och;
215 och_usecount = &lli->lli_open_fd_write_count;
216 } else if (fmode & FMODE_EXEC) {
217 och_p = &lli->lli_mds_exec_och;
218 och_usecount = &lli->lli_open_fd_exec_count;
220 LASSERT(fmode & FMODE_READ);
221 och_p = &lli->lli_mds_read_och;
222 och_usecount = &lli->lli_open_fd_read_count;
225 mutex_lock(&lli->lli_och_mutex);
226 if (*och_usecount > 0) {
227 /* There are still users of this handle, so skip
229 mutex_unlock(&lli->lli_och_mutex);
235 mutex_unlock(&lli->lli_och_mutex);
238 /* There might be a race and this handle may already
240 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close path: release the group lock if held, clean up
 * a leaked lease handle, drop the direct open handle (fd_och), then
 * decrement the per-mode open count and — unless a cached OPEN DLM
 * lock lets us skip the RPC — call ll_md_real_close().  Finally frees
 * the ll_file_data and drops the capability.
 */
247 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251 struct ll_inode_info *lli = ll_i2info(inode);
255 /* clear group lock, if present */
256 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
257 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
259 if (fd->fd_lease_och != NULL) {
262 /* Usually the lease is not released when the
263 * application crashed, we need to release here. */
264 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
265 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
266 PFID(&lli->lli_fid), rc, lease_broken);
268 fd->fd_lease_och = NULL;
/* Close the handle acquired for a lease, if any. */
271 if (fd->fd_och != NULL) {
272 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
277 /* Let's see if we have good enough OPEN lock on the file and if
278 we can skip talking to MDS */
279 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* LDLM_FL_TEST_LOCK: probe only, do not take a reference. */
281 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
282 struct lustre_handle lockh;
283 struct inode *inode = file->f_dentry->d_inode;
284 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop our reference on the per-mode open count. */
286 mutex_lock(&lli->lli_och_mutex);
287 if (fd->fd_omode & FMODE_WRITE) {
289 LASSERT(lli->lli_open_fd_write_count);
290 lli->lli_open_fd_write_count--;
291 } else if (fd->fd_omode & FMODE_EXEC) {
293 LASSERT(lli->lli_open_fd_exec_count);
294 lli->lli_open_fd_exec_count--;
297 LASSERT(lli->lli_open_fd_read_count);
298 lli->lli_open_fd_read_count--;
300 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached: must send the close RPC now. */
302 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
303 LDLM_IBITS, &policy, lockmode,
305 rc = ll_md_real_close(file->f_dentry->d_inode,
309 CERROR("released file has negative dentry: file = %p, "
310 "dentry = %p, name = %s\n",
311 file, file->f_dentry, file->f_dentry->d_name.name);
315 LUSTRE_FPRIVATE(file) = NULL;
316 ll_file_data_put(fd);
317 ll_capa_close(inode);
322 /* While this returns an error code, fput() the caller does not, so we need
323 * to make every effort to clean up all of our state here. Also, applications
324 * rarely check close errors and even if an error is returned they will not
325 * re-try the close call.
/*
 * VFS ->release() entry point.  Clears remote-ACL state for the root
 * inode, de-authorizes statahead for directories, short-circuits the
 * root dentry (no MDS close needed), folds async write errors into the
 * close result for regular files, and delegates the real work to
 * ll_md_close().
 */
327 int ll_file_release(struct inode *inode, struct file *file)
329 struct ll_file_data *fd;
330 struct ll_sb_info *sbi = ll_i2sbi(inode);
331 struct ll_inode_info *lli = ll_i2info(inode);
335 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
336 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping only applies to the root inode. */
338 #ifdef CONFIG_FS_POSIX_ACL
339 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
340 inode == inode->i_sb->s_root->d_inode) {
341 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
344 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
345 fd->fd_flags &= ~LL_FILE_RMTACL;
346 rct_del(&sbi->ll_rct, current_pid());
347 et_search_free(&sbi->ll_et, current_pid());
/* Do not count releases of the root dentry in the stats. */
352 if (inode->i_sb->s_root != file->f_dentry)
353 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
354 fd = LUSTRE_FPRIVATE(file);
357 /* The last ref on @file, maybe not the the owner pid of statahead,
358 * because parent and child process can share the same file handle. */
359 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
360 ll_deauthorize_statahead(inode, fd);
/* Root has no MDS open handle to close; just free local state. */
362 if (inode->i_sb->s_root == file->f_dentry) {
363 LUSTRE_FPRIVATE(file) = NULL;
364 ll_file_data_put(fd);
/* Pick up any asynchronous write error to report at close time. */
368 if (!S_ISDIR(inode->i_mode)) {
369 if (lli->lli_clob != NULL)
370 lov_read_and_clear_async_rc(lli->lli_clob);
371 lli->lli_async_rc = 0;
374 rc = ll_md_close(sbi->ll_md_exp, inode, file);
376 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
377 libcfs_debug_dumplog();
/*
 * Issue an IT_OPEN intent lock request to the MDS for @file, packing
 * the striping data @lmm/@lmmsize into the request.  The file name is
 * only packed when the server lacks OBD_CONNECT_OPEN_BY_FID and the
 * dentry name is valid; otherwise open-by-FID is used.  On success the
 * inode is refreshed from the reply and the returned lock is attached.
 * NOTE(review): the out/out_release labels and return are elided here.
 */
382 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
383 struct lookup_intent *itp)
385 struct dentry *de = file->f_dentry;
386 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
387 struct dentry *parent = de->d_parent;
388 const char *name = NULL;
390 struct md_op_data *op_data;
391 struct ptlrpc_request *req = NULL;
395 LASSERT(parent != NULL);
396 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
398 /* if server supports open-by-fid, or file name is invalid, don't pack
399 * name in open request */
400 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
401 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
402 name = de->d_name.name;
403 len = de->d_name.len;
406 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
407 name, len, 0, LUSTRE_OPC_ANY, NULL);
409 RETURN(PTR_ERR(op_data));
410 op_data->op_data = lmm;
411 op_data->op_data_size = lmmsize;
413 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
414 &ll_md_blocking_ast, 0);
415 ll_finish_md_op_data(op_data);
417 /* reason for keep own exit path - don`t flood log
418 * with messages with -ESTALE errors.
/* Open succeeded at the MDS but failed locally: drop the handle. */
420 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
421 it_open_error(DISP_OPEN_OPEN, itp))
423 ll_release_openhandle(de, itp);
427 if (it_disposition(itp, DISP_LOOKUP_NEG))
428 GOTO(out, rc = -ENOENT);
430 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
431 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
432 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the open reply and attach the lock. */
436 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
437 if (!rc && itp->d.lustre.it_lock_mode)
438 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
441 ptlrpc_req_finished(req);
442 ll_intent_drop_lock(itp);
/*
 * Populate an obd_client_handle from the MDS open reply carried by the
 * intent: copies the open handle, fid, lease lock cookie and open
 * flags, then registers the handle for open replay.  Returns the
 * md_set_open_replay_data() result.
 */
447 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
448 struct obd_client_handle *och)
450 struct ptlrpc_request *req = it->d.lustre.it_data;
451 struct mdt_body *body;
453 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454 och->och_fh = body->mbo_handle;
455 och->och_fid = body->mbo_fid1;
456 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
457 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
458 och->och_flags = it->it_flags;
460 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the local side of an open: optionally fill @och from the
 * intent reply, then install @fd as the file's private data and
 * initialise its readahead state, open mode and cl_io context fields.
 */
463 static int ll_local_open(struct file *file, struct lookup_intent *it,
464 struct ll_file_data *fd, struct obd_client_handle *och)
466 struct inode *inode = file->f_dentry->d_inode;
469 LASSERT(!LUSTRE_FPRIVATE(file));
476 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
481 LUSTRE_FPRIVATE(file) = fd;
482 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the effective open mode for close-time accounting. */
483 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
485 /* ll_cl_context initialize */
486 rwlock_init(&fd->fd_lock);
487 INIT_LIST_HEAD(&fd->fd_lccs);
492 /* Open a file, and (for the very first open) create objects on the OSTs at
493 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
494 * creation or open until ll_lov_setstripe() ioctl is called.
496 * If we already have the stripe MD locally then we don't request it in
497 * md_open(), by passing a lmm_size = 0.
499 * It is up to the application to ensure no other processes open this file
500 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
501 * used. We might be able to avoid races of that sort by getting lli_open_sem
502 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
503 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point.  Either reuses a cached per-mode MDS open
 * handle (lli_mds_{read,write,exec}_och) or performs a fresh
 * IT_OPEN intent via ll_intent_file_open(), then completes locally
 * with ll_local_open().  Error unwinding frees the handle slot and
 * the ll_file_data; an extra open reply is released if one was taken.
 */
505 int ll_file_open(struct inode *inode, struct file *file)
507 struct ll_inode_info *lli = ll_i2info(inode);
508 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
509 .it_flags = file->f_flags };
510 struct obd_client_handle **och_p = NULL;
511 __u64 *och_usecount = NULL;
512 struct ll_file_data *fd;
516 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
517 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path. */
519 it = file->private_data; /* XXX: compat macro */
520 file->private_data = NULL; /* prevent ll_local_open assertion */
522 fd = ll_file_data_get();
524 GOTO(out_openerr, rc = -ENOMEM);
527 if (S_ISDIR(inode->i_mode))
528 ll_authorize_statahead(inode, fd);
/* Root inode needs no MDS open; install fd and we are done. */
530 if (inode->i_sb->s_root == file->f_dentry) {
531 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN. */
535 if (!it || !it->d.lustre.it_disposition) {
536 /* Convert f_flags into access mode. We cannot use file->f_mode,
537 * because everything but O_ACCMODE mask was stripped from
539 if ((oit.it_flags + 1) & O_ACCMODE)
541 if (file->f_flags & O_TRUNC)
542 oit.it_flags |= FMODE_WRITE;
544 /* kernel only call f_op->open in dentry_open. filp_open calls
545 * dentry_open after call to open_namei that checks permissions.
546 * Only nfsd_open call dentry_open directly without checking
547 * permissions and because of that this code below is safe. */
548 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
549 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
551 /* We do not want O_EXCL here, presumably we opened the file
552 * already? XXX - NFS implications? */
553 oit.it_flags &= ~O_EXCL;
555 /* bug20584, if "it_flags" contains O_CREAT, the file will be
556 * created if necessary, then "IT_CREAT" should be set to keep
557 * consistent with it */
558 if (oit.it_flags & O_CREAT)
559 oit.it_op |= IT_CREAT;
565 /* Let's see if we have file open on MDS already. */
566 if (it->it_flags & FMODE_WRITE) {
567 och_p = &lli->lli_mds_write_och;
568 och_usecount = &lli->lli_open_fd_write_count;
569 } else if (it->it_flags & FMODE_EXEC) {
570 och_p = &lli->lli_mds_exec_och;
571 och_usecount = &lli->lli_open_fd_exec_count;
573 och_p = &lli->lli_mds_read_och;
574 och_usecount = &lli->lli_open_fd_read_count;
577 mutex_lock(&lli->lli_och_mutex);
578 if (*och_p) { /* Open handle is present */
579 if (it_disposition(it, DISP_OPEN_OPEN)) {
580 /* Well, there's extra open request that we do not need,
581 let's close it somehow. This will decref request. */
582 rc = it_open_error(DISP_OPEN_OPEN, it);
584 mutex_unlock(&lli->lli_och_mutex);
585 GOTO(out_openerr, rc);
588 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle; only local setup is needed. */
592 rc = ll_local_open(file, it, fd, NULL);
595 mutex_unlock(&lli->lli_och_mutex);
596 GOTO(out_openerr, rc);
599 LASSERT(*och_usecount == 0);
600 if (!it->d.lustre.it_disposition) {
601 /* We cannot just request lock handle now, new ELC code
602 means that one of other OPEN locks for this file
603 could be cancelled, and since blocking ast handler
604 would attempt to grab och_mutex as well, that would
605 result in a deadlock */
606 mutex_unlock(&lli->lli_och_mutex);
608 * Normally called under two situations:
610 * 2. A race/condition on MDS resulting in no open
611 * handle to be returned from LOOKUP|OPEN request,
612 * for example if the target entry was a symlink.
614 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
616 * Always specify MDS_OPEN_BY_FID because we don't want
617 * to get file with different fid.
619 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
620 rc = ll_intent_file_open(file, NULL, 0, it);
622 GOTO(out_openerr, rc);
626 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
628 GOTO(out_och_free, rc = -ENOMEM);
632 /* md_intent_lock() didn't get a request ref if there was an
633 * open error, so don't do cleanup on the request here
635 /* XXX (green): Should not we bail out on any error here, not
636 * just open error? */
637 rc = it_open_error(DISP_OPEN_OPEN, it);
639 GOTO(out_och_free, rc);
641 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
642 "inode %p: disposition %x, status %d\n", inode,
643 it_disposition(it, ~0), it->d.lustre.it_status);
645 rc = ll_local_open(file, it, fd, *och_p);
647 GOTO(out_och_free, rc);
649 mutex_unlock(&lli->lli_och_mutex);
652 /* Must do this outside lli_och_mutex lock to prevent deadlock where
653 different kind of OPEN lock for this same inode gets cancelled
654 by ldlm_cancel_lru */
655 if (!S_ISREG(inode->i_mode))
656 GOTO(out_och_free, rc);
660 cl_lov_delay_create_clear(&file->f_flags);
661 GOTO(out_och_free, rc);
/* Error unwinding: free the handle slot, undo statahead, free fd. */
665 if (och_p && *och_p) {
666 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
667 *och_p = NULL; /* OBD_FREE writes some magic there */
670 mutex_unlock(&lli->lli_och_mutex);
673 if (lli->lli_opendir_key == fd)
674 ll_deauthorize_statahead(inode, fd);
676 ll_file_data_put(fd);
678 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Release the open-reply reference the intent may still hold. */
681 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
682 ptlrpc_req_finished(it->d.lustre.it_data);
683 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lease
 * lock asynchronously (the lease is thereby broken).  Unlike
 * ll_md_blocking_ast() this callback does not touch the open handle —
 * see the comment at the ll_lease_open() enqueue site.
 */
689 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
690 struct ldlm_lock_desc *desc, void *data, int flag)
693 struct lustre_handle lockh;
697 case LDLM_CB_BLOCKING:
698 ldlm_lock2handle(lock, &lockh);
699 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
701 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
705 case LDLM_CB_CANCELING:
713 * Acquire a lease and open the file.
/*
 * Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE
 * only).  When @file is given, the existing open handle is passed as
 * op_handle so the MDT can recognise the same owner; this requires the
 * caller to be the sole opener and to hold no lease already.  Returns
 * the new obd_client_handle on success, ERR_PTR on failure; failure
 * paths cancel the open lock and close the handle.
 */
715 static struct obd_client_handle *
716 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
719 struct lookup_intent it = { .it_op = IT_OPEN };
720 struct ll_sb_info *sbi = ll_i2sbi(inode);
721 struct md_op_data *op_data;
722 struct ptlrpc_request *req = NULL;
723 struct lustre_handle old_handle = { 0 };
724 struct obd_client_handle *och = NULL;
/* Leases are strictly read XOR write. */
729 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
730 RETURN(ERR_PTR(-EINVAL));
733 struct ll_inode_info *lli = ll_i2info(inode);
734 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
735 struct obd_client_handle **och_p;
/* Requested mode must be covered by the file's open mode. */
738 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
739 RETURN(ERR_PTR(-EPERM));
741 /* Get the openhandle of the file */
743 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
744 if (fd->fd_lease_och != NULL) {
745 mutex_unlock(&lli->lli_och_mutex);
749 if (fd->fd_och == NULL) {
750 if (file->f_mode & FMODE_WRITE) {
751 LASSERT(lli->lli_mds_write_och != NULL);
752 och_p = &lli->lli_mds_write_och;
753 och_usecount = &lli->lli_open_fd_write_count;
755 LASSERT(lli->lli_mds_read_och != NULL);
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
759 if (*och_usecount == 1) {
766 mutex_unlock(&lli->lli_och_mutex);
767 if (rc < 0) /* more than 1 opener */
770 LASSERT(fd->fd_och != NULL);
771 old_handle = fd->fd_och->och_fh;
776 RETURN(ERR_PTR(-ENOMEM));
778 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
779 LUSTRE_OPC_ANY, NULL);
781 GOTO(out, rc = PTR_ERR(op_data));
783 /* To tell the MDT this openhandle is from the same owner */
784 op_data->op_handle = old_handle;
786 it.it_flags = fmode | open_flags;
787 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
788 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
789 &ll_md_blocking_lease_ast,
790 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
791 * it can be cancelled which may mislead applications that the lease is
793 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
794 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
795 * doesn't deal with openhandle, so normal openhandle will be leaked. */
796 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
797 ll_finish_md_op_data(op_data);
798 ptlrpc_req_finished(req);
800 GOTO(out_release_it, rc);
802 if (it_disposition(&it, DISP_LOOKUP_NEG))
803 GOTO(out_release_it, rc = -ENOENT);
805 rc = it_open_error(DISP_OPEN_OPEN, &it);
807 GOTO(out_release_it, rc);
809 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
810 ll_och_fill(sbi->ll_md_exp, &it, och);
/* An old server may grant the open but not the lease. */
812 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
813 GOTO(out_close, rc = -EOPNOTSUPP);
815 /* already get lease, handle lease lock */
816 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
817 if (it.d.lustre.it_lock_mode == 0 ||
818 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
819 /* open lock must return for lease */
820 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
821 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
822 it.d.lustre.it_lock_bits);
823 GOTO(out_close, rc = -EPROTO);
826 ll_intent_release(&it);
830 /* Cancel open lock */
831 if (it.d.lustre.it_lock_mode != 0) {
832 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
833 it.d.lustre.it_lock_mode);
834 it.d.lustre.it_lock_mode = 0;
835 och->och_lease_handle.cookie = 0ULL;
837 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
839 CERROR("%s: error closing file "DFID": %d\n",
840 ll_get_fsname(inode->i_sb, NULL, 0),
841 PFID(&ll_i2info(inode)->lli_fid), rc2);
842 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
844 ll_intent_release(&it);
852 * Release lease and close the file.
853 * It will check if the lease has ever broken.
/*
 * Release the lease held in @och on @inode: detect whether the lease
 * lock was already cancelled (lease broken), cancel it if not, report
 * the broken state via @lease_broken, and close the open handle.
 */
855 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
858 struct ldlm_lock *lock;
859 bool cancelled = true;
863 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Inspect the cancel flag under the resource lock. */
865 lock_res_and_lock(lock);
866 cancelled = ldlm_is_cancel(lock);
867 unlock_res_and_lock(lock);
871 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
872 PFID(&ll_i2info(inode)->lli_fid), cancelled);
875 ldlm_cli_cancel(&och->och_lease_handle, 0);
876 if (lease_broken != NULL)
877 *lease_broken = cancelled;
879 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/*
 * Merge MDS-provided timestamps (cached in lli) with the attributes
 * obtained from the OSTs via cl_object_attr_get(): each timestamp
 * keeps the most recent of the two sources, and size/blocks come from
 * the OSTs.  All updates happen under the inode size lock.
 */
884 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
886 struct ll_inode_info *lli = ll_i2info(inode);
887 struct cl_object *obj = lli->lli_clob;
888 struct cl_attr *attr = vvp_env_thread_attr(env);
896 ll_inode_size_lock(inode);
898 /* merge timestamps the most recently obtained from mds with
899 timestamps obtained from osts */
900 LTIME_S(inode->i_atime) = lli->lli_atime;
901 LTIME_S(inode->i_mtime) = lli->lli_mtime;
902 LTIME_S(inode->i_ctime) = lli->lli_ctime;
904 atime = LTIME_S(inode->i_atime);
905 mtime = LTIME_S(inode->i_mtime);
906 ctime = LTIME_S(inode->i_ctime);
908 cl_object_attr_lock(obj);
909 rc = cl_object_attr_get(env, obj, attr);
910 cl_object_attr_unlock(obj);
913 GOTO(out_size_unlock, rc);
/* Newer OST timestamps win over the MDS-cached ones. */
915 if (atime < attr->cat_atime)
916 atime = attr->cat_atime;
918 if (ctime < attr->cat_ctime)
919 ctime = attr->cat_ctime;
921 if (mtime < attr->cat_mtime)
922 mtime = attr->cat_mtime;
924 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
925 PFID(&lli->lli_fid), attr->cat_size);
927 i_size_write(inode, attr->cat_size);
928 inode->i_blocks = attr->cat_blocks;
930 LTIME_S(inode->i_atime) = atime;
931 LTIME_S(inode->i_mtime) = mtime;
932 LTIME_S(inode->i_ctime) = ctime;
935 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: the
 * O_NOATIME open flag, inode/superblock noatime flags, and mount
 * noatime/readonly/nodiratime options are all consulted.
 */
940 static bool file_is_noatime(const struct file *file)
942 const struct vfsmount *mnt = file->f_path.mnt;
943 const struct inode *inode = file->f_path.dentry->d_inode;
945 /* Adapted from file_accessed() and touch_atime().*/
946 if (file->f_flags & O_NOATIME)
949 if (inode->i_flags & S_NOATIME)
952 if (IS_NOATIME(inode))
955 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
958 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
961 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io for @file: nonblocking/append/sync flags are
 * derived from f_flags, the lock policy is chosen (never for nolock
 * files, mandatory for O_APPEND, otherwise "maybe"), and atime
 * suppression follows file_is_noatime().
 */
967 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
969 struct inode *inode = file->f_dentry->d_inode;
971 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
973 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
974 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
975 file->f_flags & O_DIRECT ||
978 io->ci_obj = ll_i2info(inode)->lli_clob;
979 io->ci_lockreq = CILR_MAYBE;
980 if (ll_file_nolock(file)) {
981 io->ci_lockreq = CILR_NEVER;
982 io->ci_no_srvlock = 1;
983 } else if (file->f_flags & O_APPEND) {
/* Appends must serialise against concurrent writers. */
984 io->ci_lockreq = CILR_MANDATORY;
987 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for all read/write paths (normal aio and splice).
 * Sets up the cl_io, takes the per-file range lock for writes and for
 * O_DIRECT reads (LU-6227), runs cl_io_loop(), restarts short IO when
 * ci_need_restart is set, and finally tallies byte counters and the
 * fd_write_failed state.  Returns bytes transferred or a negative rc.
 */
991 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
992 struct file *file, enum cl_io_type iot,
993 loff_t *ppos, size_t count)
995 struct vvp_io *vio = vvp_env_io(env);
996 struct inode *inode = file->f_dentry->d_inode;
997 struct ll_inode_info *lli = ll_i2info(inode);
998 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1002 struct range_lock range;
1006 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1007 file->f_dentry->d_name.name, iot, *ppos, count);
1010 io = vvp_env_thread_io(env);
1011 ll_io_init(io, file, iot == CIT_WRITE);
1013 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1014 bool range_locked = false;
/* O_APPEND writes can land anywhere, so lock to EOF. */
1016 if (file->f_flags & O_APPEND)
1017 range_lock_init(&range, 0, LUSTRE_EOF);
1019 range_lock_init(&range, *ppos, *ppos + count - 1);
1021 vio->vui_fd = LUSTRE_FPRIVATE(file);
1022 vio->vui_io_subtype = args->via_io_subtype;
1024 switch (vio->vui_io_subtype) {
1026 vio->vui_iov = args->u.normal.via_iov;
1027 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1028 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1029 vio->vui_iocb = args->u.normal.via_iocb;
1030 /* Direct IO reads must also take range lock,
1031 * or multiple reads will try to work on the same pages
1032 * See LU-6227 for details. */
1033 if (((iot == CIT_WRITE) ||
1034 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1035 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1036 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1038 rc = range_lock(&lli->lli_write_tree, &range);
1042 range_locked = true;
/* Serialise normal IO against truncate. */
1044 down_read(&lli->lli_trunc_sem);
1047 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1048 vio->u.splice.vui_flags = args->u.splice.via_flags;
1051 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1055 ll_cl_add(file, env, io);
1056 rc = cl_io_loop(env, io);
1057 ll_cl_remove(file, env);
1059 if (args->via_io_subtype == IO_NORMAL)
1060 up_read(&lli->lli_trunc_sem);
1062 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1064 range_unlock(&lli->lli_write_tree, &range);
1067 /* cl_io_rw_init() handled IO */
/* Accumulate partial progress and prepare a possible restart. */
1071 if (io->ci_nob > 0) {
1072 result += io->ci_nob;
1073 count -= io->ci_nob;
1074 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1076 /* prepare IO restart */
1077 if (count > 0 && args->via_io_subtype == IO_NORMAL) {
1078 args->u.normal.via_iov = vio->vui_iov;
1079 args->u.normal.via_nrsegs = vio->vui_tot_nrsegs;
1084 cl_io_fini(env, io);
1086 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1088 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1089 file->f_dentry->d_name.name,
1090 iot == CIT_READ ? "read" : "write",
1091 *ppos, count, result);
1095 if (iot == CIT_READ) {
1097 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1098 LPROC_LL_READ_BYTES, result);
1099 } else if (iot == CIT_WRITE) {
1101 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1102 LPROC_LL_WRITE_BYTES, result);
1103 fd->fd_write_failed = false;
/* Remember failed writes so close can report an error. */
1104 } else if (rc != -ERESTARTSYS) {
1105 fd->fd_write_failed = true;
1109 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1111 return result > 0 ? result : rc;
1115 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count: rejects
 * negative/overflowing lengths and trims the segment list at the first
 * user address that fails access_ok().  NOTE(review): the 'continue',
 * -EFAULT and break statements of the loop are elided in this excerpt.
 */
1117 static int ll_file_get_iov_count(const struct iovec *iov,
1118 unsigned long *nr_segs, size_t *count)
1123 for (seg = 0; seg < *nr_segs; seg++) {
1124 const struct iovec *iv = &iov[seg];
1127 * If any segment has a negative length, or the cumulative
1128 * length ever wraps negative then return -EINVAL.
1131 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1133 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1138 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio_read entry point: validates the iovec, copies it into a local
 * buffer (env-embedded for the single-segment case, heap-allocated
 * otherwise — cl_io may mutate it across restarts), then delegates to
 * ll_file_io_generic() with CIT_READ.
 */
1145 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1146 unsigned long nr_segs, loff_t pos)
1149 struct vvp_io_args *args;
1150 struct iovec *local_iov;
1156 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1160 env = cl_env_get(&refcheck);
1162 RETURN(PTR_ERR(env));
/* Single segment fits in the per-env scratch iovec. */
1165 local_iov = &ll_env_info(env)->lti_local_iov;
1168 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1169 if (local_iov == NULL) {
1170 cl_env_put(env, &refcheck);
1174 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1177 args = ll_env_args(env, IO_NORMAL);
1178 args->u.normal.via_iov = local_iov;
1179 args->u.normal.via_nrsegs = nr_segs;
1180 args->u.normal.via_iocb = iocb;
1182 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1183 &iocb->ki_pos, count);
1185 cl_env_put(env, &refcheck);
1188 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous read: wrap @buf in a single-segment iovec and a sync
 * kiocb, call ll_file_aio_read(), then propagate the updated position
 * back to *ppos.  ki_left/ki_nbytes is set per kernel version.
 */
1193 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1197 struct iovec iov = { .iov_base = buf, .iov_len = count };
1198 struct kiocb *kiocb;
1203 env = cl_env_get(&refcheck);
1205 RETURN(PTR_ERR(env));
1207 kiocb = &ll_env_info(env)->lti_kiocb;
1208 init_sync_kiocb(kiocb, file);
1209 kiocb->ki_pos = *ppos;
1210 #ifdef HAVE_KIOCB_KI_LEFT
1211 kiocb->ki_left = count;
1213 kiocb->ki_nbytes = count;
1216 result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
1217 *ppos = kiocb->ki_pos;
1219 cl_env_put(env, &refcheck);
1224 * Write to a file (through the page cache).
/*
 * aio_write entry point: mirror image of ll_file_aio_read() — validate
 * the iovec, copy it to a local buffer, and run the generic IO engine
 * with CIT_WRITE.
 */
1227 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1228 unsigned long nr_segs, loff_t pos)
1231 struct vvp_io_args *args;
1232 struct iovec *local_iov;
1238 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1242 env = cl_env_get(&refcheck);
1244 RETURN(PTR_ERR(env));
/* Single segment fits in the per-env scratch iovec. */
1247 local_iov = &ll_env_info(env)->lti_local_iov;
1250 OBD_ALLOC(local_iov, sizeof(*iov) * nr_segs);
1251 if (local_iov == NULL) {
1252 cl_env_put(env, &refcheck);
1256 memcpy(local_iov, iov, sizeof(*iov) * nr_segs);
1259 args = ll_env_args(env, IO_NORMAL);
1260 args->u.normal.via_iov = local_iov;
1261 args->u.normal.via_nrsegs = nr_segs;
1262 args->u.normal.via_iocb = iocb;
1264 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1265 &iocb->ki_pos, count);
1266 cl_env_put(env, &refcheck);
1269 OBD_FREE(local_iov, sizeof(*iov) * nr_segs);
/*
 * Synchronous write: wrap @buf in a single-segment iovec and a sync
 * kiocb, call ll_file_aio_write(), and copy the final position back
 * to *ppos.
 */
1274 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1275 size_t count, loff_t *ppos)
1278 struct iovec iov = { .iov_base = (void __user *)buf,
1280 struct kiocb *kiocb;
1285 env = cl_env_get(&refcheck);
1287 RETURN(PTR_ERR(env));
1289 kiocb = &ll_env_info(env)->lti_kiocb;
1290 init_sync_kiocb(kiocb, file);
1291 kiocb->ki_pos = *ppos;
1292 #ifdef HAVE_KIOCB_KI_LEFT
1293 kiocb->ki_left = count;
1295 kiocb->ki_nbytes = count;
1298 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1299 *ppos = kiocb->ki_pos;
1301 cl_env_put(env, &refcheck);
1306 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read handler: move file data (through the page cache) into a
 * pipe.  Uses the IO_SPLICE argument variant and submits a CIT_READ to
 * ll_file_io_generic().
 */
1308 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1309 struct pipe_inode_info *pipe, size_t count,
1313 struct vvp_io_args *args;
1318 env = cl_env_get(&refcheck);
1320 RETURN(PTR_ERR(env));
/* package the destination pipe and splice flags for the I/O engine */
1322 args = ll_env_args(env, IO_SPLICE);
1323 args->u.splice.via_pipe = pipe;
1324 args->u.splice.via_flags = flags;
1326 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1327 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) information to a file by re-opening it on the
 * MDS with the supplied lov_user_md.  The inode size lock serializes
 * against concurrent size/layout changes; the open handle obtained for
 * the operation is released before returning.
 */
1331 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1332 __u64 flags, struct lov_user_md *lum,
1335 struct lookup_intent oit = {
/* open by FID so the MDS resolves the target without a name lookup */
1337 .it_flags = flags | MDS_OPEN_BY_FID,
1342 ll_inode_size_lock(inode);
1343 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1345 GOTO(out_unlock, rc);
/* the open was only needed to carry the EA; close the handle again */
1347 ll_release_openhandle(file->f_dentry, &oit);
1350 ll_inode_size_unlock(inode);
1351 ll_intent_release(&oit);
/* layout may now be instantiated; clear the delayed-create flag */
1352 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV metadata (striping EA) of @filename from the MDS.
 * On success *lmmp points into the reply buffer of *request, so the
 * caller owns the request and must keep it until done with the data.
 * The EA is converted from on-wire little-endian to host order before
 * being handed back.
 */
1357 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1358 struct lov_mds_md **lmmp, int *lmm_size,
1359 struct ptlrpc_request **request)
1361 struct ll_sb_info *sbi = ll_i2sbi(inode);
1362 struct mdt_body *body;
1363 struct lov_mds_md *lmm = NULL;
1364 struct ptlrpc_request *req = NULL;
1365 struct md_op_data *op_data;
/* size the getattr reply buffer for the largest expected EA */
1368 rc = ll_get_default_mdsize(sbi, &lmmsize);
1372 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1373 strlen(filename), lmmsize,
1374 LUSTRE_OPC_ANY, NULL);
1375 if (IS_ERR(op_data))
1376 RETURN(PTR_ERR(op_data));
/* request both file (EASIZE) and directory (DIREA) layout data */
1378 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1379 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1380 ll_finish_md_op_data(op_data);
1382 CDEBUG(D_INFO, "md_getattr_name failed "
1383 "on %s: rc %d\n", filename, rc);
1387 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1388 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1390 lmmsize = body->mbo_eadatasize;
/* no layout EA present on the server -> nothing to return */
1392 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1394 GOTO(out, rc = -ENODATA);
1397 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1398 LASSERT(lmm != NULL);
/* only plain V1/V3 LOV layouts are understood here */
1400 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1401 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1402 GOTO(out, rc = -EPROTO);
1406 * This is coming from the MDS, so is probably in
1407 * little endian. We convert it to host endian before
1408 * passing it to userspace.
/* on big-endian hosts the EA must be byte-swapped */
1410 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1413 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1414 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1417 /* if function called for directory - we should
1418 * avoid swab not existent lsm objects */
1419 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1420 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* object array exists only for regular files, not dir defaults */
1421 if (S_ISREG(body->mbo_mode))
1422 lustre_swab_lov_user_md_objects(
1423 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1425 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1426 lustre_swab_lov_user_md_v3(
1427 (struct lov_user_md_v3 *)lmm);
1428 if (S_ISREG(body->mbo_mode))
1429 lustre_swab_lov_user_md_objects(
1430 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1437 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a raw lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Restricted to CAP_SYS_ADMIN because MDS_OPEN_HAS_OBJS lets the caller
 * name pre-existing objects.
 */
1442 static int ll_lov_setea(struct inode *inode, struct file *file,
1445 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1446 struct lov_user_md *lump;
/* fixed-size buffer: header plus exactly one OST object entry */
1447 int lum_size = sizeof(struct lov_user_md) +
1448 sizeof(struct lov_user_ost_data);
1452 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1455 OBD_ALLOC_LARGE(lump, lum_size);
1459 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1460 OBD_FREE_LARGE(lump, lum_size);
1464 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1466 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information to the userspace buffer @lum by
 * querying the cl_object layer (cl_object_getstripe fills in the user
 * buffer directly).
 */
1470 static int ll_file_getstripe(struct inode *inode,
1471 struct lov_user_md __user *lum)
1478 env = cl_env_get(&refcheck);
1480 RETURN(PTR_ERR(env));
1482 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1483 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a
 * kernel buffer, apply it to the file, then refresh the layout and
 * return the resulting stripe info back to the same user buffer.
 */
1487 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1490 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1491 struct lov_user_md *klum;
1493 __u64 flags = FMODE_WRITE;
/* ll_copy_user_md allocates klum; freed via OBD_FREE below */
1496 rc = ll_copy_user_md(lum, &klum);
1501 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
/* NOTE(review): return value of put_user appears unchecked here —
 * lines are elided in this excerpt, verify in full source */
1505 put_user(0, &lum->lmm_stripe_count);
1507 ll_layout_refresh(inode, &gen);
1508 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1511 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group
 * id @arg on the file.  Only one group lock per file descriptor is
 * allowed; fd_flags/fd_grouplock are protected by lli->lli_lock, and a
 * race between two lockers is resolved after the (sleeping)
 * cl_get_grouplock() call by re-checking under the spinlock.
 */
1516 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1518 struct ll_inode_info *lli = ll_i2info(inode);
1519 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1520 struct ll_grouplock grouplock;
/* gid 0 is reserved as "no group lock" */
1525 CWARN("group id for group lock must not be 0\n");
1529 if (ll_file_nolock(file))
1530 RETURN(-EOPNOTSUPP);
1532 spin_lock(&lli->lli_lock);
1533 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1534 CWARN("group lock already existed with gid %lu\n",
1535 fd->fd_grouplock.lg_gid);
1536 spin_unlock(&lli->lli_lock);
1539 LASSERT(fd->fd_grouplock.lg_lock == NULL);
/* must drop the spinlock: cl_get_grouplock() may sleep */
1540 spin_unlock(&lli->lli_lock);
1542 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1543 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
1547 spin_lock(&lli->lli_lock);
1548 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1549 spin_unlock(&lli->lli_lock);
1550 CERROR("another thread just won the race\n");
1551 cl_put_grouplock(&grouplock);
/* publish the new group lock on this file descriptor */
1555 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1556 fd->fd_grouplock = grouplock;
1557 spin_unlock(&lli->lli_lock);
1559 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with id @arg
 * previously taken on this file descriptor.  Fails if no group lock is
 * held or if the supplied id does not match the one recorded in
 * fd_grouplock.  State changes are made under lli->lli_lock; the actual
 * lock release happens after dropping the spinlock.
 */
1563 static int ll_put_grouplock(struct inode *inode, struct file *file,
1566 struct ll_inode_info *lli = ll_i2info(inode);
1567 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1568 struct ll_grouplock grouplock;
1571 spin_lock(&lli->lli_lock);
1572 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1573 spin_unlock(&lli->lli_lock);
1574 CWARN("no group lock held\n");
1578 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* the caller must name the same gid it locked with */
1580 if (fd->fd_grouplock.lg_gid != arg) {
1581 CWARN("group lock %lu doesn't match current id %lu\n",
1582 arg, fd->fd_grouplock.lg_gid);
1583 spin_unlock(&lli->lli_lock);
/* detach the lock from the fd before releasing it (may sleep) */
1587 grouplock = fd->fd_grouplock;
1588 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1589 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1590 spin_unlock(&lli->lli_lock);
1592 cl_put_grouplock(&grouplock);
1593 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1598 * Close inode open handle
1600 * \param dentry [in] dentry which contains the inode
1601 * \param it [in,out] intent which contains open info and result
1604 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent.  No-op for the
 * filesystem root and for intents that did not actually perform an
 * open.  Also drops the intent's enqueue reference on the open request
 * if one is held.
 */
1606 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1608 struct inode *inode = dentry->d_inode;
1609 struct obd_client_handle *och;
1615 /* Root ? Do nothing. */
1616 if (dentry->d_inode->i_sb->s_root == dentry)
1619 /* No open handle to close? Move away */
1620 if (!it_disposition(it, DISP_OPEN_OPEN))
1623 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1625 OBD_ALLOC(och, sizeof(*och));
1627 GOTO(out, rc = -ENOMEM);
/* transfer the open handle from the intent into och, then close it */
1629 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1631 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1634 /* this one is in place of ll_file_open */
1635 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1636 ptlrpc_req_finished(it->d.lustre.it_data);
1637 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1643 * Get size for inode for which FIEMAP mapping is requested.
1644 * Make the FIEMAP get_info call and returns the result.
1645 * \param fiemap kernel buffer to hold extens
1646 * \param num_bytes kernel buffer size
/*
 * Perform the FIEMAP extent-mapping request for @inode: validate the
 * flags, optionally flush dirty data (FIEMAP_FLAG_SYNC), make sure the
 * inode size is known, then forward the request to the cl_object layer.
 */
1648 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1654 struct ll_fiemap_info_key fmkey = { .name = KEY_FIEMAP, };
1657 /* Checks for fiemap flags */
1658 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support before failing */
1659 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1663 /* Check for FIEMAP_FLAG_SYNC */
1664 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1665 rc = filemap_fdatawrite(inode->i_mapping);
1670 env = cl_env_get(&refcheck);
1672 RETURN(PTR_ERR(env));
/* a zero cached size may just mean we never glimpsed the OSTs */
1674 if (i_size_read(inode) == 0) {
1675 rc = ll_glimpse_size(inode);
1680 fmkey.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1681 obdo_from_inode(&fmkey.oa, inode, OBD_MD_FLSIZE);
1682 obdo_set_parent_fid(&fmkey.oa, &ll_i2info(inode)->lli_fid);
1684 /* If filesize is 0, then there would be no objects for mapping */
1685 if (fmkey.oa.o_size == 0) {
1686 fiemap->fm_mapped_extents = 0;
1690 fmkey.fiemap = *fiemap;
1692 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1693 &fmkey, fiemap, &num_bytes);
1695 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * The caller-supplied getinfo_fid2path tells us how large a path buffer
 * to allocate (gf_pathlen); the filled structure is copied back to
 * userspace.  Access requires CAP_DAC_READ_SEARCH unless the mount
 * allows user fid2path.
 */
1699 int ll_fid2path(struct inode *inode, void __user *arg)
1701 struct obd_export *exp = ll_i2mdexp(inode);
1702 const struct getinfo_fid2path __user *gfin = arg;
1704 struct getinfo_fid2path *gfout;
1710 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1711 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1714 /* Only need to get the buflen */
1715 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation by the user-provided length */
1718 if (pathlen > PATH_MAX)
1721 outsize = sizeof(*gfout) + pathlen;
1722 OBD_ALLOC(gfout, outsize);
1726 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1727 GOTO(gf_free, rc = -EFAULT);
1729 /* Call mdc_iocontrol */
1730 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
/* return the header plus the resolved path to userspace */
1734 if (copy_to_user(arg, gfout, outsize))
1738 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size and allocate a kernel fiemap buffer
 * from the user's fm_extent_count (overflow-checked), copy in the
 * request (and the first extent, which carries continuation state),
 * run ll_do_fiemap(), and copy the mapped extents back out.
 */
1742 static int ll_ioctl_fiemap(struct inode *inode, struct fiemap __user *arg)
1744 struct fiemap *fiemap;
1750 /* Get the extent count so we can calculate the size of
1751 * required fiemap buffer */
1752 if (get_user(extent_count, &arg->fm_extent_count))
/* reject counts that would overflow the allocation size */
1756 (SIZE_MAX - sizeof(*fiemap)) / sizeof(struct ll_fiemap_extent))
1758 num_bytes = sizeof(*fiemap) + (extent_count *
1759 sizeof(struct ll_fiemap_extent));
1761 OBD_ALLOC_LARGE(fiemap, num_bytes);
1765 /* get the fiemap value */
1766 if (copy_from_user(fiemap, arg, sizeof(*fiemap)))
1767 GOTO(error, rc = -EFAULT);
1769 /* If fm_extent_count is non-zero, read the first extent since
1770 * it is used to calculate end_offset and device from previous
1772 if (extent_count != 0) {
1773 if (copy_from_user(&fiemap->fm_extents[0],
1774 (char __user *)arg + sizeof(*fiemap),
1775 sizeof(struct ll_fiemap_extent)))
1776 GOTO(error, rc = -EFAULT);
1779 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* copy back the header plus only the extents actually mapped */
1783 ret_bytes = sizeof(struct fiemap);
1785 if (extent_count != 0)
1786 ret_bytes += (fiemap->fm_mapped_extents *
1787 sizeof(struct ll_fiemap_extent));
1789 if (copy_to_user((void __user *)arg, fiemap, ret_bytes))
1793 OBD_FREE_LARGE(fiemap, num_bytes);
1798 * Read the data_version for inode.
1800 * This value is computed using stripe object version on OST.
1801 * Version is computed using server side locking.
1803 * @param flags if do sync on the OST side;
1805 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1806 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Delegates the version query to cl_object_data_version(); a file with
 * no cl_object (no data objects yet) reports version 0.
 */
1808 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1815 /* If no file object initialized, we consider its version is 0. */
1816 if (ll_i2info(inode)->lli_clob == NULL) {
1821 env = cl_env_get(&refcheck);
1823 RETURN(PTR_ERR(env));
1825 rc = cl_object_data_version(env, ll_i2info(inode)->lli_clob,
1826 data_version, flags);
1827 cl_env_put(env, &refcheck);
1832 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease on the file, flush and capture the
 * latest data_version and [am]time, then ask the MDS to release the
 * file's data objects (the lease handle travels with the close RPC so
 * the MDT can validate it).  The lease is closed on all exit paths.
 */
1834 int ll_hsm_release(struct inode *inode)
1836 struct cl_env_nest nest;
1838 struct obd_client_handle *och = NULL;
1839 __u64 data_version = 0;
1843 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1844 ll_get_fsname(inode->i_sb, NULL, 0),
1845 PFID(&ll_i2info(inode)->lli_fid));
/* exclusive write lease guarantees no other opener races the release */
1847 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1849 GOTO(out, rc = PTR_ERR(och));
1851 /* Grab latest data_version and [am]time values */
1852 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1856 env = cl_env_nested_get(&nest);
1858 GOTO(out, rc = PTR_ERR(env));
/* fold OST attributes into the inode before sending them to the MDT */
1860 ll_merge_attr(env, inode);
1861 cl_env_nested_put(&nest, env);
1863 /* Release the file.
1864 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1865 * we still need it to pack l_remote_handle to MDT. */
1866 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1872 if (och != NULL && !IS_ERR(och)) /* close the file */
1873 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * their saved [am]times (ia1/ia2, restored afterwards if requested)
 * and whether each side's data_version must be verified before the
 * swap is committed.
 */
1878 struct ll_swap_stack {
1879 struct iattr ia1, ia2;
1881 struct inode *inode1, *inode2;
1882 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.
 * Steps: permission and same-sb checks; order the pair by FID to avoid
 * lock ordering deadlocks; optionally take group locks (gid != 0) to
 * flush dirty caches; optionally verify each side's data_version has
 * not changed; send the swap to the MDT via obd_iocontrol; finally
 * restore mtime/atime on request.
 * NOTE(review): several lines (dv1/dv2 fields, error checks, returns)
 * are elided from this excerpt.
 */
1885 static int ll_swap_layouts(struct file *file1, struct file *file2,
1886 struct lustre_swap_layouts *lsl)
1888 struct mdc_swap_layouts msl;
1889 struct md_op_data *op_data;
1892 struct ll_swap_stack *llss = NULL;
1895 OBD_ALLOC_PTR(llss);
1899 llss->inode1 = file1->f_dentry->d_inode;
1900 llss->inode2 = file2->f_dentry->d_inode;
/* only regular files carry swappable layouts */
1902 if (!S_ISREG(llss->inode2->i_mode))
1903 GOTO(free, rc = -EINVAL);
1905 if (inode_permission(llss->inode1, MAY_WRITE) ||
1906 inode_permission(llss->inode2, MAY_WRITE))
1907 GOTO(free, rc = -EPERM);
1909 if (llss->inode2->i_sb != llss->inode1->i_sb)
1910 GOTO(free, rc = -EXDEV);
1912 /* we use 2 bool because it is easier to swap than 2 bits */
1913 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1914 llss->check_dv1 = true;
1916 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1917 llss->check_dv2 = true;
1919 /* we cannot use lsl->sl_dvX directly because we may swap them */
1920 llss->dv1 = lsl->sl_dv1;
1921 llss->dv2 = lsl->sl_dv2;
/* order the two files by FID so locks are always taken in one order */
1923 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1924 if (rc == 0) /* same file, done! */
1927 if (rc < 0) { /* sequentialize it */
1928 swap(llss->inode1, llss->inode2);
1930 swap(llss->dv1, llss->dv2);
1931 swap(llss->check_dv1, llss->check_dv2);
1935 if (gid != 0) { /* application asks to flush dirty cache */
1936 rc = ll_get_grouplock(llss->inode1, file1, gid);
1940 rc = ll_get_grouplock(llss->inode2, file2, gid);
1942 ll_put_grouplock(llss->inode1, file1, gid);
1947 /* to be able to restore mtime and atime after swap
1948 * we need to first save them */
1950 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1951 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1952 llss->ia1.ia_atime = llss->inode1->i_atime;
1953 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1954 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1955 llss->ia2.ia_atime = llss->inode2->i_atime;
1956 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1959 /* ultimate check, before swaping the layouts we check if
1960 * dataversion has changed (if requested) */
1961 if (llss->check_dv1) {
1962 rc = ll_data_version(llss->inode1, &dv, 0);
1965 if (dv != llss->dv1)
1966 GOTO(putgl, rc = -EAGAIN);
1969 if (llss->check_dv2) {
1970 rc = ll_data_version(llss->inode2, &dv, 0);
1973 if (dv != llss->dv2)
1974 GOTO(putgl, rc = -EAGAIN);
1977 /* struct md_op_data is used to send the swap args to the mdt
1978 * only flags is missing, so we use struct mdc_swap_layouts
1979 * through the md_op_data->op_data */
1980 /* flags from user space have to be converted before they are send to
1981 * server, no flag is sent today, they are only used on the client */
1984 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1985 0, LUSTRE_OPC_ANY, &msl);
1986 if (IS_ERR(op_data))
1987 GOTO(free, rc = PTR_ERR(op_data));
1989 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1990 sizeof(*op_data), op_data, NULL);
1991 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
1995 ll_put_grouplock(llss->inode2, file2, gid);
1996 ll_put_grouplock(llss->inode1, file1, gid);
1999 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2003 /* clear useless flags */
2004 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2005 llss->ia1.ia_valid &= ~ATTR_MTIME;
2006 llss->ia2.ia_valid &= ~ATTR_MTIME;
2009 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2010 llss->ia1.ia_valid &= ~ATTR_ATIME;
2011 llss->ia2.ia_valid &= ~ATTR_ATIME;
2014 /* update time if requested */
/* note: ia1/ia2 were saved pre-swap, so each applies to the *other*
 * inode after the exchange */
2016 if (llss->ia2.ia_valid != 0) {
2017 mutex_lock(&llss->inode1->i_mutex);
2018 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2019 mutex_unlock(&llss->inode1->i_mutex);
2022 if (llss->ia1.ia_valid != 0) {
2025 mutex_lock(&llss->inode2->i_mutex);
2026 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2027 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Validate and forward an HSM state-set request to the MDT.
 * Rejects masks outside HSM_FLAGS_MASK, restricts non-HSM_USER_MASK
 * bits to CAP_SYS_ADMIN, and bounds the archive id.
 */
2040 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2041 struct md_op_data *op_data;
2045 /* Detect out-of range masks */
2046 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2049 /* Non-root users are forbidden to set or clear flags which are
2050 * NOT defined in HSM_USER_MASK. */
2051 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2052 !cfs_capable(CFS_CAP_SYS_ADMIN))
2055 /* Detect out-of range archive id */
2056 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2057 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
/* the hss travels to the MDT inside md_op_data->op_data */
2060 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2061 LUSTRE_OPC_ANY, hss);
2062 if (IS_ERR(op_data))
2063 RETURN(PTR_ERR(op_data));
2065 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2066 sizeof(*op_data), op_data, NULL);
2068 ll_finish_md_op_data(op_data);
/*
 * HSM import: register a file that already exists in the archive.
 * Marks it ARCHIVED|EXISTS|RELEASED in the given archive, then restores
 * the user-supplied mode, ownership, size and timestamps via
 * ll_setattr_raw().  Only valid on regular files.
 */
2073 static int ll_hsm_import(struct inode *inode, struct file *file,
2074 struct hsm_user_import *hui)
2076 struct hsm_state_set *hss = NULL;
2077 struct iattr *attr = NULL;
2081 if (!S_ISREG(inode->i_mode))
2087 GOTO(out, rc = -ENOMEM);
/* set HSM state: file's data lives in the archive, not on OSTs */
2089 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2090 hss->hss_archive_id = hui->hui_archive_id;
2091 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2092 rc = ll_hsm_state_set(inode, hss);
2096 OBD_ALLOC_PTR(attr);
2098 GOTO(out, rc = -ENOMEM);
/* rebuild the inode attributes from the import description */
2100 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2101 attr->ia_mode |= S_IFREG;
2102 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2103 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2104 attr->ia_size = hui->hui_size;
2105 attr->ia_mtime.tv_sec = hui->hui_mtime;
2106 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2107 attr->ia_atime.tv_sec = hui->hui_atime;
2108 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE: apply even though the caller is not necessarily owner */
2110 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2111 ATTR_UID | ATTR_GID |
2112 ATTR_MTIME | ATTR_MTIME_SET |
2113 ATTR_ATIME | ATTR_ATIME_SET;
2115 mutex_lock(&inode->i_mutex);
2117 rc = ll_setattr_raw(file->f_dentry, attr, true);
2121 mutex_unlock(&inode->i_mutex);
/*
 * Translate a kernel fmode_t into the Lustre lease-type bitmask
 * reported to userspace: FMODE_READ -> LL_LEASE_RDLCK,
 * FMODE_WRITE -> LL_LEASE_WRLCK (both bits may be set).
 */
2133 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2135 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2136 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for Lustre regular files.  Handles the
 * LL_IOC_* / FSFILT_IOC_* / OBD_IOC_* family (striping, group locks,
 * FIEMAP, FID/path translation, data version, HSM, leases); anything
 * unrecognized is offered to registered llioc handlers and finally to
 * the data export via obd_iocontrol().
 * NOTE(review): many case bodies in this excerpt have elided lines
 * (allocations, error paths, break/RETURN statements).
 */
2140 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2142 struct inode *inode = file->f_dentry->d_inode;
2143 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2147 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2148 PFID(ll_inode2fid(inode)), inode, cmd);
2149 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2151 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2152 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2156 case LL_IOC_GETFLAGS:
2157 /* Get the current value of the file flags */
2158 return put_user(fd->fd_flags, (int __user *)arg);
2159 case LL_IOC_SETFLAGS:
2160 case LL_IOC_CLRFLAGS:
2161 /* Set or clear specific file flags */
2162 /* XXX This probably needs checks to ensure the flags are
2163 * not abused, and to handle any flag side effects.
2165 if (get_user(flags, (int __user *) arg))
2168 if (cmd == LL_IOC_SETFLAGS) {
/* lockless I/O is only safe when the page cache is bypassed */
2169 if ((flags & LL_FILE_IGNORE_LOCK) &&
2170 !(file->f_flags & O_DIRECT)) {
2171 CERROR("%s: unable to disable locking on "
2172 "non-O_DIRECT file\n", current->comm);
2176 fd->fd_flags |= flags;
2178 fd->fd_flags &= ~flags;
2181 case LL_IOC_LOV_SETSTRIPE:
2182 RETURN(ll_lov_setstripe(inode, file, arg));
2183 case LL_IOC_LOV_SETEA:
2184 RETURN(ll_lov_setea(inode, file, arg));
2185 case LL_IOC_LOV_SWAP_LAYOUTS: {
2187 struct lustre_swap_layouts lsl;
2189 if (copy_from_user(&lsl, (char __user *)arg,
2190 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2193 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2196 file2 = fget(lsl.sl_fd);
2201 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2202 rc = ll_swap_layouts(file, file2, &lsl);
2206 case LL_IOC_LOV_GETSTRIPE:
2207 RETURN(ll_file_getstripe(inode,
2208 (struct lov_user_md __user *)arg));
2209 case FSFILT_IOC_FIEMAP:
2210 RETURN(ll_ioctl_fiemap(inode, (struct fiemap __user *)arg));
2211 case FSFILT_IOC_GETFLAGS:
2212 case FSFILT_IOC_SETFLAGS:
2213 RETURN(ll_iocontrol(inode, file, cmd, arg));
2214 case FSFILT_IOC_GETVERSION_OLD:
2215 case FSFILT_IOC_GETVERSION:
2216 RETURN(put_user(inode->i_generation, (int __user *)arg));
2217 case LL_IOC_GROUP_LOCK:
2218 RETURN(ll_get_grouplock(inode, file, arg));
2219 case LL_IOC_GROUP_UNLOCK:
2220 RETURN(ll_put_grouplock(inode, file, arg));
2221 case IOC_OBD_STATFS:
2222 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2224 /* We need to special case any other ioctls we want to handle,
2225 * to send them to the MDS/OST as appropriate and to properly
2226 * network encode the arg field.
2227 case FSFILT_IOC_SETVERSION_OLD:
2228 case FSFILT_IOC_SETVERSION:
2230 case LL_IOC_FLUSHCTX:
2231 RETURN(ll_flush_ctx(inode));
2232 case LL_IOC_PATH2FID: {
2233 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2234 sizeof(struct lu_fid)))
2239 case LL_IOC_GETPARENT:
2240 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2242 case OBD_IOC_FID2PATH:
2243 RETURN(ll_fid2path(inode, (void __user *)arg));
2244 case LL_IOC_DATA_VERSION: {
2245 struct ioc_data_version idv;
2248 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* only the two defined flush flags are honoured */
2251 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2252 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2255 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2261 case LL_IOC_GET_MDTIDX: {
2264 mdtidx = ll_get_mdt_idx(inode);
2268 if (put_user((int)mdtidx, (int __user *)arg))
2273 case OBD_IOC_GETDTNAME:
2274 case OBD_IOC_GETMDNAME:
2275 RETURN(ll_get_obd_name(inode, cmd, arg));
2276 case LL_IOC_HSM_STATE_GET: {
2277 struct md_op_data *op_data;
2278 struct hsm_user_state *hus;
2285 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2286 LUSTRE_OPC_ANY, hus);
2287 if (IS_ERR(op_data)) {
2289 RETURN(PTR_ERR(op_data));
2292 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2295 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2298 ll_finish_md_op_data(op_data);
2302 case LL_IOC_HSM_STATE_SET: {
2303 struct hsm_state_set *hss;
2310 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2315 rc = ll_hsm_state_set(inode, hss);
2320 case LL_IOC_HSM_ACTION: {
2321 struct md_op_data *op_data;
2322 struct hsm_current_action *hca;
2329 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2330 LUSTRE_OPC_ANY, hca);
2331 if (IS_ERR(op_data)) {
2333 RETURN(PTR_ERR(op_data));
2336 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2339 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2342 ll_finish_md_op_data(op_data);
2346 case LL_IOC_SET_LEASE: {
2347 struct ll_inode_info *lli = ll_i2info(inode);
2348 struct obd_client_handle *och = NULL;
/* requested lease mode must not exceed the file's open mode */
2353 case LL_LEASE_WRLCK:
2354 if (!(file->f_mode & FMODE_WRITE))
2356 fmode = FMODE_WRITE;
2358 case LL_LEASE_RDLCK:
2359 if (!(file->f_mode & FMODE_READ))
2363 case LL_LEASE_UNLCK:
2364 mutex_lock(&lli->lli_och_mutex);
2365 if (fd->fd_lease_och != NULL) {
/* detach the lease handle before closing it */
2366 och = fd->fd_lease_och;
2367 fd->fd_lease_och = NULL;
2369 mutex_unlock(&lli->lli_och_mutex);
2374 fmode = och->och_flags;
2375 rc = ll_lease_close(och, inode, &lease_broken);
2382 RETURN(ll_lease_type_from_fmode(fmode));
2387 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2389 /* apply for lease */
2390 och = ll_lease_open(inode, file, fmode, 0);
2392 RETURN(PTR_ERR(och));
/* install the lease on the fd unless one already exists */
2395 mutex_lock(&lli->lli_och_mutex);
2396 if (fd->fd_lease_och == NULL) {
2397 fd->fd_lease_och = och;
2400 mutex_unlock(&lli->lli_och_mutex);
2402 /* impossible now that only excl is supported for now */
2403 ll_lease_close(och, inode, &lease_broken);
2408 case LL_IOC_GET_LEASE: {
2409 struct ll_inode_info *lli = ll_i2info(inode);
2410 struct ldlm_lock *lock = NULL;
2413 mutex_lock(&lli->lli_och_mutex);
2414 if (fd->fd_lease_och != NULL) {
2415 struct obd_client_handle *och = fd->fd_lease_och;
/* lease is only live if the DLM lock is not being cancelled */
2417 lock = ldlm_handle2lock(&och->och_lease_handle);
2419 lock_res_and_lock(lock);
2420 if (!ldlm_is_cancel(lock))
2421 fmode = och->och_flags;
2423 unlock_res_and_lock(lock);
2424 LDLM_LOCK_PUT(lock);
2427 mutex_unlock(&lli->lli_och_mutex);
2429 RETURN(ll_lease_type_from_fmode(fmode));
2431 case LL_IOC_HSM_IMPORT: {
2432 struct hsm_user_import *hui;
2438 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2443 rc = ll_hsm_import(inode, file, hui);
/* unknown cmd: try registered llioc handlers, then the OSC layer */
2453 ll_iocontrol_call(inode, file, cmd, arg, &err))
2456 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2457 (void __user *)arg));
2462 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Local fallback (for kernels without generic_file_llseek_size):
 * validate @offset against the sign convention and @maxsize, then
 * commit it to file->f_pos, resetting f_version on a position change.
 */
2463 static inline loff_t
2464 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2466 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2468 if (offset > maxsize)
2471 if (offset != file->f_pos) {
2472 file->f_pos = offset;
/* f_version caches readdir state; a seek invalidates it */
2473 file->f_version = 0;
/*
 * Local copy of the upstream generic_file_llseek_size() for kernels
 * that lack it: handle SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE
 * against the given @maxsize and @eof, serializing f_pos updates
 * with the inode mutex for SEEK_CUR.
 */
2479 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2480 loff_t maxsize, loff_t eof)
2482 struct inode *inode = file->f_dentry->d_inode;
2490 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2491 * position-querying operation. Avoid rewriting the "same"
2492 * f_pos value back to the file because a concurrent read(),
2493 * write() or lseek() might have altered it
2498 * f_lock protects against read/modify/write race with other
2499 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: read-modify-write of f_pos must be atomic */
2502 mutex_lock(&inode->i_mutex);
2503 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2504 mutex_unlock(&inode->i_mutex);
2508 * In the generic case the entire file is data, so as long as
2509 * offset isn't at the end of the file then the offset is data.
2516 * There is a virtual hole at the end of the file, so as long as
2517 * offset isn't i_size or larger, return i_size.
2525 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA the authoritative
 * file size must be fetched from the OSTs first (ll_glimpse_size),
 * then the common llseek-with-size helper does the work bounded by
 * the filesystem's maximum file size.
 */
2529 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2531 struct inode *inode = file->f_dentry->d_inode;
2532 loff_t retval, eof = 0;
/* compute tentative absolute target (for trace output only here) */
2535 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2536 (origin == SEEK_CUR) ? file->f_pos : 0);
2537 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2538 PFID(ll_inode2fid(inode)), inode, retval, retval,
2540 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* these origins depend on i_size, which requires a glimpse RPC */
2542 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2543 retval = ll_glimpse_size(inode);
2546 eof = i_size_read(inode);
2549 retval = ll_generic_file_llseek_size(file, offset, origin,
2550 ll_file_maxbytes(inode), eof);
/*
 * flush handler (called on close(2)): report any asynchronous
 * writeback errors recorded on the inode or its cl_object.  Errors
 * already delivered to the application (fd_write_failed) are not
 * reported a second time.  Returns -EIO on outstanding async error.
 */
2554 static int ll_flush(struct file *file, fl_owner_t id)
2556 struct inode *inode = file->f_dentry->d_inode;
2557 struct ll_inode_info *lli = ll_i2info(inode);
2558 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2561 LASSERT(!S_ISDIR(inode->i_mode));
2563 /* catch async errors that were recorded back when async writeback
2564 * failed for pages in this mapping. */
/* consume the recorded error so it is reported only once */
2565 rc = lli->lli_async_rc;
2566 lli->lli_async_rc = 0;
2567 if (lli->lli_clob != NULL) {
2568 err = lov_read_and_clear_async_rc(lli->lli_clob);
2573 /* The application has been told write failure already.
2574 * Do not report failure again. */
2575 if (fd->fd_write_failed)
2577 return rc ? -EIO : 0;
2581 * Called to make sure a portion of file has been written out.
2582 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2584 * Return how many pages have been written.
/*
 * Build and run a CIT_FSYNC cl_io over [start, end] of @inode.
 * @ignore_layout lets the sync proceed even during a layout change.
 * On success the number of pages written (fi_nr_written) is returned.
 */
2586 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2587 enum cl_fsync_mode mode, int ignore_layout)
2589 struct cl_env_nest nest;
2592 struct obd_capa *capa = NULL;
2593 struct cl_fsync_io *fio;
/* only the four defined fsync modes are accepted */
2597 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2598 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2601 env = cl_env_nested_get(&nest);
2603 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync request */
2605 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2607 io = vvp_env_thread_io(env);
2608 io->ci_obj = ll_i2info(inode)->lli_clob;
2609 io->ci_ignore_layout = ignore_layout;
2611 /* initialize parameters for sync */
2612 fio = &io->u.ci_fsync;
2613 fio->fi_capa = capa;
2614 fio->fi_start = start;
2616 fio->fi_fid = ll_inode2fid(inode);
2617 fio->fi_mode = mode;
2618 fio->fi_nr_written = 0;
2620 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2621 result = cl_io_loop(env, io);
/* cl_io_init failure is reported through ci_result */
2623 result = io->ci_result;
2625 result = fio->fi_nr_written;
2626 cl_io_fini(env, io);
2627 cl_env_nested_put(&nest, env);
2635 * When dentry is provided (the 'else' case), *file->f_dentry may be
2636 * null and dentry must be used directly rather than pulled from
2637 * *file->f_dentry as is done otherwise.
/*
 * fsync handler (signature varies with kernel version, selected by
 * autoconf HAVE_FILE_FSYNC_* macros).  Waits for in-flight page-cache
 * writeback, reports recorded async write errors, syncs metadata via
 * md_fsync() and file data via cl_sync_file_range(CL_FSYNC_ALL), and
 * tracks per-fd write failure state for ll_flush().
 */
2640 #ifdef HAVE_FILE_FSYNC_4ARGS
2641 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2643 struct dentry *dentry = file->f_dentry;
2644 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2645 int ll_fsync(struct file *file, int datasync)
2647 struct dentry *dentry = file->f_dentry;
2649 loff_t end = LLONG_MAX;
2651 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2654 loff_t end = LLONG_MAX;
2656 struct inode *inode = dentry->d_inode;
2657 struct ll_inode_info *lli = ll_i2info(inode);
2658 struct ptlrpc_request *req;
2659 struct obd_capa *oc;
2663 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2664 PFID(ll_inode2fid(inode)), inode);
2665 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2667 #ifdef HAVE_FILE_FSYNC_4ARGS
2668 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2669 mutex_lock(&inode->i_mutex);
2671 /* fsync's caller has already called _fdata{sync,write}, we want
2672 * that IO to finish before calling the osc and mdc sync methods */
2673 rc = filemap_fdatawait(inode->i_mapping);
2676 /* catch async errors that were recorded back when async writeback
2677 * failed for pages in this mapping. */
2678 if (!S_ISDIR(inode->i_mode)) {
2679 err = lli->lli_async_rc;
2680 lli->lli_async_rc = 0;
2683 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync metadata on the MDS (capability accompanies the RPC) */
2688 oc = ll_mdscapa_get(inode);
2689 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2695 ptlrpc_req_finished(req);
2697 if (S_ISREG(inode->i_mode)) {
2698 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* sync file data to the OSTs over the full requested range */
2700 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2701 if (rc == 0 && err < 0)
/* remember failure so ll_flush() won't report it twice */
2704 fd->fd_write_failed = true;
2706 fd->fd_write_failed = false;
2709 #ifdef HAVE_FILE_FSYNC_4ARGS
2710 mutex_unlock(&inode->i_mutex);
/* Handle fcntl()/flock() advisory locking: translate the VFS file_lock into an
 * LDLM_FLOCK enqueue sent to the MDS, then mirror the result into the local
 * lock tables (posix_lock_file_wait/flock_lock_file_wait).
 * NOTE(review): this listing is elided — return statements, some braces and
 * switch labels are not visible; comments describe only the visible code. */
2716 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2718 struct inode *inode = file->f_dentry->d_inode;
2719 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Enqueue descriptor: flock-type DLM lock, completion AST wired to the
 * flock-specific handler, file_lock passed back through ei_cbdata. */
2720 struct ldlm_enqueue_info einfo = {
2721 .ei_type = LDLM_FLOCK,
2722 .ei_cb_cp = ldlm_flock_completion_ast,
2723 .ei_cbdata = file_lock,
2725 struct md_op_data *op_data;
2726 struct lustre_handle lockh = {0};
2727 ldlm_policy_data_t flock = {{0}};
/* Remember the caller's lock type; einfo.ei_mode is written into
 * file_lock->fl_type below and restored before returning. */
2728 int fl_type = file_lock->fl_type;
2734 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2735 PFID(ll_inode2fid(inode)), file_lock);
2737 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2739 if (file_lock->fl_flags & FL_FLOCK) {
2740 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2741 /* flocks are whole-file locks */
2742 flock.l_flock.end = OFFSET_MAX;
2743 /* For flocks owner is determined by the local file descriptor*/
2744 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2745 } else if (file_lock->fl_flags & FL_POSIX) {
2746 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2747 flock.l_flock.start = file_lock->fl_start;
2748 flock.l_flock.end = file_lock->fl_end;
2752 flock.l_flock.pid = file_lock->fl_pid;
2754 /* Somewhat ugly workaround for svc lockd.
2755 * lockd installs custom fl_lmops->lm_compare_owner that checks
2756 * for the fl_owner to be the same (which it always is on local node
2757 * I guess between lockd processes) and then compares pid.
2758 * As such we assign pid to the owner field to make it all work,
2759 * conflict with normal locks is unlikely since pid space and
2760 * pointer space for current->files are not intersecting */
2761 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2762 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fcntl lock type to DLM mode (switch labels elided in this listing):
 * read lock -> LCK_PR, unlock -> LCK_NL, write lock -> LCK_PW. */
2766 einfo.ei_mode = LCK_PR;
2769 /* An unlock request may or may not have any relation to
2770 * existing locks so we may not be able to pass a lock handle
2771 * via a normal ldlm_lock_cancel() request. The request may even
2772 * unlock a byte range in the middle of an existing lock. In
2773 * order to process an unlock request we need all of the same
2774 * information that is given with a normal read or write record
2775 * lock request. To avoid creating another ldlm unlock (cancel)
2776 * message we'll treat a LCK_NL flock request as an unlock. */
2777 einfo.ei_mode = LCK_NL;
2780 einfo.ei_mode = LCK_PW;
2783 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map command to enqueue flags: non-blocking set -> BLOCK_NOWAIT,
 * F_GETLK-style query -> TEST_LOCK (command labels elided). */
2798 flags = LDLM_FL_BLOCK_NOWAIT;
2804 flags = LDLM_FL_TEST_LOCK;
2807 CERROR("unknown fcntl lock command: %d\n", cmd);
2811 /* Save the old mode so that if the mode in the lock changes we
2812 * can decrement the appropriate reader or writer refcount. */
2813 file_lock->fl_type = einfo.ei_mode;
2815 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2816 LUSTRE_OPC_ANY, NULL);
2817 if (IS_ERR(op_data))
2818 RETURN(PTR_ERR(op_data));
2820 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2821 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2822 flock.l_flock.pid, flags, einfo.ei_mode,
2823 flock.l_flock.start, flock.l_flock.end);
/* Send the flock enqueue to the MDS; lockh receives the granted handle. */
2825 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2828 /* Restore the file lock type if not TEST lock. */
2829 if (!(flags & LDLM_FL_TEST_LOCK))
2830 file_lock->fl_type = fl_type;
/* Mirror a successful grant (or any unlock) into the kernel's local
 * flock/posix lock bookkeeping so the VFS view stays consistent. */
2832 if ((file_lock->fl_flags & FL_FLOCK) &&
2833 (rc == 0 || file_lock->fl_type == F_UNLCK))
2834 rc2 = flock_lock_file_wait(file, file_lock);
2835 if ((file_lock->fl_flags & FL_POSIX) &&
2836 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2837 !(flags & LDLM_FL_TEST_LOCK))
2838 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed after the server granted the lock: release the
 * server-side lock again by enqueueing it as LCK_NL (i.e. an unlock). */
2840 if (rc2 && file_lock->fl_type != F_UNLCK) {
2841 einfo.ei_mode = LCK_NL;
2842 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2847 ll_finish_md_op_data(op_data);
/* Look up the FID of @name (length @namelen) under directory @parent by
 * issuing an MDS getattr-by-name RPC; on success *fid is filled from the
 * reply body. Returns 0 or a negative errno (error paths partly elided). */
2852 int ll_get_fid_by_name(struct inode *parent, const char *name,
2853 int namelen, struct lu_fid *fid)
2855 struct md_op_data *op_data = NULL;
2856 struct mdt_body *body;
2857 struct ptlrpc_request *req;
2861 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2862 LUSTRE_OPC_ANY, NULL);
2863 if (IS_ERR(op_data))
2864 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the server. */
2866 op_data->op_valid = OBD_MD_FLID;
2867 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2868 ll_finish_md_op_data(op_data);
2872 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* NULL body means a malformed reply (check elided in this listing). */
2874 GOTO(out_req, rc = -EFAULT);
2876 *fid = body->mbo_fid1;
2878 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx: resolve the
 * child's FID (via dcache or an MDS lookup), skip if it already lives on the
 * target MDT, then issue a rename-to-self with CLI_MIGRATE set.
 * NOTE(review): listing is elided; several error branches and the qstr
 * setup are only partially visible. */
2882 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2883 const char *name, int namelen)
2885 struct dentry *dchild = NULL;
2886 struct inode *child_inode = NULL;
2887 struct md_op_data *op_data;
2888 struct ptlrpc_request *request = NULL;
2893 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2894 name, PFID(ll_inode2fid(parent)), mdtidx);
2896 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2897 0, LUSTRE_OPC_ANY, NULL);
2898 if (IS_ERR(op_data))
2899 RETURN(PTR_ERR(op_data));
2901 /* Get child FID first */
2902 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry: if the child inode is in the dcache, pin it,
 * lock it for the duration of the migration and drop its aliases so stale
 * dentries don't survive the MDT move. */
2905 dchild = d_lookup(file->f_dentry, &qstr);
2906 if (dchild != NULL) {
2907 if (dchild->d_inode != NULL) {
2908 child_inode = igrab(dchild->d_inode);
2909 if (child_inode != NULL) {
2910 mutex_lock(&child_inode->i_mutex);
2911 op_data->op_fid3 = *ll_inode2fid(child_inode);
2912 ll_invalidate_aliases(child_inode);
/* Fall back to asking the MDS for the FID when not cached. */
2917 rc = ll_get_fid_by_name(parent, name, namelen,
2923 if (!fid_is_sane(&op_data->op_fid3)) {
2924 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2925 ll_get_fsname(parent->i_sb, NULL, 0), name,
2926 PFID(&op_data->op_fid3));
2927 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the child already resides on the target MDT. */
2930 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2935 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2936 PFID(&op_data->op_fid3), mdtidx);
2937 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
2940 op_data->op_mds = mdtidx;
2941 op_data->op_cli_flags = CLI_MIGRATE;
2942 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2943 namelen, name, namelen, &request);
2945 ll_update_times(request, parent);
2947 ptlrpc_req_finished(request);
/* The old inode is obsolete after migration: drop its link count so it is
 * discarded, then release the mutex/reference taken above. */
2952 if (child_inode != NULL) {
2953 clear_nlink(child_inode);
2954 mutex_unlock(&child_inode->i_mutex);
2958 ll_finish_md_op_data(op_data);
/* Lock handler installed for "-o noflock" mounts; presumably returns
 * -ENOSYS for every request (body not visible in this listing — confirm). */
2963 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2971 * test if some locks matching bits and l_req_mode are acquired
2972 * - bits can be in different locks
2973 * - if found clear the common lock bits in *bits
2974 * - the bits not found, are kept in *bits
2976 * \param bits [IN] searched lock bits
2977 * \param l_req_mode [IN] searched lock mode
2978 * \retval boolean, true iff all bits are found
2980 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2982 struct lustre_handle lockh;
2983 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four regular modes. */
2984 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2985 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2994 fid = &ll_i2info(inode)->lli_fid;
2995 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2996 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a match, never take a reference. */
2998 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit individually; a single cached lock may cover
 * several bits, so clear every bit the matched lock carries. */
2999 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3000 policy.l_inodebits.bits = *bits & (1 << i);
3001 if (policy.l_inodebits.bits == 0)
3004 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3005 &policy, mode, &lockh)) {
3006 struct ldlm_lock *lock;
3008 lock = ldlm_handle2lock(&lockh);
3011 ~(lock->l_policy_data.l_inodebits.bits);
3012 LDLM_LOCK_PUT(lock);
3014 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and reference, unless LDLM_FL_TEST_LOCK is in @flags) a
 * cached MDS inodebits lock covering @bits on @inode; returns the matched
 * mode (0 if none) with the handle in *lockh. */
3021 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3022 struct lustre_handle *lockh, __u64 flags,
3025 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3030 fid = &ll_i2info(inode)->lli_fid;
3031 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3033 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3034 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidate RPC result: -ENOENT on an already-unlinked
 * inode is downgraded (the nlink update is in an elided line); any other
 * non-zero rc is logged. Returns the (possibly rewritten) rc. */
3039 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3041 /* Already unlinked. Just update nlink and return success */
3042 if (rc == -ENOENT) {
3044 /* This path cannot be hit for regular files unless in
3045 * case of obscure races, so no need to to validate
3047 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3049 } else if (rc != 0) {
/* EACCES/EIDRM are expected under permission/identity churn: log them
 * quietly; everything else is a real error. */
3050 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3051 "%s: revalidate FID "DFID" error: rc = %d\n",
3052 ll_get_fsname(inode->i_sb, NULL, 0),
3053 PFID(ll_inode2fid(inode)), rc);
/* Refresh MDS attributes for @dentry's inode, covered by the inodebits in
 * @ibits. Two strategies: an intent lock (getattr-by-FID) when the server
 * supports OBD_CONNECT_ATTRFID, otherwise a plain md_getattr — but only if
 * no suitable MD lock is already cached (ll_have_md_lock).
 * NOTE(review): listing is elided; error branches between the visible
 * statements are not shown. */
3059 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3061 struct inode *inode = dentry->d_inode;
3062 struct ptlrpc_request *req = NULL;
3063 struct obd_export *exp;
3067 LASSERT(inode != NULL);
3069 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3070 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3072 exp = ll_i2mdexp(inode);
3074 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3075 * But under CMD case, it caused some lock issues, should be fixed
3076 * with new CMD ibits lock. See bug 12718 */
3077 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3078 struct lookup_intent oit = { .it_op = IT_GETATTR };
3079 struct md_op_data *op_data;
/* A pure LOOKUP revalidate only needs the cheaper IT_LOOKUP intent. */
3081 if (ibits == MDS_INODELOCK_LOOKUP)
3082 oit.it_op = IT_LOOKUP;
3084 /* Call getattr by fid, so do not provide name at all. */
3085 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3086 dentry->d_inode, NULL, 0, 0,
3087 LUSTRE_OPC_ANY, NULL);
3088 if (IS_ERR(op_data))
3089 RETURN(PTR_ERR(op_data));
3091 rc = md_intent_lock(exp, op_data, &oit, &req,
3092 &ll_md_blocking_ast, 0);
3093 ll_finish_md_op_data(op_data);
3095 rc = ll_inode_revalidate_fini(inode, rc);
3099 rc = ll_revalidate_it_finish(req, &oit, dentry);
3101 ll_intent_release(&oit);
3105 /* Unlinked? Unhash dentry, so it is not picked up later by
3106 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3107 here to preserve get_cwd functionality on 2.6.
3109 if (!dentry->d_inode->i_nlink)
3110 d_lustre_invalidate(dentry, 0);
3112 ll_lookup_finish_locks(&oit, dentry);
3113 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3114 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3115 u64 valid = OBD_MD_FLGETATTR;
3116 struct md_op_data *op_data;
/* Regular files also need striping (EA) data; size the reply buffer
 * for the default MD EA size. */
3119 if (S_ISREG(inode->i_mode)) {
3120 rc = ll_get_default_mdsize(sbi, &ealen);
3123 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3126 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3127 0, ealen, LUSTRE_OPC_ANY,
3129 if (IS_ERR(op_data))
3130 RETURN(PTR_ERR(op_data));
3132 op_data->op_valid = valid;
3133 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3134 * capa for this inode. Because we only keep capas of dirs
3136 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3137 ll_finish_md_op_data(op_data);
3139 rc = ll_inode_revalidate_fini(inode, rc);
/* Merge the fresh attributes from the reply into the inode. */
3143 rc = ll_prep_inode(&inode, req, NULL, NULL);
3146 ptlrpc_req_finished(req);
/* For a striped directory: merge per-stripe MD attributes (via the LMV
 * layer) into one cl_attr and apply nlink/blocks/size plus cached times to
 * the master inode. Requires lli_lsm_md to be set. */
3150 static int ll_merge_md_attr(struct inode *inode)
3152 struct cl_attr attr = { 0 };
3155 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3156 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3157 &attr, ll_md_blocking_ast);
3161 set_nlink(inode, attr.cat_nlink);
3162 inode->i_blocks = attr.cat_blocks;
3163 i_size_write(inode, attr.cat_size);
/* Times are staged in ll_inode_info; callers copy them into the inode. */
3165 ll_i2info(inode)->lli_atime = attr.cat_atime;
3166 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3167 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidate: refresh MDS attributes (__ll_inode_revalidate), then for
 * striped dirs merge stripe attrs, copy cached times into the inode, and
 * for regular files glimpse the size from the OSTs unless a restore is in
 * progress (where the MDT-provided size is already authoritative). */
3173 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3175 struct inode *inode = dentry->d_inode;
3179 rc = __ll_inode_revalidate(dentry, ibits);
3183 /* if object isn't regular file, don't validate size */
3184 if (!S_ISREG(inode->i_mode)) {
3185 if (S_ISDIR(inode->i_mode) &&
3186 ll_i2info(inode)->lli_lsm_md != NULL) {
3187 rc = ll_merge_md_attr(inode);
/* Publish the times staged in ll_inode_info by the revalidate/merge. */
3192 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3193 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3194 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3196 /* In case of restore, the MDT has the right size and has
3197 * already send it back without granting the layout lock,
3198 * inode is up-to-date so glimpse is useless.
3199 * Also to glimpse we need the layout, in case of a running
3200 * restore the MDT holds the layout lock so the glimpse will
3201 * block up to the end of restore (getattr will block)
3203 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3204 rc = ll_glimpse_size(inode);
/* VFS ->getattr: revalidate UPDATE|LOOKUP bits against the MDS, then fill
 * *stat from the (now fresh) inode fields. The inode number is remapped
 * through cl_fid_build_ino for 32-bit API consumers. */
3209 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3211 struct inode *inode = de->d_inode;
3212 struct ll_sb_info *sbi = ll_i2sbi(inode);
3213 struct ll_inode_info *lli = ll_i2info(inode);
3216 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3217 MDS_INODELOCK_LOOKUP);
3218 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3223 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits. */
3224 if (ll_need_32bit_api(sbi))
3225 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3227 stat->ino = inode->i_ino;
3228 stat->mode = inode->i_mode;
3229 stat->uid = inode->i_uid;
3230 stat->gid = inode->i_gid;
3231 stat->rdev = inode->i_rdev;
3232 stat->atime = inode->i_atime;
3233 stat->mtime = inode->i_mtime;
3234 stat->ctime = inode->i_ctime;
3235 stat->blksize = 1 << inode->i_blkbits;
3237 stat->nlink = inode->i_nlink;
3238 stat->size = i_size_read(inode);
3239 stat->blocks = inode->i_blocks;
/* VFS fiemap handler: marshal the kernel's fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer (header + extent array), run ll_do_fiemap, then copy
 * flags/mapped-extent data back to the caller's buffer. */
3244 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3245 __u64 start, __u64 len)
3249 struct ll_user_fiemap *fiemap;
3250 unsigned int extent_count = fieinfo->fi_extents_max;
/* NOTE(review): extent_count * sizeof(extent) is unchecked for overflow
 * here — presumably bounded by the caller; confirm upstream validation. */
3252 num_bytes = sizeof(*fiemap) + (extent_count *
3253 sizeof(struct ll_fiemap_extent));
3254 OBD_ALLOC_LARGE(fiemap, num_bytes);
3259 fiemap->fm_flags = fieinfo->fi_flags;
3260 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3261 fiemap->fm_start = start;
3262 fiemap->fm_length = len;
/* Seed only the first extent from userspace (used as a continuation key). */
3263 if (extent_count > 0)
3264 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3265 sizeof(struct ll_fiemap_extent));
3267 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3269 fieinfo->fi_flags = fiemap->fm_flags;
3270 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3271 if (extent_count > 0)
3272 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3273 fiemap->fm_mapped_extents *
3274 sizeof(struct ll_fiemap_extent));
3276 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL of @type for @inode.
 * The duplicate is taken under lli_lock; the VFS releases the reference. */
3280 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3282 struct ll_inode_info *lli = ll_i2info(inode);
3283 struct posix_acl *acl = NULL;
3286 spin_lock(&lli->lli_lock);
3287 /* VFS' acl_permission_check->check_acl will release the refcount */
3288 acl = posix_acl_dup(lli->lli_posix_acl);
3289 spin_unlock(&lli->lli_lock);
/* ACL checker plugged into generic_permission() on kernels where it takes
 * a check_acl callback. Signature varies with kernel API (4-arg adds
 * @flags). With CONFIG_FS_POSIX_ACL it consults the cached ACL; RCU-walk
 * requests bail out (return value elided) because taking lli_lock sleeps. */
3294 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3296 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3297 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3299 ll_check_acl(struct inode *inode, int mask)
3302 # ifdef CONFIG_FS_POSIX_ACL
3303 struct posix_acl *acl;
3307 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3308 if (flags & IPERM_FLAG_RCU)
3311 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3316 rc = posix_acl_permission(inode, acl, mask);
3317 posix_acl_release(acl);
3320 # else /* !CONFIG_FS_POSIX_ACL */
3322 # endif /* CONFIG_FS_POSIX_ACL */
3324 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* VFS ->permission: revalidate the root inode if needed, optionally squash
 * root credentials (root_squash) by overriding creds for the duration of the
 * check, then delegate to remote-perm or generic permission checking.
 * Signature varies with kernel API (flags / nameidata / 2-arg forms). */
3326 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3327 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3329 # ifdef HAVE_INODE_PERMISION_2ARGS
3330 int ll_inode_permission(struct inode *inode, int mask)
3332 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3337 struct ll_sb_info *sbi;
3338 struct root_squash_info *squash;
3339 struct cred *cred = NULL;
3340 const struct cred *old_cred = NULL;
3342 bool squash_id = false;
/* RCU-walk cannot block; the revalidate below may issue RPCs, so punt
 * back to ref-walk (return elided in this listing). */
3345 #ifdef MAY_NOT_BLOCK
3346 if (mask & MAY_NOT_BLOCK)
3348 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3349 if (flags & IPERM_FLAG_RCU)
3353 /* as root inode are NOT getting validated in lookup operation,
3354 * need to do it before permission check. */
3356 if (inode == inode->i_sb->s_root->d_inode) {
3357 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3358 MDS_INODELOCK_LOOKUP);
3363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3364 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3366 /* squash fsuid/fsgid if needed */
3367 sbi = ll_i2sbi(inode);
3368 squash = &sbi->ll_squash;
/* Squash applies only when configured (rsi_uid != 0), the caller is
 * root, and the client is not exempted via nosquash. */
3369 if (unlikely(squash->rsi_uid != 0 &&
3370 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3371 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3375 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3376 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3377 squash->rsi_uid, squash->rsi_gid);
3379 /* update current process's credentials
3380 * and FS capability */
3381 cred = prepare_creds();
3385 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3386 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3387 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3388 if ((1 << cap) & CFS_CAP_FS_MASK)
3389 cap_lower(cred->cap_effective, cap);
3391 old_cred = override_creds(cred);
3394 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3396 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3397 rc = lustre_check_remote_perm(inode, mask);
3399 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3401 /* restore current process's credentials and FS capability */
3403 revert_creds(old_cred);
/* Default file_operations: no .flock/.lock entries, so flock() falls back to
 * the kernel's locally-consistent locking ("-o localflock" semantics).
 * NOTE(review): trailing entries (.fsync etc.) and the closing brace are
 * not visible in this listing. */
3410 /* -o localflock - only provides locally consistent flock locks */
3411 struct file_operations ll_file_operations = {
3412 .read = ll_file_read,
3413 .aio_read = ll_file_aio_read,
3414 .write = ll_file_write,
3415 .aio_write = ll_file_aio_write,
3416 .unlocked_ioctl = ll_file_ioctl,
3417 .open = ll_file_open,
3418 .release = ll_file_release,
3419 .mmap = ll_file_mmap,
3420 .llseek = ll_file_seek,
3421 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: identical to the default table but
 * routes both flock() and fcntl() locking through ll_file_flock for
 * cluster-wide consistency. */
3426 struct file_operations ll_file_operations_flock = {
3427 .read = ll_file_read,
3428 .aio_read = ll_file_aio_read,
3429 .write = ll_file_write,
3430 .aio_write = ll_file_aio_write,
3431 .unlocked_ioctl = ll_file_ioctl,
3432 .open = ll_file_open,
3433 .release = ll_file_release,
3434 .mmap = ll_file_mmap,
3435 .llseek = ll_file_seek,
3436 .splice_read = ll_file_splice_read,
3439 .flock = ll_file_flock,
3440 .lock = ll_file_flock
/* file_operations for "-o noflock": locking entry points are stubbed with
 * ll_file_noflock so applications get an explicit failure instead of
 * silently inconsistent locks. */
3443 /* These are for -o noflock - to return ENOSYS on flock calls */
3444 struct file_operations ll_file_operations_noflock = {
3445 .read = ll_file_read,
3446 .aio_read = ll_file_aio_read,
3447 .write = ll_file_write,
3448 .aio_write = ll_file_aio_write,
3449 .unlocked_ioctl = ll_file_ioctl,
3450 .open = ll_file_open,
3451 .release = ll_file_release,
3452 .mmap = ll_file_mmap,
3453 .llseek = ll_file_seek,
3454 .splice_read = ll_file_splice_read,
3457 .flock = ll_file_noflock,
3458 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, permission, xattr and
 * fiemap entry points; .get_acl only on kernels that have the hook. */
3461 struct inode_operations ll_file_inode_operations = {
3462 .setattr = ll_setattr,
3463 .getattr = ll_getattr,
3464 .permission = ll_inode_permission,
3465 .setxattr = ll_setxattr,
3466 .getxattr = ll_getxattr,
3467 .listxattr = ll_listxattr,
3468 .removexattr = ll_removexattr,
3469 .fiemap = ll_fiemap,
3470 #ifdef HAVE_IOP_GET_ACL
3471 .get_acl = ll_get_acl,
/* dynamic ioctl number support routines */
/* Registry of dynamically-registered ioctl handlers: a rwsem-protected list
 * of llioc_data entries, each carrying a callback and the ioctl numbers it
 * serves. NOTE(review): iocd_cmd[0] is the pre-C99 trailing-array idiom —
 * a C99 flexible array member (iocd_cmd[]) would be preferable. */
3476 static struct llioc_ctl_data {
3477 struct rw_semaphore ioc_sem;
3478 struct list_head ioc_head;
3480 __RWSEM_INITIALIZER(llioc.ioc_sem),
3481 LIST_HEAD_INIT(llioc.ioc_head)
3486 struct list_head iocd_list;
3487 unsigned int iocd_size;
3488 llioc_callback_t iocd_cb;
3489 unsigned int iocd_count;
3490 unsigned int iocd_cmd[0];
/* Register callback @cb for @count ioctl numbers in @cmd; returns an opaque
 * handle (the allocated entry, used as the "magic" for unregister) or NULL
 * on bad arguments / allocation failure (return lines elided). */
3493 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3496 struct llioc_data *in_data = NULL;
3499 if (cb == NULL || cmd == NULL ||
3500 count > LLIOC_MAX_CMD || count < 0)
3503 size = sizeof(*in_data) + count * sizeof(unsigned int);
3504 OBD_ALLOC(in_data, size);
3505 if (in_data == NULL)
/* NOTE(review): OBD_ALLOC presumably zero-fills, which would make this
 * memset redundant — confirm against the allocator macro. */
3508 memset(in_data, 0, sizeof(*in_data));
3509 in_data->iocd_size = size;
3510 in_data->iocd_cb = cb;
3511 in_data->iocd_count = count;
3512 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3514 down_write(&llioc.ioc_sem);
3515 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3516 up_write(&llioc.ioc_sem);
/* Unregister a handler previously returned by ll_iocontrol_register: find
 * the matching entry (match test elided), unlink and free it. Warns if the
 * magic cookie is unknown. */
3521 void ll_iocontrol_unregister(void *magic)
3523 struct llioc_data *tmp;
3528 down_write(&llioc.ioc_sem);
3529 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3531 unsigned int size = tmp->iocd_size;
3533 list_del(&tmp->iocd_list);
/* Drop the rwsem before freeing; the entry is already unlinked. */
3534 up_write(&llioc.ioc_sem);
3536 OBD_FREE(tmp, size);
3540 up_write(&llioc.ioc_sem);
3542 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3545 EXPORT_SYMBOL(ll_iocontrol_register);
3546 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch ioctl @cmd through the registered dynamic handlers, first match
 * wins (handler may return LLIOC_STOP to end iteration). The handler's rc
 * is propagated through *rcp (store elided in this listing). */
3548 static enum llioc_iter
3549 ll_iocontrol_call(struct inode *inode, struct file *file,
3550 unsigned int cmd, unsigned long arg, int *rcp)
3552 enum llioc_iter ret = LLIOC_CONT;
3553 struct llioc_data *data;
3554 int rc = -EINVAL, i;
3556 down_read(&llioc.ioc_sem);
3557 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3558 for (i = 0; i < data->iocd_count; i++) {
3559 if (cmd != data->iocd_cmd[i])
3562 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3566 if (ret == LLIOC_STOP)
3569 up_read(&llioc.ioc_sem);
/* Apply a layout configuration to the inode's cl_object. For OBJECT_CONF_SET
 * the new layout is installed and the layout DLM lock is then allowed to
 * match; the cached layout generation is refreshed from the object. */
3576 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3578 struct ll_inode_info *lli = ll_i2info(inode);
3579 struct cl_object *obj = lli->lli_clob;
3580 struct cl_env_nest nest;
3588 env = cl_env_nested_get(&nest);
3590 RETURN(PTR_ERR(env));
3592 rc = cl_conf_set(env, lli->lli_clob, conf);
3596 if (conf->coc_opc == OBJECT_CONF_SET) {
3597 struct ldlm_lock *lock = conf->coc_lock;
3598 struct cl_layout cl = {
3602 LASSERT(lock != NULL);
3603 LASSERT(ldlm_has_layout(lock));
3605 /* it can only be allowed to match after layout is
3606 * applied to inode otherwise false layout would be
3607 * seen. Applying layout should happen before dropping
3608 * the intent lock. */
3609 ldlm_lock_allow_match(lock);
/* Read back the generation and record the version change. */
3611 rc = cl_object_layout_get(env, obj, &cl);
3616 DFID": layout version change: %u -> %u\n",
3617 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3619 ll_layout_version_set(lli, cl.cl_layout_gen);
3623 cl_env_nested_put(&nest, env);
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* When a layout lock was granted via completion AST its LVB may be missing
 * or stale: fetch the LOV EA with a getxattr RPC and install it as the
 * lock's l_lvb_data (replacing any previous buffer under the res lock). */
3629 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3632 struct ll_sb_info *sbi = ll_i2sbi(inode);
3633 struct obd_capa *oc;
3634 struct ptlrpc_request *req;
3635 struct mdt_body *body;
3642 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3643 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3644 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already present and marked ready: nothing to fetch. */
3646 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3649 /* if layout lock was granted right away, the layout is returned
3650 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3651 * blocked and then granted via completion ast, we have to fetch
3652 * layout here. Please note that we can't use the LVB buffer in
3653 * completion AST because it doesn't have a large enough buffer */
3654 oc = ll_mdscapa_get(inode);
3655 rc = ll_get_default_mdsize(sbi, &lmmsize);
3657 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3658 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3664 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3666 GOTO(out, rc = -EPROTO);
3668 lmmsize = body->mbo_eadatasize;
3669 if (lmmsize == 0) /* empty layout */
3672 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3674 GOTO(out, rc = -EFAULT);
/* Copy the EA into a fresh buffer and swap it in as the lock's LVB;
 * any previous LVB buffer is freed under the resource lock. */
3676 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3677 if (lvbdata == NULL)
3678 GOTO(out, rc = -ENOMEM);
3680 memcpy(lvbdata, lmm, lmmsize);
3681 lock_res_and_lock(lock);
3682 if (lock->l_lvb_data != NULL)
3683 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3685 lock->l_lvb_data = lvbdata;
3686 lock->l_lvb_len = lmmsize;
3687 unlock_res_and_lock(lock);
3692 ptlrpc_req_finished(req);
3697 * Apply the layout to the inode. Layout lock is held and will be released
3700 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3701 struct inode *inode)
3703 struct ll_inode_info *lli = ll_i2info(inode);
3704 struct ll_sb_info *sbi = ll_i2sbi(inode);
3705 struct ldlm_lock *lock;
3706 struct lustre_md md = { NULL };
3707 struct cl_object_conf conf;
3710 bool wait_layout = false;
3713 LASSERT(lustre_handle_is_used(lockh));
3715 lock = ldlm_handle2lock(lockh);
3716 LASSERT(lock != NULL);
3717 LASSERT(ldlm_has_layout(lock));
3719 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3720 PFID(&lli->lli_fid), inode);
3722 /* in case this is a caching lock and reinstate with new inode */
3723 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3725 lock_res_and_lock(lock);
3726 lvb_ready = ldlm_is_lvb_ready(lock);
3727 unlock_res_and_lock(lock);
3728 /* checking lvb_ready is racy but this is okay. The worst case is
3729 * that multi processes may configure the file on the same time. */
/* Ensure the layout EA is present in the lock's LVB before unpacking. */
3734 rc = ll_layout_fetch(inode, lock);
3738 /* for layout lock, lmm is returned in lock's lvb.
3739 * lvb_data is immutable if the lock is held so it's safe to access it
3740 * without res lock. See the description in ldlm_lock_decref_internal()
3741 * for the condition to free lvb_data of layout lock */
3742 if (lock->l_lvb_data != NULL) {
3743 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3744 lock->l_lvb_data, lock->l_lvb_len);
3746 CERROR("%s: file "DFID" unpackmd error: %d\n",
3747 ll_get_fsname(inode->i_sb, NULL, 0),
3748 PFID(&lli->lli_fid), rc);
3752 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3753 lock->l_lvb_data, lock->l_lvb_len);
3758 /* set layout to file. Unlikely this will fail as old layout was
3759 * surely eliminated */
3760 memset(&conf, 0, sizeof conf);
3761 conf.coc_opc = OBJECT_CONF_SET;
3762 conf.coc_inode = inode;
3763 conf.coc_lock = lock;
3764 conf.u.coc_md = &md;
3765 rc = ll_layout_conf(inode, &conf);
/* The unpacked striping md is no longer needed once applied. */
3768 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3770 /* refresh layout failed, need to wait */
3771 wait_layout = rc == -EBUSY;
3775 LDLM_LOCK_PUT(lock);
3776 ldlm_lock_decref(lockh, mode);
3778 /* wait for IO to complete if it's still being used. */
3780 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3781 ll_get_fsname(inode->i_sb, NULL, 0),
3782 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO against the old layout
 * drains, after which the caller may retry the reconfiguration. */
3784 memset(&conf, 0, sizeof conf);
3785 conf.coc_opc = OBJECT_CONF_WAIT;
3786 conf.coc_inode = inode;
3787 rc = ll_layout_conf(inode, &conf);
3791 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3792 ll_get_fsname(inode->i_sb, NULL, 0),
3793 PFID(&lli->lli_fid), rc);
/* Body of ll_layout_refresh, called with lli_layout_mutex held: try to
 * match a cached layout lock first; otherwise enqueue an IT_LAYOUT intent
 * to the MDS and apply the returned layout via ll_layout_lock_set.
 * NOTE(review): the retry loop / mode==0 handling is elided here. */
3798 static int ll_layout_refresh_locked(struct inode *inode)
3800 struct ll_inode_info *lli = ll_i2info(inode);
3801 struct ll_sb_info *sbi = ll_i2sbi(inode);
3802 struct md_op_data *op_data;
3803 struct lookup_intent it;
3804 struct lustre_handle lockh;
3806 struct ldlm_enqueue_info einfo = {
3807 .ei_type = LDLM_IBITS,
3809 .ei_cb_bl = &ll_md_blocking_ast,
3810 .ei_cb_cp = &ldlm_completion_ast,
3816 /* mostly layout lock is caching on the local side, so try to match
3817 * it before grabbing layout lock mutex. */
3818 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3819 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3820 if (mode != 0) { /* hit cached lock */
3821 rc = ll_layout_lock_set(&lockh, mode, inode);
3828 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3829 0, 0, LUSTRE_OPC_ANY, NULL);
3830 if (IS_ERR(op_data))
3831 RETURN(PTR_ERR(op_data));
3833 /* have to enqueue one */
3834 memset(&it, 0, sizeof(it));
3835 it.it_op = IT_LAYOUT;
3836 lockh.cookie = 0ULL;
3838 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3839 ll_get_fsname(inode->i_sb, NULL, 0),
3840 PFID(&lli->lli_fid), inode);
3842 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* The intent carries a request reference; release it right away. */
3843 if (it.d.lustre.it_data != NULL)
3844 ptlrpc_req_finished(it.d.lustre.it_data);
3845 it.d.lustre.it_data = NULL;
3847 ll_finish_md_op_data(op_data);
/* Take ownership of the granted mode out of the intent before dropping
 * the intent's own lock reference. */
3849 mode = it.d.lustre.it_lock_mode;
3850 it.d.lustre.it_lock_mode = 0;
3851 ll_intent_drop_lock(&it);
3854 /* set lock data in case this is a new lock */
3855 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3856 rc = ll_layout_lock_set(&lockh, mode, inode);
3865 * This function checks if there exists a LAYOUT lock on the client side,
3866 * or enqueues it if it doesn't have one in cache.
3868 * This function will not hold layout lock so it may be revoked any time after
3869 * this function returns. Any operations depend on layout should be redone
3872 * This function should be called before lov_io_init() to get an uptodate
3873 * layout version, the caller should save the version number and after IO
3874 * is finished, this function should be called again to verify that layout
3875 * is not changed during IO time.
3877 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3879 struct ll_inode_info *lli = ll_i2info(inode);
3880 struct ll_sb_info *sbi = ll_i2sbi(inode);
3884 *gen = ll_layout_version_get(lli);
/* Fast path: layout locking disabled, or a generation already cached. */
3885 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3889 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3890 LASSERT(S_ISREG(inode->i_mode));
3892 /* take layout lock mutex to enqueue layout lock exclusively. */
3893 mutex_lock(&lli->lli_layout_mutex);
3895 rc = ll_layout_refresh_locked(inode);
/* Report the generation produced by the refresh back to the caller. */
3899 *gen = ll_layout_version_get(lli);
3901 mutex_unlock(&lli->lli_layout_mutex);
3907 * This function send a restore request to the MDT
3909 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3911 struct hsm_user_request *hur;
3915 len = sizeof(struct hsm_user_request) +
3916 sizeof(struct hsm_user_item);
3917 OBD_ALLOC(hur, len);
3921 hur->hur_request.hr_action = HUA_RESTORE;
3922 hur->hur_request.hr_archive_id = 0;
3923 hur->hur_request.hr_flags = 0;
3924 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3925 sizeof(hur->hur_user_item[0].hui_fid));
3926 hur->hur_user_item[0].hui_extent.offset = offset;
3927 hur->hur_user_item[0].hui_extent.length = length;
3928 hur->hur_request.hr_itemcount = 1;
3929 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,