4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <linux/user_namespace.h>
49 #ifdef HAVE_UIDGID_HEADER
50 # include <linux/uidgid.h>
52 #include <lustre/ll_fiemap.h>
53 #include <lustre_ioctl.h>
55 #include "cl_object.h"
57 #include "llite_internal.h"
58 #include "vvp_internal.h"
/* Forward declarations for static helpers defined later in this file. */
61 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
63 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
66 static enum llioc_iter
67 ll_iocontrol_call(struct inode *inode, struct file *file,
68 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from its dedicated slab cache.
 * GFP_NOFS prevents memory reclaim from re-entering the filesystem.
 * NOTE(review): the NULL-allocation check and return statement are not
 * visible in this excerpt — confirm against the full source.
 */
70 static struct ll_file_data *ll_file_data_get(void)
72 struct ll_file_data *fd;
74 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start each open with a clean write-error state. */
78 fd->fd_write_failed = false;
/* Return a ll_file_data obtained from ll_file_data_get() to the slab. */
83 static void ll_file_data_put(struct ll_file_data *fd)
86 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (fid, mode, times, size, blocks,
 * flags) and the MDS open handle @fh into @op_data so they can be sent
 * to the MDS, e.g. on close.
 */
89 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
90 struct lustre_handle *fh)
92 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
93 op_data->op_attr.ia_mode = inode->i_mode;
94 op_data->op_attr.ia_atime = inode->i_atime;
95 op_data->op_attr.ia_mtime = inode->i_mtime;
96 op_data->op_attr.ia_ctime = inode->i_ctime;
97 op_data->op_attr.ia_size = i_size_read(inode);
98 op_data->op_attr_blocks = inode->i_blocks;
/* Translate kernel inode flags into the on-wire ext-style flags. */
99 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
101 op_data->op_handle = *fh;
102 op_data->op_capa1 = ll_mdscapa_get(inode);
/* If data was modified locally, ask the MDS to note it on this RPC. */
104 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
105 op_data->op_bias |= MDS_DATA_MODIFIED;
109 * Packs all the attributes into @op_data for the CLOSE rpc.
/*
 * Mark which attributes are valid for the close request; size/blocks
 * are only sent from write handles, since read handles cannot have
 * dirtied them.
 */
111 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
112 struct obd_client_handle *och)
116 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
117 ATTR_MTIME | ATTR_MTIME_SET |
118 ATTR_CTIME | ATTR_CTIME_SET;
/* Only write handles carry authoritative size/blocks to the MDS. */
120 if (!(och->och_flags & FMODE_WRITE))
123 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for an open handle @och on @inode.
 * If @data_version is non-NULL this is an HSM release-on-close: the
 * data version and lease handle are packed so the MDT can verify the
 * file was not modified before releasing it.
 */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
146 * XXX: in case of LMV, is this correct to access
149 CERROR("Invalid MDC connection handle "LPX64"\n",
150 ll_i2mdexp(inode)->exp_handle.h_cookie);
154 OBD_ALLOC_PTR(op_data);
156 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
158 ll_prepare_close(inode, op_data, och);
159 if (data_version != NULL) {
160 /* Pass in data_version implies release. */
161 op_data->op_bias |= MDS_HSM_RELEASE;
162 op_data->op_data_version = *data_version;
163 op_data->op_lease_handle = och->och_lease_handle;
164 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
167 rc = md_close(md_exp, op_data, och->och_mod, &req);
169 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
170 ll_i2mdexp(inode)->exp_obd->obd_name,
171 PFID(ll_inode2fid(inode)), rc);
174 /* DATA_MODIFIED flag was successfully sent on close, cancel data
175 * modification flag. */
176 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
177 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock. */
179 spin_lock(&lli->lli_lock);
180 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
181 spin_unlock(&lli->lli_lock);
/* For HSM release, check whether the MDT actually released the file. */
184 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
191 ll_finish_md_op_data(op_data);
/* Handle is dead after close; poison it against reuse. */
195 md_clear_open_replay_data(md_exp, och);
196 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
199 if (req) /* This is close request */
200 ptlrpc_req_finished(req);
/*
 * Close the per-mode (read/write/exec) MDS open handle on @inode if no
 * local users of that mode remain.  Called when the matching OPEN lock
 * is gone or on last close.
 */
204 int ll_md_real_close(struct inode *inode, fmode_t fmode)
206 struct ll_inode_info *lli = ll_i2info(inode);
207 struct obd_client_handle **och_p;
208 struct obd_client_handle *och;
/* Select the handle slot and use count matching the open mode. */
213 if (fmode & FMODE_WRITE) {
214 och_p = &lli->lli_mds_write_och;
215 och_usecount = &lli->lli_open_fd_write_count;
216 } else if (fmode & FMODE_EXEC) {
217 och_p = &lli->lli_mds_exec_och;
218 och_usecount = &lli->lli_open_fd_exec_count;
220 LASSERT(fmode & FMODE_READ);
221 och_p = &lli->lli_mds_read_och;
222 och_usecount = &lli->lli_open_fd_read_count;
225 mutex_lock(&lli->lli_och_mutex);
226 if (*och_usecount > 0) {
227 /* There are still users of this handle, so skip
229 mutex_unlock(&lli->lli_och_mutex);
235 mutex_unlock(&lli->lli_och_mutex);
238 /* There might be a race and this handle may already
240 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop group lock and lease if held, release
 * any fd-private open handle, decrement the per-mode open counts, and
 * — unless a cached OPEN lock lets us skip it — talk to the MDS via
 * ll_md_real_close().  Finally frees the ll_file_data.
 */
247 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251 struct ll_inode_info *lli = ll_i2info(inode);
255 /* clear group lock, if present */
256 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
257 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
259 if (fd->fd_lease_och != NULL) {
262 /* Usually the lease is not released when the
263 * application crashed, we need to release here. */
264 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
265 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
266 PFID(&lli->lli_fid), rc, lease_broken);
268 fd->fd_lease_och = NULL;
/* A handle privately owned by this fd (e.g. taken for a lease). */
271 if (fd->fd_och != NULL) {
272 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
277 /* Let's see if we have good enough OPEN lock on the file and if
278 we can skip talking to MDS */
279 if (file->f_dentry->d_inode) { /* Can this ever be false? */
281 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
282 struct lustre_handle lockh;
283 struct inode *inode = file->f_dentry->d_inode;
284 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's reference on the per-mode open count. */
286 mutex_lock(&lli->lli_och_mutex);
287 if (fd->fd_omode & FMODE_WRITE) {
289 LASSERT(lli->lli_open_fd_write_count);
290 lli->lli_open_fd_write_count--;
291 } else if (fd->fd_omode & FMODE_EXEC) {
293 LASSERT(lli->lli_open_fd_exec_count);
294 lli->lli_open_fd_exec_count--;
297 LASSERT(lli->lli_open_fd_read_count);
298 lli->lli_open_fd_read_count--;
300 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock: the MDS close cannot be deferred. */
302 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
303 LDLM_IBITS, &policy, lockmode,
305 rc = ll_md_real_close(file->f_dentry->d_inode,
309 CERROR("released file has negative dentry: file = %p, "
310 "dentry = %p, name = %s\n",
311 file, file->f_dentry, file->f_dentry->d_name.name);
315 LUSTRE_FPRIVATE(file) = NULL;
316 ll_file_data_put(fd);
317 ll_capa_close(inode);
322 /* While this returns an error code, fput() the caller does not, so we need
323 * to make every effort to clean up all of our state here. Also, applications
324 * rarely check close errors and even if an error is returned they will not
325 * re-try the close call.
327 int ll_file_release(struct inode *inode, struct file *file)
329 struct ll_file_data *fd;
330 struct ll_sb_info *sbi = ll_i2sbi(inode);
331 struct ll_inode_info *lli = ll_i2info(inode);
335 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
336 PFID(ll_inode2fid(inode)), inode);
/* Remote-client ACL bookkeeping is only relevant on the root inode. */
338 #ifdef CONFIG_FS_POSIX_ACL
339 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
340 inode == inode->i_sb->s_root->d_inode) {
341 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
344 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
345 fd->fd_flags &= ~LL_FILE_RMTACL;
346 rct_del(&sbi->ll_rct, current_pid());
347 et_search_free(&sbi->ll_et, current_pid());
/* Don't count releases of the root dentry in the stats. */
352 if (inode->i_sb->s_root != file->f_dentry)
353 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
354 fd = LUSTRE_FPRIVATE(file);
357 /* The last ref on @file, maybe not the owner pid of statahead,
358 * because parent and child process can share the same file handle. */
359 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
360 ll_deauthorize_statahead(inode, fd);
/* Root inode never opened an MDS handle; just free the fd state. */
362 if (inode->i_sb->s_root == file->f_dentry) {
363 LUSTRE_FPRIVATE(file) = NULL;
364 ll_file_data_put(fd);
/* For regular files, surface any async write errors on close. */
368 if (!S_ISDIR(inode->i_mode)) {
369 if (lli->lli_clob != NULL)
370 lov_read_and_clear_async_rc(lli->lli_clob);
371 lli->lli_async_rc = 0;
374 rc = ll_md_close(sbi->ll_md_exp, inode, file);
376 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
377 libcfs_debug_dumplog();
/*
 * Enqueue an open intent to the MDS for @file, optionally packing the
 * striping layout (@lmm/@lmmsize) for setstripe-style opens.  On
 * success the intent carries the open disposition and (possibly) a
 * lock; the inode is (re)initialized from the reply.
 */
382 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
383 struct lookup_intent *itp)
385 struct dentry *de = file->f_dentry;
386 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
387 struct dentry *parent = de->d_parent;
388 const char *name = NULL;
390 struct md_op_data *op_data;
391 struct ptlrpc_request *req = NULL;
395 LASSERT(parent != NULL);
396 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
398 /* if server supports open-by-fid, or file name is invalid, don't pack
399 * name in open request */
400 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
401 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
402 name = de->d_name.name;
403 len = de->d_name.len;
406 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
407 name, len, 0, LUSTRE_OPC_ANY, NULL);
409 RETURN(PTR_ERR(op_data));
410 op_data->op_data = lmm;
411 op_data->op_data_size = lmmsize;
413 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
414 &ll_md_blocking_ast, 0);
415 ll_finish_md_op_data(op_data);
417 /* reason for keep own exit path - don't flood log
418 * with messages with -ESTALE errors.
/* If the open succeeded on the server but we are erroring out locally,
 * the server-side open handle must be released. */
420 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
421 it_open_error(DISP_OPEN_OPEN, itp))
423 ll_release_openhandle(de, itp);
427 if (it_disposition(itp, DISP_LOOKUP_NEG))
428 GOTO(out, rc = -ENOENT);
430 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
431 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
432 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the reply and attach the lock, if granted. */
436 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
437 if (!rc && itp->d.lustre.it_lock_mode)
438 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
441 ptlrpc_req_finished(req);
442 ll_intent_drop_lock(itp);
/*
 * Fill an obd_client_handle from the MDS open reply carried by @it and
 * register it for open replay in case of MDS recovery.
 */
447 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
448 struct obd_client_handle *och)
450 struct ptlrpc_request *req = it->d.lustre.it_data;
451 struct mdt_body *body;
453 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454 och->och_fh = body->mbo_handle;
455 och->och_fid = body->mbo_fid1;
456 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
457 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
458 och->och_flags = it->it_flags;
460 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-local part of an open: optionally fill @och from
 * the intent, then attach @fd as the file's private data and
 * initialize its readahead and cl_io context state.
 */
463 static int ll_local_open(struct file *file, struct lookup_intent *it,
464 struct ll_file_data *fd, struct obd_client_handle *och)
466 struct inode *inode = file->f_dentry->d_inode;
469 LASSERT(!LUSTRE_FPRIVATE(file));
476 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
481 LUSTRE_FPRIVATE(file) = fd;
482 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the access mode this fd was opened with. */
483 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
485 /* ll_cl_context initialize */
486 rwlock_init(&fd->fd_lock);
487 INIT_LIST_HEAD(&fd->fd_lccs);
492 /* Open a file, and (for the very first open) create objects on the OSTs at
493 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
494 * creation or open until ll_lov_setstripe() ioctl is called.
496 * If we already have the stripe MD locally then we don't request it in
497 * md_open(), by passing a lmm_size = 0.
499 * It is up to the application to ensure no other processes open this file
500 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
501 * used. We might be able to avoid races of that sort by getting lli_open_sem
502 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
503 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
505 int ll_file_open(struct inode *inode, struct file *file)
507 struct ll_inode_info *lli = ll_i2info(inode);
508 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
509 .it_flags = file->f_flags };
510 struct obd_client_handle **och_p = NULL;
511 __u64 *och_usecount = NULL;
512 struct ll_file_data *fd;
516 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
517 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent stashed by the lookup path, if any. */
519 it = file->private_data; /* XXX: compat macro */
520 file->private_data = NULL; /* prevent ll_local_open assertion */
522 fd = ll_file_data_get();
524 GOTO(out_openerr, rc = -ENOMEM);
527 if (S_ISDIR(inode->i_mode))
528 ll_authorize_statahead(inode, fd);
/* The root inode is opened without an MDS handle. */
530 if (inode->i_sb->s_root == file->f_dentry) {
531 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent below. */
535 if (!it || !it->d.lustre.it_disposition) {
536 /* Convert f_flags into access mode. We cannot use file->f_mode,
537 * because everything but O_ACCMODE mask was stripped from
539 if ((oit.it_flags + 1) & O_ACCMODE)
541 if (file->f_flags & O_TRUNC)
542 oit.it_flags |= FMODE_WRITE;
544 /* kernel only call f_op->open in dentry_open. filp_open calls
545 * dentry_open after call to open_namei that checks permissions.
546 * Only nfsd_open call dentry_open directly without checking
547 * permissions and because of that this code below is safe. */
548 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
549 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
551 /* We do not want O_EXCL here, presumably we opened the file
552 * already? XXX - NFS implications? */
553 oit.it_flags &= ~O_EXCL;
555 /* bug20584, if "it_flags" contains O_CREAT, the file will be
556 * created if necessary, then "IT_CREAT" should be set to keep
557 * consistent with it */
558 if (oit.it_flags & O_CREAT)
559 oit.it_op |= IT_CREAT;
565 /* Let's see if we have file open on MDS already. */
566 if (it->it_flags & FMODE_WRITE) {
567 och_p = &lli->lli_mds_write_och;
568 och_usecount = &lli->lli_open_fd_write_count;
569 } else if (it->it_flags & FMODE_EXEC) {
570 och_p = &lli->lli_mds_exec_och;
571 och_usecount = &lli->lli_open_fd_exec_count;
573 och_p = &lli->lli_mds_read_och;
574 och_usecount = &lli->lli_open_fd_read_count;
577 mutex_lock(&lli->lli_och_mutex);
578 if (*och_p) { /* Open handle is present */
579 if (it_disposition(it, DISP_OPEN_OPEN)) {
580 /* Well, there's extra open request that we do not need,
581 let's close it somehow. This will decref request. */
582 rc = it_open_error(DISP_OPEN_OPEN, it);
584 mutex_unlock(&lli->lli_och_mutex);
585 GOTO(out_openerr, rc);
588 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing handle; no och needed for this fd. */
592 rc = ll_local_open(file, it, fd, NULL);
595 mutex_unlock(&lli->lli_och_mutex);
596 GOTO(out_openerr, rc);
599 LASSERT(*och_usecount == 0);
600 if (!it->d.lustre.it_disposition) {
601 /* We cannot just request lock handle now, new ELC code
602 means that one of other OPEN locks for this file
603 could be cancelled, and since blocking ast handler
604 would attempt to grab och_mutex as well, that would
605 result in a deadlock */
606 mutex_unlock(&lli->lli_och_mutex);
608 * Normally called under two situations:
610 * 2. A race/condition on MDS resulting in no open
611 * handle to be returned from LOOKUP|OPEN request,
612 * for example if the target entry was a symlink.
614 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
616 * Always specify MDS_OPEN_BY_FID because we don't want
617 * to get file with different fid.
619 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
620 rc = ll_intent_file_open(file, NULL, 0, it);
622 GOTO(out_openerr, rc);
626 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
628 GOTO(out_och_free, rc = -ENOMEM);
632 /* md_intent_lock() didn't get a request ref if there was an
633 * open error, so don't do cleanup on the request here
635 /* XXX (green): Should not we bail out on any error here, not
636 * just open error? */
637 rc = it_open_error(DISP_OPEN_OPEN, it);
639 GOTO(out_och_free, rc);
641 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
642 "inode %p: disposition %x, status %d\n", inode,
643 it_disposition(it, ~0), it->d.lustre.it_status);
645 rc = ll_local_open(file, it, fd, *och_p);
647 GOTO(out_och_free, rc);
649 mutex_unlock(&lli->lli_och_mutex);
652 /* Must do this outside lli_och_mutex lock to prevent deadlock where
653 different kind of OPEN lock for this same inode gets cancelled
654 by ldlm_cancel_lru */
655 if (!S_ISREG(inode->i_mode))
656 GOTO(out_och_free, rc);
660 cl_lov_delay_create_clear(&file->f_flags);
661 GOTO(out_och_free, rc);
/* Error/exit paths: free an unused och, undo statahead authorization,
 * release fd state and drop the extra intent request reference. */
665 if (och_p && *och_p) {
666 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
667 *och_p = NULL; /* OBD_FREE writes some magic there */
670 mutex_unlock(&lli->lli_och_mutex);
673 if (lli->lli_opendir_key == fd)
674 ll_deauthorize_statahead(inode, fd);
676 ll_file_data_put(fd);
678 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
681 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
682 ptlrpc_req_finished(it->d.lustre.it_data);
683 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: when another client conflicts, cancel
 * our lease lock asynchronously.  Unlike ll_md_blocking_ast() it does
 * not touch the open handle on cancel.
 */
689 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
690 struct ldlm_lock_desc *desc, void *data, int flag)
693 struct lustre_handle lockh;
697 case LDLM_CB_BLOCKING:
698 ldlm_lock2handle(lock, &lockh);
699 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
701 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
705 case LDLM_CB_CANCELING:
713 * Acquire a lease and open the file.
/*
 * Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE
 * only).  If @file is given, the lease is tied to that descriptor's
 * existing open handle so the MDT recognizes the same owner.
 * Returns the new obd_client_handle or an ERR_PTR.
 */
715 static struct obd_client_handle *
716 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
719 struct lookup_intent it = { .it_op = IT_OPEN };
720 struct ll_sb_info *sbi = ll_i2sbi(inode);
721 struct md_op_data *op_data;
722 struct ptlrpc_request *req = NULL;
723 struct lustre_handle old_handle = { 0 };
724 struct obd_client_handle *och = NULL;
/* Leases are only defined for plain read or plain write access. */
729 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
730 RETURN(ERR_PTR(-EINVAL));
733 struct ll_inode_info *lli = ll_i2info(inode);
734 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
735 struct obd_client_handle **och_p;
/* Lease mode must be covered by the fd's own mode; no exec files. */
738 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
739 RETURN(ERR_PTR(-EPERM));
741 /* Get the openhandle of the file */
743 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per file descriptor. */
744 if (fd->fd_lease_och != NULL) {
745 mutex_unlock(&lli->lli_och_mutex);
749 if (fd->fd_och == NULL) {
750 if (file->f_mode & FMODE_WRITE) {
751 LASSERT(lli->lli_mds_write_och != NULL);
752 och_p = &lli->lli_mds_write_och;
753 och_usecount = &lli->lli_open_fd_write_count;
755 LASSERT(lli->lli_mds_read_och != NULL);
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
/* Can only privatize the handle if this fd is its sole user. */
759 if (*och_usecount == 1) {
766 mutex_unlock(&lli->lli_och_mutex);
767 if (rc < 0) /* more than 1 opener */
770 LASSERT(fd->fd_och != NULL);
771 old_handle = fd->fd_och->och_fh;
776 RETURN(ERR_PTR(-ENOMEM));
778 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
779 LUSTRE_OPC_ANY, NULL);
781 GOTO(out, rc = PTR_ERR(op_data));
783 /* To tell the MDT this openhandle is from the same owner */
784 op_data->op_handle = old_handle;
786 it.it_flags = fmode | open_flags;
787 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
788 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
789 &ll_md_blocking_lease_ast,
790 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
791 * it can be cancelled which may mislead applications that the lease is
793 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
794 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
795 * doesn't deal with openhandle, so normal openhandle will be leaked. */
796 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
797 ll_finish_md_op_data(op_data);
798 ptlrpc_req_finished(req);
800 GOTO(out_release_it, rc);
802 if (it_disposition(&it, DISP_LOOKUP_NEG))
803 GOTO(out_release_it, rc = -ENOENT);
805 rc = it_open_error(DISP_OPEN_OPEN, &it);
807 GOTO(out_release_it, rc);
809 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
810 ll_och_fill(sbi->ll_md_exp, &it, och);
812 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
813 GOTO(out_close, rc = -EOPNOTSUPP);
815 /* already get lease, handle lease lock */
816 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
817 if (it.d.lustre.it_lock_mode == 0 ||
818 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
819 /* open lock must return for lease */
820 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
821 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
822 it.d.lustre.it_lock_bits);
823 GOTO(out_close, rc = -EPROTO);
826 ll_intent_release(&it);
830 /* Cancel open lock */
831 if (it.d.lustre.it_lock_mode != 0) {
832 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
833 it.d.lustre.it_lock_mode);
834 it.d.lustre.it_lock_mode = 0;
835 och->och_lease_handle.cookie = 0ULL;
837 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
839 CERROR("%s: error closing file "DFID": %d\n",
840 ll_get_fsname(inode->i_sb, NULL, 0),
841 PFID(&ll_i2info(inode)->lli_fid), rc2);
842 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
844 ll_intent_release(&it);
852 * Release lease and close the file.
853 * It will check if the lease has ever broken.
855 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
858 struct ldlm_lock *lock;
/* Default to "broken" if the lock cannot be resolved from the handle. */
859 bool cancelled = true;
863 lock = ldlm_handle2lock(&och->och_lease_handle);
865 lock_res_and_lock(lock);
866 cancelled = ldlm_is_cancel(lock);
867 unlock_res_and_lock(lock);
871 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
872 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* If still intact, cancel the lease lock ourselves before closing. */
875 ldlm_cli_cancel(&och->och_lease_handle, 0);
876 if (lease_broken != NULL)
877 *lease_broken = cancelled;
879 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/*
 * Merge inode attributes: combine the MDS-provided timestamps cached in
 * ll_inode_info with the OST-side attributes (size, blocks, times) read
 * through the cl_object, keeping the newest of each timestamp.
 * Runs under the inode size lock for consistency.
 */
884 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
886 struct ll_inode_info *lli = ll_i2info(inode);
887 struct cl_object *obj = lli->lli_clob;
888 struct cl_attr *attr = vvp_env_thread_attr(env);
896 ll_inode_size_lock(inode);
898 /* merge timestamps the most recently obtained from mds with
899 timestamps obtained from osts */
900 LTIME_S(inode->i_atime) = lli->lli_atime;
901 LTIME_S(inode->i_mtime) = lli->lli_mtime;
902 LTIME_S(inode->i_ctime) = lli->lli_ctime;
904 atime = LTIME_S(inode->i_atime);
905 mtime = LTIME_S(inode->i_mtime);
906 ctime = LTIME_S(inode->i_ctime);
908 cl_object_attr_lock(obj);
909 rc = cl_object_attr_get(env, obj, attr);
910 cl_object_attr_unlock(obj);
913 GOTO(out_size_unlock, rc);
/* Take the later of MDS and OST timestamps. */
915 if (atime < attr->cat_atime)
916 atime = attr->cat_atime;
918 if (ctime < attr->cat_ctime)
919 ctime = attr->cat_ctime;
921 if (mtime < attr->cat_mtime)
922 mtime = attr->cat_mtime;
924 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
925 PFID(&lli->lli_fid), attr->cat_size);
/* Size and blocks are authoritative on the OSTs. */
927 i_size_write(inode, attr->cat_size);
928 inode->i_blocks = attr->cat_blocks;
930 LTIME_S(inode->i_atime) = atime;
931 LTIME_S(inode->i_mtime) = mtime;
932 LTIME_S(inode->i_ctime) = ctime;
935 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be skipped for @file, mirroring
 * the kernel's own checks (O_NOATIME, inode/mount/superblock flags).
 */
940 static bool file_is_noatime(const struct file *file)
942 const struct vfsmount *mnt = file->f_path.mnt;
943 const struct inode *inode = file->f_path.dentry->d_inode;
945 /* Adapted from file_accessed() and touch_atime().*/
946 if (file->f_flags & O_NOATIME)
949 if (inode->i_flags & S_NOATIME)
952 if (IS_NOATIME(inode))
955 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
958 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
961 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: nonblock/append/sync flags, lock-request policy, and the
 * noatime decision.
 */
967 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
969 struct inode *inode = file->f_dentry->d_inode;
971 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
973 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
974 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
975 file->f_flags & O_DIRECT ||
978 io->ci_obj = ll_i2info(inode)->lli_clob;
979 io->ci_lockreq = CILR_MAYBE;
/* Lockless mounts never take DLM extent locks; O_APPEND must. */
980 if (ll_file_nolock(file)) {
981 io->ci_lockreq = CILR_NEVER;
982 io->ci_no_srvlock = 1;
983 } else if (file->f_flags & O_APPEND) {
984 io->ci_lockreq = CILR_MANDATORY;
987 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for all read/write entry points (sync, aio, splice).
 * Builds a cl_io, takes the per-file range lock where required (writes,
 * and O_DIRECT reads — see LU-6227), runs the cl_io loop, restarts the
 * IO if the layout changed mid-flight, and maintains per-mount stats.
 */
991 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
992 struct file *file, enum cl_io_type iot,
993 loff_t *ppos, size_t count)
995 struct inode *inode = file->f_dentry->d_inode;
996 struct ll_inode_info *lli = ll_i2info(inode);
998 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1001 struct range_lock range;
1004 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1005 file->f_dentry->d_name.name, iot, *ppos, count);
1008 io = vvp_env_thread_io(env);
1009 ll_io_init(io, file, iot == CIT_WRITE);
1011 /* The maximum Lustre file size is variable, based on the
1012 * OST maximum object size and number of stripes. This
1013 * needs another check in addition to the VFS checks earlier. */
1014 end = (io->u.ci_wr.wr_append ? i_size_read(inode) : *ppos) + count;
1015 if (end > ll_file_maxbytes(inode)) {
1017 CDEBUG(D_INODE, "%s: file "DFID" offset %llu > maxbytes "LPU64
1018 ": rc = %zd\n", ll_get_fsname(inode->i_sb, NULL, 0),
1019 PFID(&lli->lli_fid), end, ll_file_maxbytes(inode),
1024 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1025 struct vvp_io *vio = vvp_env_io(env);
1026 bool range_locked = false;
/* Append writes serialize against the whole file. */
1028 if (file->f_flags & O_APPEND)
1029 range_lock_init(&range, 0, LUSTRE_EOF);
1031 range_lock_init(&range, *ppos, *ppos + count - 1);
1033 vio->vui_fd = LUSTRE_FPRIVATE(file);
1034 vio->vui_io_subtype = args->via_io_subtype;
1036 switch (vio->vui_io_subtype) {
1038 vio->vui_iov = args->u.normal.via_iov;
1039 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1040 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1041 vio->vui_iocb = args->u.normal.via_iocb;
1042 /* Direct IO reads must also take range lock,
1043 * or multiple reads will try to work on the same pages
1044 * See LU-6227 for details. */
1045 if (((iot == CIT_WRITE) ||
1046 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1047 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1048 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1050 result = range_lock(&lli->lli_write_tree,
1055 range_locked = true;
/* Block truncate while a normal IO is in flight. */
1057 down_read(&lli->lli_trunc_sem);
1060 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1061 vio->u.splice.vui_flags = args->u.splice.via_flags;
1064 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1068 ll_cl_add(file, env, io);
1069 result = cl_io_loop(env, io);
1070 ll_cl_remove(file, env);
1072 if (args->via_io_subtype == IO_NORMAL)
1073 up_read(&lli->lli_trunc_sem);
1075 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1077 range_unlock(&lli->lli_write_tree, &range);
1080 /* cl_io_rw_init() handled IO */
1081 result = io->ci_result;
1084 if (io->ci_nob > 0) {
1085 result = io->ci_nob;
1086 *ppos = io->u.ci_wr.wr.crw_pos;
1090 cl_io_fini(env, io);
1091 /* If any bit been read/written (result != 0), we just return
1092 * short read/write instead of restart io. */
1093 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1094 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1095 iot == CIT_READ ? "read" : "write",
1096 file->f_dentry->d_name.name, *ppos, count);
1097 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1101 if (iot == CIT_READ) {
1103 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1104 LPROC_LL_READ_BYTES, result);
1105 } else if (iot == CIT_WRITE) {
1107 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1108 LPROC_LL_WRITE_BYTES, result);
1109 fd->fd_write_failed = false;
1110 } else if (result != -ERESTARTSYS) {
1111 fd->fd_write_failed = true;
1114 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1121 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: total length must not wrap negative and
 * every segment's user buffer must be readable; on success *count is
 * the cumulative byte count.
 */
1123 static int ll_file_get_iov_count(const struct iovec *iov,
1124 unsigned long *nr_segs, size_t *count)
1129 for (seg = 0; seg < *nr_segs; seg++) {
1130 const struct iovec *iv = &iov[seg];
1133 * If any segment has a negative length, or the cumulative
1134 * length ever wraps negative then return -EINVAL.
1137 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* NOTE(review): in the kernel original this is
 * "if (access_ok(...)) continue;" followed by failure handling;
 * the condition as shown (no negation, no continue) looks
 * inverted — confirm against the full source. */
1139 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1144 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Async-style read entry point: validate the iovec, set up normal-IO
 * args in the per-thread cl env, and hand off to ll_file_io_generic().
 */
1151 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1152 unsigned long nr_segs, loff_t pos)
1155 struct vvp_io_args *args;
1161 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1165 env = cl_env_get(&refcheck);
1167 RETURN(PTR_ERR(env));
1169 args = ll_env_args(env, IO_NORMAL);
1170 args->u.normal.via_iov = (struct iovec *)iov;
1171 args->u.normal.via_nrsegs = nr_segs;
1172 args->u.normal.via_iocb = iocb;
1174 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1175 &iocb->ki_pos, count);
1176 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wrap the user buffer in a one-entry
 * iovec plus a sync kiocb (both from per-thread env storage) and call
 * ll_file_aio_read(); the updated position is copied back to *ppos.
 */
1180 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1184 struct iovec *local_iov;
1185 struct kiocb *kiocb;
1190 env = cl_env_get(&refcheck);
1192 RETURN(PTR_ERR(env));
1194 local_iov = &ll_env_info(env)->lti_local_iov;
1195 kiocb = &ll_env_info(env)->lti_kiocb;
1196 local_iov->iov_base = (void __user *)buf;
1197 local_iov->iov_len = count;
1198 init_sync_kiocb(kiocb, file);
1199 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions. */
1200 #ifdef HAVE_KIOCB_KI_LEFT
1201 kiocb->ki_left = count;
1203 kiocb->ki_nbytes = count;
1206 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1207 *ppos = kiocb->ki_pos;
1209 cl_env_put(env, &refcheck);
1214 * Write to a file (through the page cache).
/* Async-style write entry point; mirrors ll_file_aio_read() with
 * CIT_WRITE. */
1217 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1218 unsigned long nr_segs, loff_t pos)
1221 struct vvp_io_args *args;
1227 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1231 env = cl_env_get(&refcheck);
1233 RETURN(PTR_ERR(env));
1235 args = ll_env_args(env, IO_NORMAL);
1236 args->u.normal.via_iov = (struct iovec *)iov;
1237 args->u.normal.via_nrsegs = nr_segs;
1238 args->u.normal.via_iocb = iocb;
1240 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1241 &iocb->ki_pos, count);
1242 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point; mirrors ll_file_read() but calls
 * ll_file_aio_write().
 */
1246 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1247 size_t count, loff_t *ppos)
1250 struct iovec *local_iov;
1251 struct kiocb *kiocb;
1256 env = cl_env_get(&refcheck);
1258 RETURN(PTR_ERR(env));
1260 local_iov = &ll_env_info(env)->lti_local_iov;
1261 kiocb = &ll_env_info(env)->lti_kiocb;
1262 local_iov->iov_base = (void __user *)buf;
1263 local_iov->iov_len = count;
1264 init_sync_kiocb(kiocb, file);
1265 kiocb->ki_pos = *ppos;
/* Field name differs across kernel versions. */
1266 #ifdef HAVE_KIOCB_KI_LEFT
1267 kiocb->ki_left = count;
1269 kiocb->ki_nbytes = count;
1272 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1273 *ppos = kiocb->ki_pos;
1275 cl_env_put(env, &refcheck);
1280 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read: route pagecache data into a pipe by running the generic
 * client I/O path as CIT_READ with IO_SPLICE args (pipe + flags).
 * NOTE(review): some interior lines are elided in this listing.
 */
1282 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1283 struct pipe_inode_info *pipe, size_t count,
1287 struct vvp_io_args *args;
1292 env = cl_env_get(&refcheck);
1294 RETURN(PTR_ERR(env));
1296 args = ll_env_args(env, IO_SPLICE);
1297 args->u.splice.via_pipe = pipe;
1298 args->u.splice.via_flags = flags;
1300 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1301 cl_env_put(env, &refcheck);
/*
 * Set the LOV striping EA on a file by replaying an open-by-FID intent
 * (MDS_OPEN_BY_FID | caller flags) carrying the lov_user_md, under the
 * inode size lock.  The open handle obtained purely for the setstripe is
 * released again via ll_release_openhandle(); the delay-create flag is
 * cleared on the way out.
 * NOTE(review): some interior lines are elided in this listing.
 */
1305 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1306 __u64 flags, struct lov_user_md *lum,
1309 struct lookup_intent oit = {
1311 .it_flags = flags | MDS_OPEN_BY_FID,
1316 ll_inode_size_lock(inode);
1317 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1319 GOTO(out_unlock, rc);
/* Close the handle the intent open created; we only needed the EA set. */
1321 ll_release_openhandle(file->f_dentry, &oit);
1324 ll_inode_size_unlock(inode);
1325 ll_intent_release(&oit);
1326 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping metadata) for @filename via md_getattr_name
 * on the MDT and return a pointer into the reply buffer through *lmmp /
 * *lmm_size.  The caller owns *request and must ptlrpc_req_finished() it
 * — the returned lmm points into that request's reply, so it stays valid
 * only as long as the request does.
 * On little-endian-differing hosts the EA is byte-swapped in place to
 * host order before being handed back (directories skip the object
 * array swab since they have no lsm objects).
 * NOTE(review): some interior lines are elided in this listing.
 */
1331 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1332 struct lov_mds_md **lmmp, int *lmm_size,
1333 struct ptlrpc_request **request)
1335 struct ll_sb_info *sbi = ll_i2sbi(inode);
1336 struct mdt_body *body;
1337 struct lov_mds_md *lmm = NULL;
1338 struct ptlrpc_request *req = NULL;
1339 struct md_op_data *op_data;
1342 rc = ll_get_default_mdsize(sbi, &lmmsize);
1346 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1347 strlen(filename), lmmsize,
1348 LUSTRE_OPC_ANY, NULL);
1349 if (IS_ERR(op_data))
1350 RETURN(PTR_ERR(op_data));
1352 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1353 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1354 ll_finish_md_op_data(op_data);
1356 CDEBUG(D_INFO, "md_getattr_name failed "
1357 "on %s: rc %d\n", filename, rc);
1361 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1362 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1364 lmmsize = body->mbo_eadatasize;
/* No EA present (or zero-sized): report -ENODATA to the caller. */
1366 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1368 GOTO(out, rc = -ENODATA);
1371 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1372 LASSERT(lmm != NULL);
/* Only plain V1/V3 striping is understood here; anything else is a
 * protocol error. */
1374 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1375 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1376 GOTO(out, rc = -EPROTO);
1380 * This is coming from the MDS, so is probably in
1381 * little endian. We convert it to host endian before
1382 * passing it to userspace.
/* Swab only needed when host order differs from wire (LE) order. */
1384 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1387 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1388 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1391 /* if function called for directory - we should
1392 * avoid swab not existent lsm objects */
1393 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1394 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1395 if (S_ISREG(body->mbo_mode))
1396 lustre_swab_lov_user_md_objects(
1397 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1399 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1400 lustre_swab_lov_user_md_v3(
1401 (struct lov_user_md_v3 *)lmm);
1402 if (S_ISREG(body->mbo_mode))
1403 lustre_swab_lov_user_md_objects(
1404 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1411 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only (CFS_CAP_SYS_ADMIN) path that
 * copies a lov_user_md (with one ost_data entry) from userspace and
 * applies it via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * The temporary buffer is freed on both success and failure paths.
 * NOTE(review): some interior lines are elided in this listing.
 */
1416 static int ll_lov_setea(struct inode *inode, struct file *file,
1419 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1420 struct lov_user_md *lump;
1421 int lum_size = sizeof(struct lov_user_md) +
1422 sizeof(struct lov_user_ost_data);
1426 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1429 OBD_ALLOC_LARGE(lump, lum_size);
1433 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1434 OBD_FREE_LARGE(lump, lum_size);
1438 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1440 OBD_FREE_LARGE(lump, lum_size);
/*
 * Return the file's striping to userspace: thin wrapper that hands the
 * user lum buffer to cl_object_getstripe() on the inode's cl_object.
 * NOTE(review): some interior lines are elided in this listing.
 */
1444 static int ll_file_getstripe(struct inode *inode,
1445 struct lov_user_md __user *lum)
1452 env = cl_env_get(&refcheck);
1454 RETURN(PTR_ERR(env));
1456 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1457 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md into a
 * kernel buffer (ll_copy_user_md), apply it with FMODE_WRITE intent
 * flags, then refresh the layout and echo the resulting striping back
 * into the caller's buffer via ll_file_getstripe().
 * NOTE(review): some interior lines are elided in this listing; the
 * put_user(0, ...) below presumably resets the user's stripe_count
 * before the getstripe readback — confirm against full source.
 */
1461 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1464 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1465 struct lov_user_md *klum;
1467 __u64 flags = FMODE_WRITE;
1470 rc = ll_copy_user_md(lum, &klum);
1475 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1479 put_user(0, &lum->lmm_stripe_count);
1481 ll_layout_refresh(inode, &gen);
1482 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1485 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with group id @arg
 * on behalf of this file descriptor.  Rejects gid 0 and nolock files.
 * The fd's LL_FILE_GROUP_LOCKED flag and fd_grouplock are only touched
 * under lli->lli_lock; since cl_get_grouplock() is called outside the
 * spinlock, the flag is re-checked afterwards to resolve the race where
 * two threads request the lock concurrently (the loser drops its lock).
 * NOTE(review): some interior lines are elided in this listing.
 */
1490 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1492 struct ll_inode_info *lli = ll_i2info(inode);
1493 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1494 struct ll_grouplock grouplock;
1499 CWARN("group id for group lock must not be 0\n");
1503 if (ll_file_nolock(file))
1504 RETURN(-EOPNOTSUPP);
1506 spin_lock(&lli->lli_lock);
/* Only one group lock per file descriptor. */
1507 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1508 CWARN("group lock already existed with gid %lu\n",
1509 fd->fd_grouplock.lg_gid);
1510 spin_unlock(&lli->lli_lock);
1513 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1514 spin_unlock(&lli->lli_lock);
/* Acquire outside the spinlock; may block unless O_NONBLOCK. */
1516 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1517 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1521 spin_lock(&lli->lli_lock);
/* Re-check under the lock: another thread may have won the race while
 * we were acquiring; if so, release the lock we just obtained. */
1522 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1523 spin_unlock(&lli->lli_lock);
1524 CERROR("another thread just won the race\n");
1525 cl_put_grouplock(&grouplock);
1529 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1530 fd->fd_grouplock = grouplock;
1531 spin_unlock(&lli->lli_lock);
1533 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held by this file
 * descriptor, but only if one is held and its gid matches @arg.  The
 * fd state is detached under lli->lli_lock; cl_put_grouplock() is then
 * called on a local copy outside the spinlock.
 * NOTE(review): some interior lines are elided in this listing.
 */
1537 static int ll_put_grouplock(struct inode *inode, struct file *file,
1540 struct ll_inode_info *lli = ll_i2info(inode);
1541 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1542 struct ll_grouplock grouplock;
1545 spin_lock(&lli->lli_lock);
1546 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1547 spin_unlock(&lli->lli_lock);
1548 CWARN("no group lock held\n");
1552 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* The caller must pass the same gid it locked with. */
1554 if (fd->fd_grouplock.lg_gid != arg) {
1555 CWARN("group lock %lu doesn't match current id %lu\n",
1556 arg, fd->fd_grouplock.lg_gid);
1557 spin_unlock(&lli->lli_lock);
/* Detach under the spinlock, release outside it. */
1561 grouplock = fd->fd_grouplock;
1562 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1563 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1564 spin_unlock(&lli->lli_lock);
1566 cl_put_grouplock(&grouplock);
1567 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1572 * Close inode open handle
1574 * \param dentry [in] dentry which contains the inode
1575 * \param it [in,out] intent which contains open info and result
1578 * \retval <0 failure
/*
 * Close the MDS open handle carried by an open intent (e.g. one created
 * only to set/get an EA).  No-ops for the fs root dentry and for
 * intents that never completed an open.  If the intent still holds an
 * enqueue reference (DISP_ENQ_OPEN_REF), the request it pins is
 * finished here in place of the ll_file_open that would normally do it.
 * NOTE(review): some interior lines are elided in this listing.
 */
1580 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1582 struct inode *inode = dentry->d_inode;
1583 struct obd_client_handle *och;
1589 /* Root ? Do nothing. */
1590 if (dentry->d_inode->i_sb->s_root == dentry)
1593 /* No open handle to close? Move away */
1594 if (!it_disposition(it, DISP_OPEN_OPEN))
1597 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1599 OBD_ALLOC(och, sizeof(*och));
1601 GOTO(out, rc = -ENOMEM);
1603 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1605 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1608 /* this one is in place of ll_file_open */
1609 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1610 ptlrpc_req_finished(it->d.lustre.it_data);
1611 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1617 * Get size for inode for which FIEMAP mapping is requested.
1618 * Make the FIEMAP get_info call and returns the result.
1619 * \param fiemap kernel buffer to hold extens
1620 * \param num_bytes kernel buffer size
/*
 * Core FIEMAP implementation: validate the flags against what Lustre
 * supports (unsupported bits are reported back to the caller by
 * stripping them from fm_flags first), honour FIEMAP_FLAG_SYNC by
 * flushing dirty pages, glimpse the size if it looks zero, then build
 * an ll_fiemap_info_key and ask the cl_object layer for the extent
 * mapping.  A zero-size file short-circuits with zero mapped extents
 * since it cannot have any objects to map.
 * NOTE(review): some interior lines are elided in this listing.
 */
1622 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1628 struct ll_fiemap_info_key fmkey = { .name = KEY_FIEMAP, };
1631 /* Checks for fiemap flags */
1632 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Strip unsupported flags so userspace can see which were refused. */
1633 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1637 /* Check for FIEMAP_FLAG_SYNC */
1638 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1639 rc = filemap_fdatawrite(inode->i_mapping);
1644 env = cl_env_get(&refcheck);
1646 RETURN(PTR_ERR(env));
/* i_size may be stale on the client; glimpse to get the real size. */
1648 if (i_size_read(inode) == 0) {
1649 rc = ll_glimpse_size(inode);
1654 fmkey.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1655 obdo_from_inode(&fmkey.oa, inode, OBD_MD_FLSIZE);
1656 obdo_set_parent_fid(&fmkey.oa, &ll_i2info(inode)->lli_fid);
1658 /* If filesize is 0, then there would be no objects for mapping */
1659 if (fmkey.oa.o_size == 0) {
1660 fiemap->fm_mapped_extents = 0;
1664 fmkey.fiemap = *fiemap;
1666 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1667 &fmkey, fiemap, &num_bytes);
1669 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: translate a FID into a path via the MDC.
 * Requires CFS_CAP_DAC_READ_SEARCH unless the fs was mounted with
 * user_fid2path.  Reads only gf_pathlen from userspace first to size
 * the kernel buffer (capped at PATH_MAX), copies the request in, calls
 * obd_iocontrol(OBD_IOC_FID2PATH), and copies the filled gfout back.
 * NOTE(review): some interior lines are elided in this listing.
 */
1673 int ll_fid2path(struct inode *inode, void __user *arg)
1675 struct obd_export *exp = ll_i2mdexp(inode);
1676 const struct getinfo_fid2path __user *gfin = arg;
1678 struct getinfo_fid2path *gfout;
1684 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1685 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1688 /* Only need to get the buflen */
1689 if (get_user(pathlen, &gfin->gf_pathlen))
/* Bound the user-supplied length before allocating. */
1692 if (pathlen > PATH_MAX)
1695 outsize = sizeof(*gfout) + pathlen;
1696 OBD_ALLOC(gfout, outsize);
1700 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1701 GOTO(gf_free, rc = -EFAULT);
1703 /* Call mdc_iocontrol */
1704 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1708 if (copy_to_user(arg, gfout, outsize))
1712 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: userspace marshalling around
 * ll_do_fiemap().  Reads fm_extent_count first to size the kernel
 * buffer (with explicit overflow check against SIZE_MAX), copies in
 * the fiemap header (plus the first extent, used for continuation),
 * runs the mapping, and copies back the header plus however many
 * extents were actually mapped.
 * NOTE(review): some interior lines are elided in this listing.
 */
1716 static int ll_ioctl_fiemap(struct inode *inode, struct fiemap __user *arg)
1718 struct fiemap *fiemap;
1724 /* Get the extent count so we can calculate the size of
1725 * required fiemap buffer */
1726 if (get_user(extent_count, &arg->fm_extent_count))
/* Overflow guard: extent_count * sizeof(extent) must fit in size_t. */
1730 (SIZE_MAX - sizeof(*fiemap)) / sizeof(struct ll_fiemap_extent))
1732 num_bytes = sizeof(*fiemap) + (extent_count *
1733 sizeof(struct ll_fiemap_extent));
1735 OBD_ALLOC_LARGE(fiemap, num_bytes);
1739 /* get the fiemap value */
1740 if (copy_from_user(fiemap, arg, sizeof(*fiemap)))
1741 GOTO(error, rc = -EFAULT);
1743 /* If fm_extent_count is non-zero, read the first extent since
1744 * it is used to calculate end_offset and device from previous
1746 if (extent_count != 0) {
1747 if (copy_from_user(&fiemap->fm_extents[0],
1748 (char __user *)arg + sizeof(*fiemap),
1749 sizeof(struct ll_fiemap_extent)))
1750 GOTO(error, rc = -EFAULT);
1753 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy back header plus only the extents that were mapped. */
1757 ret_bytes = sizeof(struct fiemap);
1759 if (extent_count != 0)
1760 ret_bytes += (fiemap->fm_mapped_extents *
1761 sizeof(struct ll_fiemap_extent));
1763 if (copy_to_user((void __user *)arg, fiemap, ret_bytes))
1767 OBD_FREE_LARGE(fiemap, num_bytes);
1772 * Read the data_version for inode.
1774 * This value is computed using stripe object version on OST.
1775 * Version is computed using server side locking.
1777 * @param flags if do sync on the OST side;
1779 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1780 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the inode's data_version from its stripe objects on the OSTs
 * via cl_object_data_version().  @flags selects optional flushing
 * (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH, see the block comment above).  An
 * inode with no cl_object yet is treated as version 0.
 * NOTE(review): some interior lines are elided in this listing.
 */
1782 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1789 /* If no file object initialized, we consider its version is 0. */
1790 if (ll_i2info(inode)->lli_clob == NULL) {
1795 env = cl_env_get(&refcheck);
1797 RETURN(PTR_ERR(env));
1799 rc = cl_object_data_version(env, ll_i2info(inode)->lli_clob,
1800 data_version, flags);
1801 cl_env_put(env, &refcheck);
1806 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: free the file's OST data while keeping its archived
 * copy.  Takes a write lease (MDS_OPEN_RELEASE), flushes and grabs the
 * latest data_version (LL_DV_WR_FLUSH drops cached pages), merges
 * attributes, then closes the handle with the release request.  The
 * lease lock handle itself is released in mdc_hsm_release_pack() — see
 * the comment below.  On any path where a lease was taken, the lease
 * is closed before returning.
 * NOTE(review): some interior lines are elided in this listing.
 */
1808 int ll_hsm_release(struct inode *inode)
1810 struct cl_env_nest nest;
1812 struct obd_client_handle *och = NULL;
1813 __u64 data_version = 0;
1817 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1818 ll_get_fsname(inode->i_sb, NULL, 0),
1819 PFID(&ll_i2info(inode)->lli_fid));
1821 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1823 GOTO(out, rc = PTR_ERR(och));
1825 /* Grab latest data_version and [am]time values */
1826 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1830 env = cl_env_nested_get(&nest);
1832 GOTO(out, rc = PTR_ERR(env));
1834 ll_merge_attr(env, inode);
1835 cl_env_nested_put(&nest, env);
1837 /* Release the file.
1838 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1839 * we still need it to pack l_remote_handle to MDT. */
1840 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1846 if (och != NULL && !IS_ERR(och)) /* close the file */
1847 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved [am]time attrs for each (restored after the swap if requested),
 * and per-side data-version check flags/values.
 * NOTE(review): some fields are elided in this listing.
 */
1852 struct ll_swap_stack {
1853 struct iattr ia1, ia2;
1855 struct inode *inode1, *inode2;
1856 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of two
 * regular files on the same filesystem.  Both must be writable by the
 * caller.  To avoid lock-order deadlock the pair is sequentialized by
 * FID order (identical FIDs are a no-op).  If a group-lock gid is
 * supplied, both files are group-locked around the swap to flush and
 * exclude concurrent I/O.  Optional data-version checks (-EAGAIN on
 * mismatch) and optional preservation of each file's mtime/atime are
 * driven by lsl->sl_flags.  The actual swap is an obd_iocontrol
 * (LL_IOC_LOV_SWAP_LAYOUTS) on the MDC with both inodes in op_data.
 * NOTE(review): some interior lines are elided in this listing.
 */
1859 static int ll_swap_layouts(struct file *file1, struct file *file2,
1860 struct lustre_swap_layouts *lsl)
1862 struct mdc_swap_layouts msl;
1863 struct md_op_data *op_data;
1866 struct ll_swap_stack *llss = NULL;
1869 OBD_ALLOC_PTR(llss);
1873 llss->inode1 = file1->f_dentry->d_inode;
1874 llss->inode2 = file2->f_dentry->d_inode;
1876 if (!S_ISREG(llss->inode2->i_mode))
1877 GOTO(free, rc = -EINVAL);
1879 if (inode_permission(llss->inode1, MAY_WRITE) ||
1880 inode_permission(llss->inode2, MAY_WRITE))
1881 GOTO(free, rc = -EPERM);
1883 if (llss->inode2->i_sb != llss->inode1->i_sb)
1884 GOTO(free, rc = -EXDEV);
1886 /* we use 2 bool because it is easier to swap than 2 bits */
1887 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1888 llss->check_dv1 = true;
1890 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1891 llss->check_dv2 = true;
1893 /* we cannot use lsl->sl_dvX directly because we may swap them */
1894 llss->dv1 = lsl->sl_dv1;
1895 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so two concurrent swaps of the same files
 * always lock in the same order. */
1897 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1898 if (rc == 0) /* same file, done! */
1901 if (rc < 0) { /* sequentialize it */
1902 swap(llss->inode1, llss->inode2);
1904 swap(llss->dv1, llss->dv2);
1905 swap(llss->check_dv1, llss->check_dv2);
1909 if (gid != 0) { /* application asks to flush dirty cache */
1910 rc = ll_get_grouplock(llss->inode1, file1, gid);
1914 rc = ll_get_grouplock(llss->inode2, file2, gid);
1916 ll_put_grouplock(llss->inode1, file1, gid);
1921 /* to be able to restore mtime and atime after swap
1922 * we need to first save them */
1924 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1925 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1926 llss->ia1.ia_atime = llss->inode1->i_atime;
1927 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1928 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1929 llss->ia2.ia_atime = llss->inode2->i_atime;
1930 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1933 /* ultimate check, before swaping the layouts we check if
1934 * dataversion has changed (if requested) */
1935 if (llss->check_dv1) {
1936 rc = ll_data_version(llss->inode1, &dv, 0);
1939 if (dv != llss->dv1)
1940 GOTO(putgl, rc = -EAGAIN);
1943 if (llss->check_dv2) {
1944 rc = ll_data_version(llss->inode2, &dv, 0);
1947 if (dv != llss->dv2)
1948 GOTO(putgl, rc = -EAGAIN);
1951 /* struct md_op_data is used to send the swap args to the mdt
1952 * only flags is missing, so we use struct mdc_swap_layouts
1953 * through the md_op_data->op_data */
1954 /* flags from user space have to be converted before they are send to
1955 * server, no flag is sent today, they are only used on the client */
1958 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1959 0, LUSTRE_OPC_ANY, &msl);
1960 if (IS_ERR(op_data))
1961 GOTO(free, rc = PTR_ERR(op_data));
1963 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1964 sizeof(*op_data), op_data, NULL);
1965 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
1969 ll_put_grouplock(llss->inode2, file2, gid);
1970 ll_put_grouplock(llss->inode1, file1, gid);
1973 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1977 /* clear useless flags */
1978 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1979 llss->ia1.ia_valid &= ~ATTR_MTIME;
1980 llss->ia2.ia_valid &= ~ATTR_MTIME;
1983 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1984 llss->ia1.ia_valid &= ~ATTR_ATIME;
1985 llss->ia2.ia_valid &= ~ATTR_ATIME;
1988 /* update time if requested */
/* Note the cross-application: ia2 (saved from inode2) is applied to
 * file1 and ia1 to file2, matching the swapped layouts. */
1990 if (llss->ia2.ia_valid != 0) {
1991 mutex_lock(&llss->inode1->i_mutex);
1992 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1993 mutex_unlock(&llss->inode1->i_mutex);
1996 if (llss->ia1.ia_valid != 0) {
1999 mutex_lock(&llss->inode2->i_mutex);
2000 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2001 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on an inode via an MDC iocontrol
 * (LL_IOC_HSM_STATE_SET).  Validates: the combined set+clear mask must
 * stay within HSM_FLAGS_MASK; bits outside HSM_USER_MASK need
 * CFS_CAP_SYS_ADMIN; an explicit archive id must not exceed
 * LL_HSM_MAX_ARCHIVE.
 * NOTE(review): some interior lines are elided in this listing.
 */
2013 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2015 struct md_op_data *op_data;
2019 /* Detect out-of range masks */
2020 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2023 /* Non-root users are forbidden to set or clear flags which are
2024 * NOT defined in HSM_USER_MASK. */
2025 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2026 !cfs_capable(CFS_CAP_SYS_ADMIN))
2029 /* Detect out-of range archive id */
2030 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2031 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2034 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2035 LUSTRE_OPC_ANY, hss);
2036 if (IS_ERR(op_data))
2037 RETURN(PTR_ERR(op_data));
2039 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2040 sizeof(*op_data), op_data, NULL);
2042 ll_finish_md_op_data(op_data);
/*
 * HSM import: register a pre-existing archived copy as the content of
 * a (regular) file.  First marks the file ARCHIVED|EXISTS|RELEASED via
 * ll_hsm_state_set() with the given archive id, then forces the file's
 * size, mode (masked to permission bits and stamped S_IFREG), uid/gid
 * and [am]times to the values supplied by userspace through
 * ll_setattr_raw() under i_mutex.  ATTR_FORCE bypasses the usual
 * permission checks on the attribute update.
 * NOTE(review): some interior lines are elided in this listing.
 */
2047 static int ll_hsm_import(struct inode *inode, struct file *file,
2048 struct hsm_user_import *hui)
2050 struct hsm_state_set *hss = NULL;
2051 struct iattr *attr = NULL;
2055 if (!S_ISREG(inode->i_mode))
2061 GOTO(out, rc = -ENOMEM);
2063 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2064 hss->hss_archive_id = hui->hui_archive_id;
2065 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2066 rc = ll_hsm_state_set(inode, hss);
2070 OBD_ALLOC_PTR(attr);
2072 GOTO(out, rc = -ENOMEM);
/* Keep only permission bits from userspace and force regular-file. */
2074 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2075 attr->ia_mode |= S_IFREG;
2076 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2077 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2078 attr->ia_size = hui->hui_size;
2079 attr->ia_mtime.tv_sec = hui->hui_mtime;
2080 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2081 attr->ia_atime.tv_sec = hui->hui_atime;
2082 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2084 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2085 ATTR_UID | ATTR_GID |
2086 ATTR_MTIME | ATTR_MTIME_SET |
2087 ATTR_ATIME | ATTR_ATIME_SET;
2089 mutex_lock(&inode->i_mutex);
2091 rc = ll_setattr_raw(file->f_dentry, attr, true);
2095 mutex_unlock(&inode->i_mutex);
/*
 * Translate an fmode_t into the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls: FMODE_READ -> RDLCK, FMODE_WRITE ->
 * WRLCK (both may be set).
 */
2107 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2109 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2110 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files.  Handles the LL_IOC_* /
 * FSFILT_IOC_* / OBD_IOC_* families inline and falls through to the
 * pluggable ll_iocontrol_call() handlers and finally a raw
 * obd_iocontrol on the data export for anything unrecognized.
 * NOTE(review): some interior lines are elided in this listing.
 */
2114 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2116 struct inode *inode = file->f_dentry->d_inode;
2117 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2121 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2122 PFID(ll_inode2fid(inode)), inode, cmd);
2123 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2125 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2126 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2130 case LL_IOC_GETFLAGS:
2131 /* Get the current value of the file flags */
2132 return put_user(fd->fd_flags, (int __user *)arg);
2133 case LL_IOC_SETFLAGS:
2134 case LL_IOC_CLRFLAGS:
2135 /* Set or clear specific file flags */
2136 /* XXX This probably needs checks to ensure the flags are
2137 * not abused, and to handle any flag side effects.
2139 if (get_user(flags, (int __user *) arg))
2142 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking only makes sense for O_DIRECT I/O. */
2143 if ((flags & LL_FILE_IGNORE_LOCK) &&
2144 !(file->f_flags & O_DIRECT)) {
2145 CERROR("%s: unable to disable locking on "
2146 "non-O_DIRECT file\n", current->comm);
2150 fd->fd_flags |= flags;
2152 fd->fd_flags &= ~flags;
2155 case LL_IOC_LOV_SETSTRIPE:
2156 RETURN(ll_lov_setstripe(inode, file, arg));
2157 case LL_IOC_LOV_SETEA:
2158 RETURN(ll_lov_setea(inode, file, arg));
2159 case LL_IOC_LOV_SWAP_LAYOUTS: {
2161 struct lustre_swap_layouts lsl;
2163 if (copy_from_user(&lsl, (char __user *)arg,
2164 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable for a layout swap. */
2167 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2170 file2 = fget(lsl.sl_fd);
2175 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2176 rc = ll_swap_layouts(file, file2, &lsl);
2180 case LL_IOC_LOV_GETSTRIPE:
2181 RETURN(ll_file_getstripe(inode,
2182 (struct lov_user_md __user *)arg));
2183 case FSFILT_IOC_FIEMAP:
2184 RETURN(ll_ioctl_fiemap(inode, (struct fiemap __user *)arg));
2185 case FSFILT_IOC_GETFLAGS:
2186 case FSFILT_IOC_SETFLAGS:
2187 RETURN(ll_iocontrol(inode, file, cmd, arg));
2188 case FSFILT_IOC_GETVERSION_OLD:
2189 case FSFILT_IOC_GETVERSION:
2190 RETURN(put_user(inode->i_generation, (int __user *)arg));
2191 case LL_IOC_GROUP_LOCK:
2192 RETURN(ll_get_grouplock(inode, file, arg));
2193 case LL_IOC_GROUP_UNLOCK:
2194 RETURN(ll_put_grouplock(inode, file, arg));
2195 case IOC_OBD_STATFS:
2196 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2198 /* We need to special case any other ioctls we want to handle,
2199 * to send them to the MDS/OST as appropriate and to properly
2200 * network encode the arg field.
2201 case FSFILT_IOC_SETVERSION_OLD:
2202 case FSFILT_IOC_SETVERSION:
2204 case LL_IOC_FLUSHCTX:
2205 RETURN(ll_flush_ctx(inode));
2206 case LL_IOC_PATH2FID: {
2207 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2208 sizeof(struct lu_fid)))
2213 case LL_IOC_GETPARENT:
2214 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2216 case OBD_IOC_FID2PATH:
2217 RETURN(ll_fid2path(inode, (void __user *)arg));
2218 case LL_IOC_DATA_VERSION: {
2219 struct ioc_data_version idv;
2222 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the two flush flags are honoured from userspace. */
2225 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2226 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2229 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2235 case LL_IOC_GET_MDTIDX: {
2238 mdtidx = ll_get_mdt_idx(inode);
2242 if (put_user((int)mdtidx, (int __user *)arg))
2247 case OBD_IOC_GETDTNAME:
2248 case OBD_IOC_GETMDNAME:
2249 RETURN(ll_get_obd_name(inode, cmd, arg));
2250 case LL_IOC_HSM_STATE_GET: {
2251 struct md_op_data *op_data;
2252 struct hsm_user_state *hus;
2259 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2260 LUSTRE_OPC_ANY, hus);
2261 if (IS_ERR(op_data)) {
2263 RETURN(PTR_ERR(op_data));
2266 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2269 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2272 ll_finish_md_op_data(op_data);
2276 case LL_IOC_HSM_STATE_SET: {
2277 struct hsm_state_set *hss;
2284 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2289 rc = ll_hsm_state_set(inode, hss);
2294 case LL_IOC_HSM_ACTION: {
2295 struct md_op_data *op_data;
2296 struct hsm_current_action *hca;
2303 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2304 LUSTRE_OPC_ANY, hca);
2305 if (IS_ERR(op_data)) {
2307 RETURN(PTR_ERR(op_data));
2310 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2313 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2316 ll_finish_md_op_data(op_data);
2320 case LL_IOC_SET_LEASE: {
2321 struct ll_inode_info *lli = ll_i2info(inode);
2322 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the fd's open mode. */
2327 case LL_LEASE_WRLCK:
2328 if (!(file->f_mode & FMODE_WRITE))
2330 fmode = FMODE_WRITE;
2332 case LL_LEASE_RDLCK:
2333 if (!(file->f_mode & FMODE_READ))
2337 case LL_LEASE_UNLCK:
2338 mutex_lock(&lli->lli_och_mutex);
2339 if (fd->fd_lease_och != NULL) {
2340 och = fd->fd_lease_och;
2341 fd->fd_lease_och = NULL;
2343 mutex_unlock(&lli->lli_och_mutex);
2348 fmode = och->och_flags;
2349 rc = ll_lease_close(och, inode, &lease_broken);
/* Report the lease type that was held (0 if it was broken). */
2356 RETURN(ll_lease_type_from_fmode(fmode));
2361 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2363 /* apply for lease */
2364 och = ll_lease_open(inode, file, fmode, 0);
2366 RETURN(PTR_ERR(och));
2369 mutex_lock(&lli->lli_och_mutex);
2370 if (fd->fd_lease_och == NULL) {
2371 fd->fd_lease_och = och;
2374 mutex_unlock(&lli->lli_och_mutex);
2376 /* impossible now that only excl is supported for now */
2377 ll_lease_close(och, inode, &lease_broken);
2382 case LL_IOC_GET_LEASE: {
2383 struct ll_inode_info *lli = ll_i2info(inode);
2384 struct ldlm_lock *lock = NULL;
2387 mutex_lock(&lli->lli_och_mutex);
2388 if (fd->fd_lease_och != NULL) {
2389 struct obd_client_handle *och = fd->fd_lease_och;
/* A lease that is being cancelled no longer counts. */
2391 lock = ldlm_handle2lock(&och->och_lease_handle);
2393 lock_res_and_lock(lock);
2394 if (!ldlm_is_cancel(lock))
2395 fmode = och->och_flags;
2397 unlock_res_and_lock(lock);
2398 LDLM_LOCK_PUT(lock);
2401 mutex_unlock(&lli->lli_och_mutex);
2403 RETURN(ll_lease_type_from_fmode(fmode));
2405 case LL_IOC_HSM_IMPORT: {
2406 struct hsm_user_import *hui;
2412 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2417 rc = ll_hsm_import(inode, file, hui);
/* Unknown cmd: try registered external handlers, then the raw
 * data-device iocontrol as a last resort. */
2427 ll_iocontrol_call(inode, file, cmd, arg, &err))
2430 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2431 (void __user *)arg));
2436 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit a computed seek offset to file->f_pos after range-checking
 * (negative offsets need FMODE_UNSIGNED_OFFSET; offset must not exceed
 * maxsize).  f_version is reset when the position actually changes.
 * Local fallback for kernels lacking generic_file_llseek_size().
 * NOTE(review): some interior lines are elided in this listing.
 */
2437 static inline loff_t
2438 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2440 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2442 if (offset > maxsize)
2445 if (offset != file->f_pos) {
2446 file->f_pos = offset;
2447 file->f_version = 0;
/*
 * Local backport of the kernel's generic_file_llseek_size() for older
 * kernels (compiled only when HAVE_FILE_LLSEEK_SIZE is unset): resolve
 * SEEK_CUR/SEEK_END/SEEK_DATA/SEEK_HOLE against @maxsize and @eof and
 * commit via llseek_execute().  The SEEK_CUR(0) position query avoids
 * writing f_pos back at all; other SEEK_CUR updates run under i_mutex
 * to serialize against concurrent seeks.
 * NOTE(review): some interior lines are elided in this listing.
 */
2453 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2454 loff_t maxsize, loff_t eof)
2456 struct inode *inode = file->f_dentry->d_inode;
2464 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2465 * position-querying operation. Avoid rewriting the "same"
2466 * f_pos value back to the file because a concurrent read(),
2467 * write() or lseek() might have altered it
2472 * f_lock protects against read/modify/write race with other
2473 * SEEK_CURs. Note that parallel writes and reads behave
2476 mutex_lock(&inode->i_mutex);
2477 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2478 mutex_unlock(&inode->i_mutex);
2482 * In the generic case the entire file is data, so as long as
2483 * offset isn't at the end of the file then the offset is data.
2490 * There is a virtual hole at the end of the file, so as long as
2491 * offset isn't i_size or larger, return i_size.
2499 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the size must be
 * current, so glimpse it from the OSTs first; then delegate to the
 * (possibly backported) generic_file_llseek_size with the filesystem's
 * max byte offset as the limit.
 * NOTE(review): some interior lines are elided in this listing.
 */
2503 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2505 struct inode *inode = file->f_dentry->d_inode;
2506 loff_t retval, eof = 0;
/* Precompute the target for the trace message below. */
2509 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2510 (origin == SEEK_CUR) ? file->f_pos : 0);
2511 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2512 PFID(ll_inode2fid(inode)), inode, retval, retval,
2514 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2516 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2517 retval = ll_glimpse_size(inode);
2520 eof = i_size_read(inode);
2523 retval = ll_generic_file_llseek_size(file, offset, origin,
2524 ll_file_maxbytes(inode), eof);
/*
 * flush (close(2)-time) handler: surface any async writeback errors
 * recorded on the inode or its cl_object as -EIO — but only once per
 * descriptor; fd_write_failed suppresses a second report for a failure
 * the application has already been told about.
 * NOTE(review): some interior lines are elided in this listing.
 */
2528 static int ll_flush(struct file *file, fl_owner_t id)
2530 struct inode *inode = file->f_dentry->d_inode;
2531 struct ll_inode_info *lli = ll_i2info(inode);
2532 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2535 LASSERT(!S_ISDIR(inode->i_mode));
2537 /* catch async errors that were recorded back when async writeback
2538 * failed for pages in this mapping. */
/* Read-and-clear: the recorded error is consumed here. */
2539 rc = lli->lli_async_rc;
2540 lli->lli_async_rc = 0;
2541 if (lli->lli_clob != NULL) {
2542 err = lov_read_and_clear_async_rc(lli->lli_clob);
2547 /* The application has been told write failure already.
2548 * Do not report failure again. */
2549 if (fd->fd_write_failed)
2551 return rc ? -EIO : 0;
2555 * Called to make sure a portion of file has been written out.
2556 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2558 * Return how many pages have been written.
/*
 * Write out (and optionally sync/discard) a byte range of the inode by
 * running a CIT_FSYNC cl_io.  @mode must be one of CL_FSYNC_NONE /
 * LOCAL / DISCARD / ALL; with mode != CL_FSYNC_LOCAL the OSTs are asked
 * to sync (OST_SYNC RPCs).  Returns the number of pages written on
 * success (fio->fi_nr_written), or a negative errno.
 * NOTE(review): some interior lines are elided in this listing.
 */
2560 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2561 enum cl_fsync_mode mode, int ignore_layout)
2563 struct cl_env_nest nest;
2566 struct obd_capa *capa = NULL;
2567 struct cl_fsync_io *fio;
2571 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2572 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2575 env = cl_env_nested_get(&nest);
2577 RETURN(PTR_ERR(env));
2579 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2581 io = vvp_env_thread_io(env);
2582 io->ci_obj = ll_i2info(inode)->lli_clob;
2583 io->ci_ignore_layout = ignore_layout;
2585 /* initialize parameters for sync */
2586 fio = &io->u.ci_fsync;
2587 fio->fi_capa = capa;
2588 fio->fi_start = start;
2590 fio->fi_fid = ll_inode2fid(inode);
2591 fio->fi_mode = mode;
2592 fio->fi_nr_written = 0;
2594 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2595 result = cl_io_loop(env, io);
2597 result = io->ci_result;
/* Success: report pages written rather than 0. */
2599 result = fio->fi_nr_written;
2600 cl_io_fini(env, io);
2601 cl_env_nested_put(&nest, env);
2609 * When dentry is provided (the 'else' case), *file->f_dentry may be
2610 * null and dentry must be used directly rather than pulled from
2611 * *file->f_dentry as is done otherwise.
/*
 * fsync entry point, with three kernel-API variants selected by
 * configure (4-arg range fsync, 2-arg, and the old dentry form — the
 * older forms sync the whole file, start=0 / end=LLONG_MAX).  Flow:
 * wait for in-flight page writeback, harvest recorded async writeback
 * errors, md_fsync the MDT, then cl_sync_file_range(CL_FSYNC_ALL) for
 * regular files; fd_write_failed is updated to reflect the outcome so
 * ll_flush() does not double-report.
 * NOTE(review): some interior lines are elided in this listing.
 */
2614 #ifdef HAVE_FILE_FSYNC_4ARGS
2615 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2617 struct dentry *dentry = file->f_dentry;
2618 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2619 int ll_fsync(struct file *file, int datasync)
2621 struct dentry *dentry = file->f_dentry;
2623 loff_t end = LLONG_MAX;
2625 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2628 loff_t end = LLONG_MAX;
2630 struct inode *inode = dentry->d_inode;
2631 struct ll_inode_info *lli = ll_i2info(inode);
2632 struct ptlrpc_request *req;
2633 struct obd_capa *oc;
2637 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2638 PFID(ll_inode2fid(inode)), inode);
2639 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2641 #ifdef HAVE_FILE_FSYNC_4ARGS
2642 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2643 mutex_lock(&inode->i_mutex);
2645 /* fsync's caller has already called _fdata{sync,write}, we want
2646 * that IO to finish before calling the osc and mdc sync methods */
2647 rc = filemap_fdatawait(inode->i_mapping);
2650 /* catch async errors that were recorded back when async writeback
2651 * failed for pages in this mapping. */
2652 if (!S_ISDIR(inode->i_mode)) {
2653 err = lli->lli_async_rc;
2654 lli->lli_async_rc = 0;
2657 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata on the MDT. */
2662 oc = ll_mdscapa_get(inode);
2663 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2669 ptlrpc_req_finished(req);
2671 if (S_ISREG(inode->i_mode)) {
2672 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2674 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2675 if (rc == 0 && err < 0)
2678 fd->fd_write_failed = true;
2680 fd->fd_write_failed = false;
2683 #ifdef HAVE_FILE_FSYNC_4ARGS
2684 mutex_unlock(&inode->i_mutex);
/* Handle flock()/fcntl() advisory lock requests on a Lustre file by
 * translating them into LDLM_FLOCK enqueues against the MDS, then
 * replaying the result into the local VFS lock tables.
 * NOTE(review): this extract has gaps (original lines missing), so the
 * switch framing around the LCK_* mode assignments and the final
 * RETURN are not visible here. */
2690 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2692 struct inode *inode = file->f_dentry->d_inode;
2693 struct ll_sb_info *sbi = ll_i2sbi(inode);
2694 struct ldlm_enqueue_info einfo = {
2695 .ei_type = LDLM_FLOCK,
2696 .ei_cb_cp = ldlm_flock_completion_ast,
2697 .ei_cbdata = file_lock,
2699 struct md_op_data *op_data;
2700 struct lustre_handle lockh = {0};
2701 ldlm_policy_data_t flock = {{0}};
2702 int fl_type = file_lock->fl_type;
2708 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2709 PFID(ll_inode2fid(inode)), file_lock);
2711 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2713 if (file_lock->fl_flags & FL_FLOCK) {
2714 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2715 /* flocks are whole-file locks */
2716 flock.l_flock.end = OFFSET_MAX;
2717 /* For flocks owner is determined by the local file descriptor */
2718 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2719 } else if (file_lock->fl_flags & FL_POSIX) {
2720 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2721 flock.l_flock.start = file_lock->fl_start;
2722 flock.l_flock.end = file_lock->fl_end;
2726 flock.l_flock.pid = file_lock->fl_pid;
2728 /* Somewhat ugly workaround for svc lockd.
2729 * lockd installs custom fl_lmops->lm_compare_owner that checks
2730 * for the fl_owner to be the same (which it always is on local node
2731 * I guess between lockd processes) and then compares pid.
2732 * As such we assign pid to the owner field to make it all work,
2733 * conflict with normal locks is unlikely since pid space and
2734 * pointer space for current->files are not intersecting */
2735 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2736 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fl_type to an LDLM mode; presumably F_RDLCK here — the enclosing
 * switch (fl_type) is not visible in this extract. */
2740 einfo.ei_mode = LCK_PR;
2743 /* An unlock request may or may not have any relation to
2744 * existing locks so we may not be able to pass a lock handle
2745 * via a normal ldlm_lock_cancel() request. The request may even
2746 * unlock a byte range in the middle of an existing lock. In
2747 * order to process an unlock request we need all of the same
2748 * information that is given with a normal read or write record
2749 * lock request. To avoid creating another ldlm unlock (cancel)
2750 * message we'll treat a LCK_NL flock request as an unlock. */
2751 einfo.ei_mode = LCK_NL;
2754 einfo.ei_mode = LCK_PW;
2757 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map cmd to enqueue flags: non-blocking set uses BLOCK_NOWAIT,
 * F_GETLK-style queries use TEST_LOCK (switch framing not visible). */
2772 flags = LDLM_FL_BLOCK_NOWAIT;
2778 flags = LDLM_FL_TEST_LOCK;
2781 CERROR("unknown fcntl lock command: %d\n", cmd);
2785 /* Save the old mode so that if the mode in the lock changes we
2786 * can decrement the appropriate reader or writer refcount. */
2787 file_lock->fl_type = einfo.ei_mode;
2789 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2790 LUSTRE_OPC_ANY, NULL);
2791 if (IS_ERR(op_data))
2792 RETURN(PTR_ERR(op_data));
2794 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2795 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2796 flock.l_flock.pid, flags, einfo.ei_mode,
2797 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDS; lockh receives the granted handle. */
2799 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2802 /* Restore the file lock type if not TEST lock. */
2803 if (!(flags & LDLM_FL_TEST_LOCK))
2804 file_lock->fl_type = fl_type;
/* Mirror the server's decision into the local kernel lock tables so
 * local lock state stays consistent with the cluster-wide state. */
2806 if ((file_lock->fl_flags & FL_FLOCK) &&
2807 (rc == 0 || file_lock->fl_type == F_UNLCK))
2808 rc2 = flock_lock_file_wait(file, file_lock);
2809 if ((file_lock->fl_flags & FL_POSIX) &&
2810 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2811 !(flags & LDLM_FL_TEST_LOCK))
2812 rc2 = posix_lock_file_wait(file, file_lock);
2814 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed after a successful server grant: undo the
 * server-side lock by re-enqueueing it as LCK_NL (i.e. unlock). */
2815 einfo.ei_mode = LCK_NL;
2816 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2821 ll_finish_md_op_data(op_data);
/* Resolve the FID of entry @name (length @namelen) under directory
 * @parent with a getattr-by-name RPC to the MDS; store it in *@fid. */
2826 int ll_get_fid_by_name(struct inode *parent, const char *name,
2827 int namelen, struct lu_fid *fid)
2829 struct md_op_data *op_data = NULL;
2830 struct mdt_body *body;
2831 struct ptlrpc_request *req;
2835 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2836 LUSTRE_OPC_ANY, NULL);
2837 if (IS_ERR(op_data))
2838 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
2840 op_data->op_valid = OBD_MD_FLID;
2841 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2842 ll_finish_md_op_data(op_data);
2846 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Reply without an MDT body is a protocol error from our side. */
2848 GOTO(out_req, rc = -EFAULT);
2850 *fid = body->mbo_fid1;
2852 ptlrpc_req_finished(req);
/* Migrate directory entry @name under @parent to MDT @mdtidx.
 * Implemented as a same-name rename RPC carrying the CLI_MIGRATE flag.
 * NOTE(review): extract has gaps — error-handling lines between some
 * statements are not visible here. */
2856 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2857 const char *name, int namelen)
2859 struct dentry *dchild = NULL;
2860 struct inode *child_inode = NULL;
2861 struct md_op_data *op_data;
2862 struct ptlrpc_request *request = NULL;
2867 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2868 name, PFID(ll_inode2fid(parent)), mdtidx);
2870 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2871 0, LUSTRE_OPC_ANY, NULL);
2872 if (IS_ERR(op_data))
2873 RETURN(PTR_ERR(op_data));
2875 /* Get child FID first */
2876 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry: take a reference on the child inode, hold
 * its i_mutex across the migration, and drop stale dcache aliases. */
2879 dchild = d_lookup(file->f_dentry, &qstr);
2880 if (dchild != NULL) {
2881 if (dchild->d_inode != NULL) {
2882 child_inode = igrab(dchild->d_inode);
2883 if (child_inode != NULL) {
2884 mutex_lock(&child_inode->i_mutex);
2885 op_data->op_fid3 = *ll_inode2fid(child_inode);
2886 ll_invalidate_aliases(child_inode);
/* No usable cached inode: ask the MDS for the FID by name. */
2891 rc = ll_get_fid_by_name(parent, name, namelen,
2897 if (!fid_is_sane(&op_data->op_fid3)) {
2898 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2899 ll_get_fsname(parent->i_sb, NULL, 0), name,
2900 PFID(&op_data->op_fid3));
2901 GOTO(out_free, rc = -EINVAL);
/* Nothing to do if the child already lives on the target MDT. */
2904 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2909 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2910 PFID(&op_data->op_fid3), mdtidx);
2911 GOTO(out_free, rc = 0);
2914 op_data->op_mds = mdtidx;
2915 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is a rename of the entry onto itself with CLI_MIGRATE. */
2916 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2917 namelen, name, namelen, &request);
2919 ll_update_times(request, parent);
2921 ptlrpc_req_finished(request);
2926 if (child_inode != NULL) {
/* The local inode is stale after migration (the object moved to a
 * different MDT); drop its link count so it gets discarded. */
2927 clear_nlink(child_inode);
2928 mutex_unlock(&child_inode->i_mutex);
2932 ll_finish_md_op_data(op_data);
/* -o noflock handler: file locking is disabled for this mount.
 * NOTE(review): the body is not visible in this extract; presumably it
 * returns -ENOSYS per the table comment below — confirm in full source. */
2937 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2945 * test if some locks matching bits and l_req_mode are acquired
2946 * - bits can be in different locks
2947 * - if found clear the common lock bits in *bits
2948 * - the bits not found, are kept in *bits
2950 * \param bits [IN] searched lock bits
2951 * \param l_req_mode [IN] searched lock mode
2952 * \retval boolean, true iff all bits are found
2954 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2956 struct lustre_handle lockh;
2957 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes at once. */
2958 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2959 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2968 fid = &ll_i2info(inode)->lli_fid;
2969 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2970 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, no reference is taken on a matched lock. */
2972 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebits bit individually; stop early once
 * every requested bit has been found (*bits drained to 0). */
2973 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2974 policy.l_inodebits.bits = *bits & (1 << i);
2975 if (policy.l_inodebits.bits == 0)
2978 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2979 &policy, mode, &lockh)) {
2980 struct ldlm_lock *lock;
2982 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
2985 ~(lock->l_policy_data.l_inodebits.bits);
2986 LDLM_LOCK_PUT(lock);
2988 *bits &= ~policy.l_inodebits.bits;
/* Try to match (and reference) a granted MDS inodebits lock covering
 * @bits in one of the modes in @mode; on success the handle is stored
 * in *@lockh and the matched mode is returned (0 if no match). */
2995 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2996 struct lustre_handle *lockh, __u64 flags,
2999 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3004 fid = &ll_i2info(inode)->lli_fid;
3005 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3007 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3008 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of an inode revalidation RPC: turn -ENOENT
 * into success for already-unlinked inodes, log other failures. */
3013 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3015 /* Already unlinked. Just update nlink and return success */
3016 if (rc == -ENOENT) {
3018 /* This path cannot be hit for regular files unless in
3019 * case of obscure races, so no need to validate
3021 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3023 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission/identity); log them at
 * D_INFO instead of D_ERROR to avoid console noise. */
3024 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3025 "%s: revalidate FID "DFID" error: rc = %d\n",
3026 ll_get_fsname(inode->i_sb, NULL, 0),
3027 PFID(ll_inode2fid(inode)), rc);
/* Revalidate a dentry's inode attributes against the MDS.
 * Two paths: an intent-based getattr/lookup when the server supports
 * OBD_CONNECT_ATTRFID, otherwise a plain md_getattr — but only if we
 * do not already hold matching ibits locks locally. */
3033 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3035 struct inode *inode = dentry->d_inode;
3036 struct ptlrpc_request *req = NULL;
3037 struct obd_export *exp;
3041 LASSERT(inode != NULL);
3043 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3044 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3046 exp = ll_i2mdexp(inode);
3048 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3049 * But under CMD case, it caused some lock issues, should be fixed
3050 * with new CMD ibits lock. See bug 12718 */
3051 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3052 struct lookup_intent oit = { .it_op = IT_GETATTR };
3053 struct md_op_data *op_data;
/* A pure LOOKUP-bit revalidation only needs an IT_LOOKUP intent. */
3055 if (ibits == MDS_INODELOCK_LOOKUP)
3056 oit.it_op = IT_LOOKUP;
3058 /* Call getattr by fid, so do not provide name at all. */
3059 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3060 dentry->d_inode, NULL, 0, 0,
3061 LUSTRE_OPC_ANY, NULL);
3062 if (IS_ERR(op_data))
3063 RETURN(PTR_ERR(op_data));
3065 rc = md_intent_lock(exp, op_data, &oit, &req,
3066 &ll_md_blocking_ast, 0);
3067 ll_finish_md_op_data(op_data);
3069 rc = ll_inode_revalidate_fini(inode, rc);
3073 rc = ll_revalidate_it_finish(req, &oit, dentry);
3075 ll_intent_release(&oit);
3079 /* Unlinked? Unhash dentry, so it is not picked up later by
3080 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3081 here to preserve get_cwd functionality on 2.6.
3083 if (!dentry->d_inode->i_nlink)
3084 d_lustre_invalidate(dentry, 0);
3086 ll_lookup_finish_locks(&oit, dentry);
3087 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3088 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3089 u64 valid = OBD_MD_FLGETATTR;
3090 struct md_op_data *op_data;
/* For regular files also fetch striping EA (layout) in the reply;
 * size the buffer from the default MD size. */
3093 if (S_ISREG(inode->i_mode)) {
3094 rc = ll_get_default_mdsize(sbi, &ealen);
3097 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3100 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3101 0, ealen, LUSTRE_OPC_ANY,
3103 if (IS_ERR(op_data))
3104 RETURN(PTR_ERR(op_data));
3106 op_data->op_valid = valid;
3107 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3108 * capa for this inode. Because we only keep capas of dirs
3110 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3111 ll_finish_md_op_data(op_data);
3113 rc = ll_inode_revalidate_fini(inode, rc);
/* Apply the fresh attributes from the reply to the inode. */
3117 rc = ll_prep_inode(&inode, req, NULL, NULL);
3120 ptlrpc_req_finished(req);
/* For a striped directory, merge per-stripe metadata attributes from
 * all MDTs into a single view and apply it to the inode. */
3124 static int ll_merge_md_attr(struct inode *inode)
3126 struct cl_attr attr = { 0 };
3129 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3130 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3131 &attr, ll_md_blocking_ast);
/* Apply merged link count, block count and size to the VFS inode. */
3135 set_nlink(inode, attr.cat_nlink);
3136 inode->i_blocks = attr.cat_blocks;
3137 i_size_write(inode, attr.cat_size);
/* Timestamps are cached in lli and copied to the inode later
 * (see ll_inode_revalidate()). */
3139 ll_i2info(inode)->lli_atime = attr.cat_atime;
3140 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3141 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Full revalidation: refresh MD attributes from the MDS, then for
 * regular files glimpse the OSTs for an up-to-date size. */
3147 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3149 struct inode *inode = dentry->d_inode;
3153 rc = __ll_inode_revalidate(dentry, ibits);
3157 /* if object isn't regular file, don't validate size */
3158 if (!S_ISREG(inode->i_mode)) {
/* Striped directories need their attributes merged across MDTs. */
3159 if (S_ISDIR(inode->i_mode) &&
3160 ll_i2info(inode)->lli_lsm_md != NULL) {
3161 rc = ll_merge_md_attr(inode);
/* Propagate the cached MD timestamps into the VFS inode. */
3166 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3167 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3168 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3170 /* In case of restore, the MDT has the right size and has
3171 * already send it back without granting the layout lock,
3172 * inode is up-to-date so glimpse is useless.
3173 * Also to glimpse we need the layout, in case of a running
3174 * restore the MDT holds the layout lock so the glimpse will
3175 * block up to the end of restore (getattr will block)
3177 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3178 rc = ll_glimpse_size(inode);
/* VFS ->getattr: revalidate UPDATE|LOOKUP ibits then copy the inode's
 * attributes into *stat. */
3183 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3185 struct inode *inode = de->d_inode;
3186 struct ll_sb_info *sbi = ll_i2sbi(inode);
3187 struct ll_inode_info *lli = ll_i2info(inode);
3190 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3191 MDS_INODELOCK_LOOKUP);
3192 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3197 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace cannot hold a 64-bit ino; build a compressed one
 * from the FID instead. */
3198 if (ll_need_32bit_api(sbi))
3199 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3201 stat->ino = inode->i_ino;
3202 stat->mode = inode->i_mode;
3203 stat->uid = inode->i_uid;
3204 stat->gid = inode->i_gid;
3205 stat->rdev = inode->i_rdev;
3206 stat->atime = inode->i_atime;
3207 stat->mtime = inode->i_mtime;
3208 stat->ctime = inode->i_ctime;
3209 stat->blksize = 1 << inode->i_blkbits;
3211 stat->nlink = inode->i_nlink;
3212 stat->size = i_size_read(inode);
3213 stat->blocks = inode->i_blocks;
/* VFS ->fiemap: marshal the kernel's fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer, run the mapping, and copy the results back. */
3218 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3219 __u64 start, __u64 len)
3223 struct ll_user_fiemap *fiemap;
3224 unsigned int extent_count = fieinfo->fi_extents_max;
/* Temporary buffer: header plus room for the caller's max extents. */
3226 num_bytes = sizeof(*fiemap) + (extent_count *
3227 sizeof(struct ll_fiemap_extent));
3228 OBD_ALLOC_LARGE(fiemap, num_bytes);
3233 fiemap->fm_flags = fieinfo->fi_flags;
3234 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3235 fiemap->fm_start = start;
3236 fiemap->fm_length = len;
/* Seed with the first user extent (used to continue a prior mapping). */
3237 if (extent_count > 0)
3238 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3239 sizeof(struct ll_fiemap_extent));
3241 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy flags, mapped count and the mapped extents back to the caller. */
3243 fieinfo->fi_flags = fiemap->fm_flags;
3244 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3245 if (extent_count > 0)
3246 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3247 fiemap->fm_mapped_extents *
3248 sizeof(struct ll_fiemap_extent));
3250 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL for @inode (may be
 * NULL). @type is unused here; the cache holds the access ACL. */
3254 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3256 struct ll_inode_info *lli = ll_i2info(inode);
3257 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl against concurrent update. */
3260 spin_lock(&lli->lli_lock);
3261 /* VFS' acl_permission_check->check_acl will release the refcount */
3262 acl = posix_acl_dup(lli->lli_posix_acl);
3263 spin_unlock(&lli->lli_lock);
/* ACL permission check callback for older kernels whose
 * generic_permission() takes a check_acl function pointer. Signature
 * varies with kernel version (4-arg variant adds @flags). */
3268 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3270 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3271 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3273 ll_check_acl(struct inode *inode, int mask)
3276 # ifdef CONFIG_FS_POSIX_ACL
3277 struct posix_acl *acl;
/* Cannot sleep under RCU walk; bail out (return value not visible in
 * this extract — presumably -ECHILD). */
3281 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3282 if (flags & IPERM_FLAG_RCU)
3285 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3290 rc = posix_acl_permission(inode, acl, mask);
3291 posix_acl_release(acl);
3294 # else /* !CONFIG_FS_POSIX_ACL */
3296 # endif /* CONFIG_FS_POSIX_ACL */
3298 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* VFS ->permission: revalidate the root inode when needed, optionally
 * squash root credentials (root squash), then delegate to the generic
 * or remote permission check. Signature varies with kernel version. */
3300 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3301 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3303 # ifdef HAVE_INODE_PERMISION_2ARGS
3304 int ll_inode_permission(struct inode *inode, int mask)
3306 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3311 struct ll_sb_info *sbi;
3312 struct root_squash_info *squash;
3313 struct cred *cred = NULL;
3314 const struct cred *old_cred = NULL;
3316 bool squash_id = false;
/* RCU-walk mode: we may block below, so ask the VFS to retry in
 * ref-walk mode (return value not visible in this extract). */
3319 #ifdef MAY_NOT_BLOCK
3320 if (mask & MAY_NOT_BLOCK)
3322 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3323 if (flags & IPERM_FLAG_RCU)
3327 /* as root inode are NOT getting validated in lookup operation,
3328 * need to do it before permission check. */
3330 if (inode == inode->i_sb->s_root->d_inode) {
3331 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3332 MDS_INODELOCK_LOOKUP);
3337 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3338 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3340 /* squash fsuid/fsgid if needed */
3341 sbi = ll_i2sbi(inode);
3342 squash = &sbi->ll_squash;
/* Squash applies only to root (fsuid 0) on mounts where it is not
 * explicitly disabled via nosquash. */
3343 if (unlikely(squash->rsi_uid != 0 &&
3344 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3345 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3349 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3350 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3351 squash->rsi_uid, squash->rsi_gid);
3353 /* update current process's credentials
3354 * and FS capability */
3355 cred = prepare_creds();
3359 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3360 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability bit from the override
 * credentials so squashed root has no fs privileges. */
3361 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3362 if ((1 << cap) & CFS_CAP_FS_MASK)
3363 cap_lower(cred->cap_effective, cap);
3365 old_cred = override_creds(cred);
3368 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3370 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3371 rc = lustre_check_remote_perm(inode, mask);
3373 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3375 /* restore current process's credentials and FS capability */
3377 revert_creds(old_cred);
3384 /* -o localflock - only provides locally consistent flock locks */
/* Default file ops table: no .flock/.lock entries, so the kernel's
 * local-only flock handling applies. */
3385 struct file_operations ll_file_operations = {
3386 .read = ll_file_read,
3387 .aio_read = ll_file_aio_read,
3388 .write = ll_file_write,
3389 .aio_write = ll_file_aio_write,
3390 .unlocked_ioctl = ll_file_ioctl,
3391 .open = ll_file_open,
3392 .release = ll_file_release,
3393 .mmap = ll_file_mmap,
3394 .llseek = ll_file_seek,
3395 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent locking — flock/fcntl requests go through
 * ll_file_flock() and are enqueued on the MDS. */
3400 struct file_operations ll_file_operations_flock = {
3401 .read = ll_file_read,
3402 .aio_read = ll_file_aio_read,
3403 .write = ll_file_write,
3404 .aio_write = ll_file_aio_write,
3405 .unlocked_ioctl = ll_file_ioctl,
3406 .open = ll_file_open,
3407 .release = ll_file_release,
3408 .mmap = ll_file_mmap,
3409 .llseek = ll_file_seek,
3410 .splice_read = ll_file_splice_read,
3413 .flock = ll_file_flock,
3414 .lock = ll_file_flock
3417 /* These are for -o noflock - to return ENOSYS on flock calls */
3418 struct file_operations ll_file_operations_noflock = {
3419 .read = ll_file_read,
3420 .aio_read = ll_file_aio_read,
3421 .write = ll_file_write,
3422 .aio_write = ll_file_aio_write,
3423 .unlocked_ioctl = ll_file_ioctl,
3424 .open = ll_file_open,
3425 .release = ll_file_release,
3426 .mmap = ll_file_mmap,
3427 .llseek = ll_file_seek,
3428 .splice_read = ll_file_splice_read,
3431 .flock = ll_file_noflock,
3432 .lock = ll_file_noflock
/* Inode operations shared by all regular-file mount variants. */
3435 struct inode_operations ll_file_inode_operations = {
3436 .setattr = ll_setattr,
3437 .getattr = ll_getattr,
3438 .permission = ll_inode_permission,
3439 .setxattr = ll_setxattr,
3440 .getxattr = ll_getxattr,
3441 .listxattr = ll_listxattr,
3442 .removexattr = ll_removexattr,
3443 .fiemap = ll_fiemap,
3444 #ifdef HAVE_IOP_GET_ACL
3445 .get_acl = ll_get_acl,
3449 /* dynamic ioctl number support routines */
/* Global registry of dynamically-registered ioctl handlers, guarded by
 * an rwsem (readers: dispatch; writers: register/unregister). */
3450 static struct llioc_ctl_data {
3451 struct rw_semaphore ioc_sem;
3452 struct list_head ioc_head;
3454 __RWSEM_INITIALIZER(llioc.ioc_sem),
3455 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration record: callback plus the ioctl commands it claims.
 * iocd_cmd is a C99 flexible-style trailing array sized at alloc time. */
3460 struct list_head iocd_list;
3461 unsigned int iocd_size;
3462 llioc_callback_t iocd_cb;
3463 unsigned int iocd_count;
3464 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd. Returns an opaque cookie (the allocation) used to unregister,
 * or NULL on bad arguments / allocation failure (exact returns not
 * visible in this extract). */
3467 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3470 struct llioc_data *in_data = NULL;
3473 if (cb == NULL || cmd == NULL ||
3474 count > LLIOC_MAX_CMD || count < 0)
3477 size = sizeof(*in_data) + count * sizeof(unsigned int);
3478 OBD_ALLOC(in_data, size);
3479 if (in_data == NULL)
3482 memset(in_data, 0, sizeof(*in_data));
3483 in_data->iocd_size = size;
3484 in_data->iocd_cb = cb;
3485 in_data->iocd_count = count;
3486 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3488 down_write(&llioc.ioc_sem);
3489 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3490 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by cookie @magic (the
 * pointer returned by ll_iocontrol_register); warn if not found. */
3495 void ll_iocontrol_unregister(void *magic)
3497 struct llioc_data *tmp;
3502 down_write(&llioc.ioc_sem);
3503 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before unlinking; OBD_FREE needs it after the walk. */
3505 unsigned int size = tmp->iocd_size;
3507 list_del(&tmp->iocd_list);
3508 up_write(&llioc.ioc_sem);
3510 OBD_FREE(tmp, size);
3514 up_write(&llioc.ioc_sem);
3516 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3519 EXPORT_SYMBOL(ll_iocontrol_register);
3520 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an unrecognized ioctl @cmd through the dynamic-handler
 * registry; *rcp receives the handler's result. Returns LLIOC_STOP if
 * a handler consumed the ioctl, LLIOC_CONT otherwise. */
3522 static enum llioc_iter
3523 ll_iocontrol_call(struct inode *inode, struct file *file,
3524 unsigned int cmd, unsigned long arg, int *rcp)
3526 enum llioc_iter ret = LLIOC_CONT;
3527 struct llioc_data *data;
3528 int rc = -EINVAL, i;
3530 down_read(&llioc.ioc_sem);
3531 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3532 for (i = 0; i < data->iocd_count; i++) {
3533 if (cmd != data->iocd_cmd[i])
3536 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* First handler that claims the command wins. */
3540 if (ret == LLIOC_STOP)
3543 up_read(&llioc.ioc_sem);
/* Push a layout configuration (@conf) down to the cl_object stack for
 * @inode; on OBJECT_CONF_SET also allow the layout lock to be matched
 * and record the new layout generation. */
3550 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3552 struct ll_inode_info *lli = ll_i2info(inode);
3553 struct cl_object *obj = lli->lli_clob;
3554 struct cl_env_nest nest;
3562 env = cl_env_nested_get(&nest);
3564 RETURN(PTR_ERR(env));
3566 rc = cl_conf_set(env, lli->lli_clob, conf);
3570 if (conf->coc_opc == OBJECT_CONF_SET) {
3571 struct ldlm_lock *lock = conf->coc_lock;
3572 struct cl_layout cl = {
3576 LASSERT(lock != NULL);
3577 LASSERT(ldlm_has_layout(lock));
3579 /* it can only be allowed to match after layout is
3580 * applied to inode otherwise false layout would be
3581 * seen. Applying layout should happen before dropping
3582 * the intent lock. */
3583 ldlm_lock_allow_match(lock);
3585 rc = cl_object_layout_get(env, obj, &cl);
/* Record the generation bump for debugging and cache validation. */
3590 DFID": layout version change: %u -> %u\n",
3591 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3593 ll_layout_version_set(lli, cl.cl_layout_gen);
3597 cl_env_nested_put(&nest, env);
3602 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3603 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3606 struct ll_sb_info *sbi = ll_i2sbi(inode);
3607 struct obd_capa *oc;
3608 struct ptlrpc_request *req;
3609 struct mdt_body *body;
3616 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3617 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3618 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do when the lock already carries a ready LVB. */
3620 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3623 /* if layout lock was granted right away, the layout is returned
3624 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3625 * blocked and then granted via completion ast, we have to fetch
3626 * layout here. Please note that we can't use the LVB buffer in
3627 * completion AST because it doesn't have a large enough buffer */
3628 oc = ll_mdscapa_get(inode);
3629 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA (striping/layout) via getxattr on the FID. */
3631 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3632 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3638 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3640 GOTO(out, rc = -EPROTO);
3642 lmmsize = body->mbo_eadatasize;
3643 if (lmmsize == 0) /* empty layout */
3646 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3648 GOTO(out, rc = -EFAULT);
/* Copy the EA into a private buffer and attach it as the lock's LVB;
 * replace (and free) any stale LVB under the resource lock. */
3650 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3651 if (lvbdata == NULL)
3652 GOTO(out, rc = -ENOMEM);
3654 memcpy(lvbdata, lmm, lmmsize);
3655 lock_res_and_lock(lock);
3656 if (lock->l_lvb_data != NULL)
3657 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3659 lock->l_lvb_data = lvbdata;
3660 lock->l_lvb_len = lmmsize;
3661 unlock_res_and_lock(lock);
3666 ptlrpc_req_finished(req);
3671 * Apply the layout to the inode. Layout lock is held and will be released
3674 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3675 struct inode *inode)
3677 struct ll_inode_info *lli = ll_i2info(inode);
3678 struct ll_sb_info *sbi = ll_i2sbi(inode);
3679 struct ldlm_lock *lock;
3680 struct lustre_md md = { NULL };
3681 struct cl_object_conf conf;
3684 bool wait_layout = false;
3687 LASSERT(lustre_handle_is_used(lockh));
3689 lock = ldlm_handle2lock(lockh);
3690 LASSERT(lock != NULL);
3691 LASSERT(ldlm_has_layout(lock));
3693 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3694 PFID(&lli->lli_fid), inode);
3696 /* in case this is a caching lock and reinstate with new inode */
3697 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3699 lock_res_and_lock(lock);
3700 lvb_ready = ldlm_is_lvb_ready(lock);
3701 unlock_res_and_lock(lock);
3702 /* checking lvb_ready is racy but this is okay. The worst case is
3703 * that multi processes may configure the file on the same time. */
/* Ensure the lock carries the layout (LVB); fetch from the MDT if it
 * was granted via completion AST without one. */
3708 rc = ll_layout_fetch(inode, lock);
3712 /* for layout lock, lmm is returned in lock's lvb.
3713 * lvb_data is immutable if the lock is held so it's safe to access it
3714 * without res lock. See the description in ldlm_lock_decref_internal()
3715 * for the condition to free lvb_data of layout lock */
3716 if (lock->l_lvb_data != NULL) {
3717 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3718 lock->l_lvb_data, lock->l_lvb_len);
3720 CERROR("%s: file "DFID" unpackmd error: %d\n",
3721 ll_get_fsname(inode->i_sb, NULL, 0),
3722 PFID(&lli->lli_fid), rc);
3726 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3727 lock->l_lvb_data, lock->l_lvb_len);
3732 /* set layout to file. Unlikely this will fail as old layout was
3733 * surely eliminated */
3734 memset(&conf, 0, sizeof conf);
3735 conf.coc_opc = OBJECT_CONF_SET;
3736 conf.coc_inode = inode;
3737 conf.coc_lock = lock;
3738 conf.u.coc_md = &md;
3739 rc = ll_layout_conf(inode, &conf);
/* The unpacked stripe MD was copied by the conf path; free our copy. */
3742 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3744 /* refresh layout failed, need to wait */
3745 wait_layout = rc == -EBUSY;
3749 LDLM_LOCK_PUT(lock);
3750 ldlm_lock_decref(lockh, mode);
3752 /* wait for IO to complete if it's still being used. */
3754 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3755 ll_get_fsname(inode->i_sb, NULL, 0),
3756 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO using the old layout
 * drains, after which the caller can retry the layout set. */
3758 memset(&conf, 0, sizeof conf);
3759 conf.coc_opc = OBJECT_CONF_WAIT;
3760 conf.coc_inode = inode;
3761 rc = ll_layout_conf(inode, &conf);
3765 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3766 ll_get_fsname(inode->i_sb, NULL, 0),
3767 PFID(&lli->lli_fid), rc);
/* Refresh the layout of @inode while holding lli_layout_mutex: first
 * try to match a cached layout lock, otherwise enqueue an IT_LAYOUT
 * intent on the MDS and apply the returned layout. */
3772 static int ll_layout_refresh_locked(struct inode *inode)
3774 struct ll_inode_info *lli = ll_i2info(inode);
3775 struct ll_sb_info *sbi = ll_i2sbi(inode);
3776 struct md_op_data *op_data;
3777 struct lookup_intent it;
3778 struct lustre_handle lockh;
3780 struct ldlm_enqueue_info einfo = {
3781 .ei_type = LDLM_IBITS,
3783 .ei_cb_bl = &ll_md_blocking_ast,
3784 .ei_cb_cp = &ldlm_completion_ast,
3790 /* mostly layout lock is caching on the local side, so try to match
3791 * it before grabbing layout lock mutex. */
3792 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3793 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3794 if (mode != 0) { /* hit cached lock */
3795 rc = ll_layout_lock_set(&lockh, mode, inode);
3802 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3803 0, 0, LUSTRE_OPC_ANY, NULL);
3804 if (IS_ERR(op_data))
3805 RETURN(PTR_ERR(op_data));
3807 /* have to enqueue one */
3808 memset(&it, 0, sizeof(it));
3809 it.it_op = IT_LAYOUT;
3810 lockh.cookie = 0ULL;
3812 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3813 ll_get_fsname(inode->i_sb, NULL, 0),
3814 PFID(&lli->lli_fid), inode);
3816 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* Release the intent's request; we only need the lock handle/mode. */
3817 if (it.d.lustre.it_data != NULL)
3818 ptlrpc_req_finished(it.d.lustre.it_data);
3819 it.d.lustre.it_data = NULL;
3821 ll_finish_md_op_data(op_data);
/* Transfer lock ownership out of the intent before dropping it, so
 * the reference survives ll_intent_drop_lock(). */
3823 mode = it.d.lustre.it_lock_mode;
3824 it.d.lustre.it_lock_mode = 0;
3825 ll_intent_drop_lock(&it);
3828 /* set lock data in case this is a new lock */
3829 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3830 rc = ll_layout_lock_set(&lockh, mode, inode);
3839 * This function checks if there exists a LAYOUT lock on the client side,
3840 * or enqueues it if it doesn't have one in cache.
3842 * This function will not hold layout lock so it may be revoked any time after
3843 * this function returns. Any operations depend on layout should be redone
3846 * This function should be called before lov_io_init() to get an uptodate
3847 * layout version, the caller should save the version number and after IO
3848 * is finished, this function should be called again to verify that layout
3849 * is not changed during IO time.
3851 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3853 struct ll_inode_info *lli = ll_i2info(inode);
3854 struct ll_sb_info *sbi = ll_i2sbi(inode);
3858 *gen = ll_layout_version_get(lli);
/* Fast path: layout locking disabled, or a valid generation is cached. */
3859 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3863 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3864 LASSERT(S_ISREG(inode->i_mode));
3866 /* take layout lock mutex to enqueue layout lock exclusively. */
3867 mutex_lock(&lli->lli_layout_mutex);
3869 rc = ll_layout_refresh_locked(inode);
/* Re-read the generation: the refresh may have updated it. */
3873 *gen = ll_layout_version_get(lli);
3875 mutex_unlock(&lli->lli_layout_mutex);
3881 * This function send a restore request to the MDT
3883 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3885 struct hsm_user_request *hur;
3889 len = sizeof(struct hsm_user_request) +
3890 sizeof(struct hsm_user_item);
3891 OBD_ALLOC(hur, len);
3895 hur->hur_request.hr_action = HUA_RESTORE;
3896 hur->hur_request.hr_archive_id = 0;
3897 hur->hur_request.hr_flags = 0;
3898 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3899 sizeof(hur->hur_user_item[0].hui_fid));
3900 hur->hur_user_item[0].hui_extent.offset = offset;
3901 hur->hur_user_item[0].hui_extent.length = length;
3902 hur->hur_request.hr_itemcount = 1;
3903 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,